claude-mpm 4.0.10__py3-none-any.whl → 4.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/core/interactive_session.py +2 -2
- claude_mpm/dashboard/static/js/socket-client.js +270 -3
- claude_mpm/hooks/claude_hooks/hook_handler.py +3 -1
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +3 -3
- claude_mpm/scripts/socketio_daemon.py +0 -0
- claude_mpm/scripts/start_activity_logging.py +0 -0
- claude_mpm/services/agents/memory/agent_memory_manager.py +1 -1
- claude_mpm/services/socketio/handlers/connection.py +284 -1
- claude_mpm/services/socketio/server/broadcaster.py +276 -10
- claude_mpm/services/socketio/server/main.py +15 -1
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/METADATA +1 -1
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/RECORD +14 -14
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/WHEEL +0 -0
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/top_level.txt +0 -0
claude_mpm/core/interactive_session.py

```diff
@@ -469,9 +469,9 @@ class InteractiveSession:
     def _show_available_agents(self) -> bool:
         """Show available agents in the system."""
         try:
-            from claude_mpm.cli import
+            from claude_mpm.cli.utils import get_agent_versions_display

-            agent_versions =
+            agent_versions = get_agent_versions_display()

             if agent_versions:
                 print(agent_versions)
```
claude_mpm/dashboard/static/js/socket-client.js

```diff
@@ -20,14 +20,33 @@ class SocketClient {
         // Connection state
         this.isConnected = false;
         this.isConnecting = false;
+        this.lastConnectTime = null;
+        this.disconnectTime = null;

         // Event processing
         this.events = [];
         this.sessions = new Map();
         this.currentSessionId = null;

+        // Event queue for disconnection periods
+        this.eventQueue = [];
+        this.maxQueueSize = 100;
+
+        // Retry configuration
+        this.retryAttempts = 0;
+        this.maxRetryAttempts = 3;
+        this.retryDelays = [1000, 2000, 4000]; // Exponential backoff
+        this.pendingEmissions = new Map(); // Track pending emissions for retry
+
+        // Health monitoring
+        this.lastPingTime = null;
+        this.lastPongTime = null;
+        this.pingTimeout = 40000; // 40 seconds (server sends every 30s)
+        this.healthCheckInterval = null;
+
         // Start periodic status check as fallback mechanism
         this.startStatusCheckFallback();
+        this.startHealthMonitoring();
     }

     /**
@@ -88,8 +107,21 @@ class SocketClient {
     setupSocketHandlers() {
         this.socket.on('connect', () => {
            console.log('Connected to Socket.IO server');
+            const previouslyConnected = this.isConnected;
            this.isConnected = true;
            this.isConnecting = false;
+            this.lastConnectTime = Date.now();
+            this.retryAttempts = 0; // Reset retry counter on successful connect
+
+            // Calculate downtime if this is a reconnection
+            if (this.disconnectTime && previouslyConnected === false) {
+                const downtime = (Date.now() - this.disconnectTime) / 1000;
+                console.log(`Reconnected after ${downtime.toFixed(1)}s downtime`);
+
+                // Flush queued events after reconnection
+                this.flushEventQueue();
+            }
+
            this.notifyConnectionStatus('Connected', 'connected');

            // Emit connect callback
@@ -106,12 +138,25 @@ class SocketClient {
            console.log('Disconnected from server:', reason);
            this.isConnected = false;
            this.isConnecting = false;
+            this.disconnectTime = Date.now();
+
+            // Calculate uptime
+            if (this.lastConnectTime) {
+                const uptime = (Date.now() - this.lastConnectTime) / 1000;
+                console.log(`Connection uptime was ${uptime.toFixed(1)}s`);
+            }
+
            this.notifyConnectionStatus(`Disconnected: ${reason}`, 'disconnected');

            // Emit disconnect callback
            this.connectionCallbacks.disconnect.forEach(callback =>
                callback(reason)
            );
+
+            // Start auto-reconnect if it was an unexpected disconnect
+            if (reason === 'transport close' || reason === 'ping timeout') {
+                this.scheduleReconnect();
+            }
         });

         this.socket.on('connect_error', (error) => {
@@ -124,13 +169,20 @@ class SocketClient {
            this.addEvent({
                type: 'connection.error',
                timestamp: new Date().toISOString(),
-                data: {
+                data: {
+                    error: errorMsg,
+                    url: this.socket.io.uri,
+                    retry_attempt: this.retryAttempts
+                }
            });

            // Emit error callback
            this.connectionCallbacks.error.forEach(callback =>
                callback(errorMsg)
            );
+
+            // Schedule reconnect with backoff
+            this.scheduleReconnect();
         });

         // Primary event handler - this is what the server actually emits
@@ -143,6 +195,18 @@ class SocketClient {
            this.addEvent(transformedEvent);
         });

+        // Add ping/pong handlers for health monitoring
+        this.socket.on('ping', (data) => {
+            // console.log('Received ping from server');
+            this.lastPingTime = Date.now();
+
+            // Send pong response immediately
+            this.socket.emit('pong', {
+                timestamp: data.timestamp,
+                client_time: Date.now()
+            });
+        });
+
         // Session and event handlers (legacy/fallback)
         this.socket.on('session.started', (data) => {
            this.addEvent({ type: 'session', subtype: 'started', timestamp: new Date().toISOString(), data });
@@ -235,13 +299,144 @@ class SocketClient {
         this.isConnecting = false;
     }

+    /**
+     * Emit an event with retry support
+     * @param {string} event - Event name
+     * @param {any} data - Event data
+     * @param {Object} options - Options for retry behavior
+     */
+    emitWithRetry(event, data = null, options = {}) {
+        const {
+            maxRetries = 3,
+            retryDelays = [1000, 2000, 4000],
+            onSuccess = null,
+            onFailure = null
+        } = options;
+
+        const emissionId = `${event}_${Date.now()}_${Math.random()}`;
+
+        const attemptEmission = (attemptNum = 0) => {
+            if (!this.socket || !this.socket.connected) {
+                // Queue for later if disconnected
+                if (attemptNum === 0) {
+                    this.queueEvent(event, data);
+                    console.log(`Queued ${event} for later emission (disconnected)`);
+                    if (onFailure) onFailure('disconnected');
+                }
+                return;
+            }
+
+            try {
+                // Attempt emission
+                this.socket.emit(event, data);
+                console.log(`Emitted ${event} successfully`);
+
+                // Remove from pending
+                this.pendingEmissions.delete(emissionId);
+
+                if (onSuccess) onSuccess();
+
+            } catch (error) {
+                console.error(`Failed to emit ${event} (attempt ${attemptNum + 1}):`, error);
+
+                if (attemptNum < maxRetries - 1) {
+                    const delay = retryDelays[attemptNum] || retryDelays[retryDelays.length - 1];
+                    console.log(`Retrying ${event} in ${delay}ms...`);
+
+                    // Store pending emission
+                    this.pendingEmissions.set(emissionId, {
+                        event,
+                        data,
+                        attemptNum: attemptNum + 1,
+                        scheduledTime: Date.now() + delay
+                    });
+
+                    setTimeout(() => attemptEmission(attemptNum + 1), delay);
+                } else {
+                    console.error(`Failed to emit ${event} after ${maxRetries} attempts`);
+                    this.pendingEmissions.delete(emissionId);
+                    if (onFailure) onFailure('max_retries_exceeded');
+                }
+            }
+        };
+
+        attemptEmission();
+    }
+
+    /**
+     * Queue an event for later emission
+     * @param {string} event - Event name
+     * @param {any} data - Event data
+     */
+    queueEvent(event, data) {
+        if (this.eventQueue.length >= this.maxQueueSize) {
+            // Remove oldest event if queue is full
+            const removed = this.eventQueue.shift();
+            console.warn(`Event queue full, dropped oldest event: ${removed.event}`);
+        }
+
+        this.eventQueue.push({
+            event,
+            data,
+            timestamp: Date.now()
+        });
+    }
+
+    /**
+     * Flush queued events after reconnection
+     */
+    flushEventQueue() {
+        if (this.eventQueue.length === 0) return;
+
+        console.log(`Flushing ${this.eventQueue.length} queued events...`);
+        const events = [...this.eventQueue];
+        this.eventQueue = [];
+
+        // Emit each queued event with a small delay between them
+        events.forEach((item, index) => {
+            setTimeout(() => {
+                if (this.socket && this.socket.connected) {
+                    this.socket.emit(item.event, item.data);
+                    console.log(`Flushed queued event: ${item.event}`);
+                }
+            }, index * 100); // 100ms between each event
+        });
+    }
+
+    /**
+     * Schedule a reconnection attempt with exponential backoff
+     */
+    scheduleReconnect() {
+        if (this.retryAttempts >= this.maxRetryAttempts) {
+            console.log('Max reconnection attempts reached, stopping auto-reconnect');
+            this.notifyConnectionStatus('Reconnection failed', 'disconnected');
+            return;
+        }
+
+        const delay = this.retryDelays[this.retryAttempts] || this.retryDelays[this.retryDelays.length - 1];
+        this.retryAttempts++;
+
+        console.log(`Scheduling reconnect attempt ${this.retryAttempts}/${this.maxRetryAttempts} in ${delay}ms...`);
+        this.notifyConnectionStatus(`Reconnecting in ${delay/1000}s...`, 'connecting');
+
+        setTimeout(() => {
+            if (!this.isConnected && this.port) {
+                console.log(`Attempting reconnection ${this.retryAttempts}/${this.maxRetryAttempts}...`);
+                this.connect(this.port);
+            }
+        }, delay);
+    }
+
     /**
      * Request server status
      */
     requestStatus() {
         if (this.socket && this.socket.connected) {
            console.log('Requesting server status...');
-            this.
+            this.emitWithRetry('request.status', null, {
+                maxRetries: 2,
+                retryDelays: [500, 1000]
+            });
         }
     }

@@ -258,7 +453,13 @@ class SocketClient {
                event_types: options.event_types || []
            };
            console.log('Requesting event history...', params);
-            this.
+            this.emitWithRetry('get_history', params, {
+                maxRetries: 3,
+                retryDelays: [1000, 2000, 3000],
+                onFailure: (reason) => {
+                    console.error(`Failed to request history: ${reason}`);
+                }
+            });
         } else {
            console.warn('Cannot request history: not connected to server');
         }
@@ -554,6 +755,43 @@ class SocketClient {
         };
     }

+    /**
+     * Start health monitoring
+     * Detects stale connections and triggers reconnection
+     */
+    startHealthMonitoring() {
+        this.healthCheckInterval = setInterval(() => {
+            if (this.isConnected && this.lastPingTime) {
+                const timeSinceLastPing = Date.now() - this.lastPingTime;
+
+                if (timeSinceLastPing > this.pingTimeout) {
+                    console.warn(`No ping from server for ${timeSinceLastPing/1000}s, connection may be stale`);
+
+                    // Force reconnection
+                    if (this.socket) {
+                        console.log('Forcing reconnection due to stale connection...');
+                        this.socket.disconnect();
+                        setTimeout(() => {
+                            if (this.port) {
+                                this.connect(this.port);
+                            }
+                        }, 1000);
+                    }
+                }
+            }
+        }, 10000); // Check every 10 seconds
+    }
+
+    /**
+     * Stop health monitoring
+     */
+    stopHealthMonitoring() {
+        if (this.healthCheckInterval) {
+            clearInterval(this.healthCheckInterval);
+            this.healthCheckInterval = null;
+        }
+    }
+
     /**
      * Start periodic status check as fallback mechanism
      * This ensures the UI stays in sync with actual socket state
@@ -615,6 +853,35 @@ class SocketClient {
         }
     }

+    /**
+     * Clean up resources
+     */
+    destroy() {
+        this.stopHealthMonitoring();
+        if (this.socket) {
+            this.socket.disconnect();
+            this.socket = null;
+        }
+        this.eventQueue = [];
+        this.pendingEmissions.clear();
+    }
+
+    /**
+     * Get connection metrics
+     * @returns {Object} Connection metrics
+     */
+    getConnectionMetrics() {
+        return {
+            isConnected: this.isConnected,
+            uptime: this.lastConnectTime ? (Date.now() - this.lastConnectTime) / 1000 : 0,
+            lastPing: this.lastPingTime ? (Date.now() - this.lastPingTime) / 1000 : null,
+            queuedEvents: this.eventQueue.length,
+            pendingEmissions: this.pendingEmissions.size,
+            retryAttempts: this.retryAttempts
+        };
+    }
+}
+
 // ES6 Module export
 export { SocketClient };
 export default SocketClient;
```
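The JavaScript above is the actual change; the snippet below is only a compact model of the same client-side retry policy, written in Python for illustration. The delay list, attempt cap, and queue bound are copied from `retryDelays`, `maxRetryAttempts`, and `maxQueueSize` in the diff; everything else (names, the `OfflineQueue` wrapper) is hypothetical.

```python
from collections import deque

# Mirrors the client's constants from the diff above.
RETRY_DELAYS_MS = [1000, 2000, 4000]   # per-attempt reconnect backoff
MAX_RETRY_ATTEMPTS = 3
MAX_QUEUE_SIZE = 100

def reconnect_delay(attempt: int) -> int:
    """Delay before reconnect attempt `attempt` (0-based), capped at the last entry."""
    return RETRY_DELAYS_MS[min(attempt, len(RETRY_DELAYS_MS) - 1)]

class OfflineQueue:
    """Bounded FIFO that drops the oldest event when full, like the client's eventQueue."""
    def __init__(self) -> None:
        self.events = deque(maxlen=MAX_QUEUE_SIZE)

    def queue(self, event: str, data: dict) -> None:
        self.events.append((event, data))  # deque(maxlen=...) drops the oldest automatically

    def flush(self, emit) -> None:
        # On reconnect, replay everything that was captured while offline.
        while self.events:
            emit(*self.events.popleft())

if __name__ == "__main__":
    print([reconnect_delay(n) for n in range(MAX_RETRY_ATTEMPTS)])  # [1000, 2000, 4000]
```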
claude_mpm/hooks/claude_hooks/hook_handler.py

```diff
@@ -903,7 +903,9 @@ def main():
         )
         # Always output continue action to not block Claude
         print(json.dumps({"action": "continue"}))
-
+        # Only exit if this is a signal handler call, not atexit
+        if signum is not None:
+            sys.exit(0)

     # Register cleanup handlers
     signal.signal(signal.SIGTERM, cleanup_handler)
```
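The guard matters because the same cleanup function can serve both signals and normal interpreter exit: calling `sys.exit()` from an atexit callback only raises `SystemExit` during shutdown, while a real signal handler needs the explicit exit. A minimal standalone sketch of the pattern follows; the `SIGINT` and `atexit` registrations are illustrative, not lifted from the hook handler itself.

```python
import atexit
import json
import signal
import sys

def cleanup_handler(signum=None, frame=None):
    # Always tell Claude to continue, whether we leave via a signal or normally.
    print(json.dumps({"action": "continue"}))
    # Only exit explicitly for real signals; atexit callbacks already run during
    # interpreter shutdown, where sys.exit() would just raise SystemExit.
    if signum is not None:
        sys.exit(0)

signal.signal(signal.SIGTERM, cleanup_handler)
signal.signal(signal.SIGINT, cleanup_handler)   # assumption: also wired for Ctrl-C
atexit.register(cleanup_handler)                # assumption: same function reused at exit
```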
claude_mpm/hooks/claude_hooks/hook_wrapper.sh

```diff
@@ -45,12 +45,12 @@ export CLAUDE_MPM_HOOK_DEBUG="true"

 # Debug log (optional)
 echo "[$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)] PYTHONPATH: $PYTHONPATH" >> /tmp/hook-wrapper.log
-echo "[$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)] Running: $PYTHON_CMD
+echo "[$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)] Running: $PYTHON_CMD -m claude_mpm.hooks.claude_hooks.hook_handler" >> /tmp/hook-wrapper.log
 echo "[$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)] SOCKETIO_PORT: $CLAUDE_MPM_SOCKETIO_PORT" >> /tmp/hook-wrapper.log

-# Run the Python hook handler with error handling
+# Run the Python hook handler as a module with error handling
 # Use exec to replace the shell process, but wrap in error handling
-if ! "$PYTHON_CMD"
+if ! "$PYTHON_CMD" -m claude_mpm.hooks.claude_hooks.hook_handler "$@" 2>/tmp/hook-error.log; then
     # If the Python handler fails, always return continue to not block Claude
     echo '{"action": "continue"}'
     # Log the error for debugging
```
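Running the handler with `-m` makes Python resolve it as a package module rather than a bare script, so its package-relative imports work regardless of the working directory. A hedged Python sketch of the same invocation, mirroring what the shell wrapper does (the subprocess call and the fallback behavior here are illustrative):

```python
import json
import subprocess
import sys

def run_hook_handler(args: list[str]) -> str:
    """Invoke the hook handler as a module, falling back to 'continue' on failure."""
    cmd = [sys.executable, "-m", "claude_mpm.hooks.claude_hooks.hook_handler", *args]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return result.stdout
    except subprocess.CalledProcessError:
        # Mirror the wrapper: never block Claude if the handler fails.
        return json.dumps({"action": "continue"})
```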
claude_mpm/scripts/socketio_daemon.py: file without changes
claude_mpm/scripts/start_activity_logging.py: file without changes
claude_mpm/services/agents/memory/agent_memory_manager.py

```diff
@@ -82,7 +82,7 @@ class AgentMemoryManager(MemoryServiceInterface):
         self._logger_name = None

         self.config = config or Config()
-        self.project_root = get_path_manager().
+        self.project_root = get_path_manager().project_root
         # Use current working directory by default, not project root
         self.working_directory = working_directory or Path(os.getcwd())
         self.memories_dir = self.working_directory / ".claude-mpm" / "memories"
```
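The corrected line reads `project_root` off the path manager, but as the surrounding context notes, memories still resolve under the current working directory. A small sketch of that derivation (the example path is illustrative):

```python
import os
from pathlib import Path

working_directory = Path(os.getcwd())            # default when no directory is passed in
memories_dir = working_directory / ".claude-mpm" / "memories"
print(memories_dir)                              # e.g. /home/user/my-project/.claude-mpm/memories
```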
claude_mpm/services/socketio/handlers/connection.py

```diff
@@ -5,13 +5,87 @@ disconnect, status requests, and history management. Separating these
 from other handlers makes connection management more maintainable.
 """

+import asyncio
+import functools
+import time
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Callable, Dict, List, Optional, Set

 from ....core.typing_utils import ClaudeStatus, EventData, SocketId
 from .base import BaseEventHandler


+def timeout_handler(timeout_seconds: float = 5.0):
+    """Decorator to add timeout protection to async handlers.
+
+    WHY: Network operations can hang indefinitely, causing resource leaks
+    and poor user experience. This decorator ensures handlers complete
+    within a reasonable time or fail gracefully.
+
+    Args:
+        timeout_seconds: Maximum time allowed for handler execution (default: 5s)
+    """
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        async def wrapper(self, *args, **kwargs):
+            handler_name = func.__name__
+            start_time = time.time()
+
+            try:
+                # Create a task with timeout
+                result = await asyncio.wait_for(
+                    func(self, *args, **kwargs),
+                    timeout=timeout_seconds
+                )
+
+                elapsed = time.time() - start_time
+                if elapsed > timeout_seconds * 0.8:  # Warn if close to timeout
+                    self.logger.warning(
+                        f"⚠️ Handler {handler_name} took {elapsed:.2f}s "
+                        f"(close to {timeout_seconds}s timeout)"
+                    )
+
+                return result
+
+            except asyncio.TimeoutError:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} timed out after {elapsed:.2f}s"
+                )
+
+                # Try to send error response to client if we have their sid
+                if args and isinstance(args[0], str):  # First arg is usually sid
+                    sid = args[0]
+                    try:
+                        # Use a short timeout for error response
+                        await asyncio.wait_for(
+                            self.emit_to_client(
+                                sid,
+                                "error",
+                                {
+                                    "message": f"Handler {handler_name} timed out",
+                                    "handler": handler_name,
+                                    "timeout": timeout_seconds
+                                }
+                            ),
+                            timeout=1.0
+                        )
+                    except:
+                        pass  # Best effort error notification
+
+                return None
+
+            except Exception as e:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} failed after {elapsed:.2f}s: {e}"
+                )
+                raise
+
+        return wrapper
+    return decorator
+
+
 class ConnectionEventHandler(BaseEventHandler):
     """Handles Socket.IO connection lifecycle events.

@@ -19,11 +93,189 @@ class ConnectionEventHandler(BaseEventHandler):
     that deserves its own focused handler. This includes client connections,
     disconnections, status updates, and event history management.
     """
+
+    def __init__(self, server):
+        """Initialize connection handler with health monitoring.
+
+        WHY: We need to track connection health metrics and implement
+        ping/pong mechanism for detecting stale connections.
+        """
+        super().__init__(server)
+
+        # Connection health tracking
+        self.connection_metrics = {}
+        self.last_ping_times = {}
+        self.ping_interval = 30  # seconds
+        self.ping_timeout = 10  # seconds
+        self.stale_check_interval = 60  # seconds
+
+        # Health monitoring tasks (will be started after event registration)
+        self.ping_task = None
+        self.stale_check_task = None

+    def _start_health_monitoring(self):
+        """Start background tasks for connection health monitoring.
+
+        WHY: We need to actively monitor connection health to detect
+        and clean up stale connections, ensuring reliable event delivery.
+        """
+        # Only start if we have a valid event loop and tasks aren't already running
+        if hasattr(self.server, 'core') and hasattr(self.server.core, 'loop'):
+            loop = self.server.core.loop
+            if loop and not loop.is_closed():
+                if not self.ping_task or self.ping_task.done():
+                    self.ping_task = asyncio.run_coroutine_threadsafe(
+                        self._periodic_ping(), loop
+                    )
+                    self.logger.info("🏓 Started connection ping monitoring")
+
+                if not self.stale_check_task or self.stale_check_task.done():
+                    self.stale_check_task = asyncio.run_coroutine_threadsafe(
+                        self._check_stale_connections(), loop
+                    )
+                    self.logger.info("🧹 Started stale connection checker")
+
+    def stop_health_monitoring(self):
+        """Stop health monitoring tasks.
+
+        WHY: Clean shutdown requires stopping background tasks to
+        prevent errors and resource leaks.
+        """
+        if self.ping_task and not self.ping_task.done():
+            self.ping_task.cancel()
+            self.logger.info("🚫 Stopped connection ping monitoring")
+
+        if self.stale_check_task and not self.stale_check_task.done():
+            self.stale_check_task.cancel()
+            self.logger.info("🚫 Stopped stale connection checker")
+
+    async def _periodic_ping(self):
+        """Send periodic pings to all connected clients.
+
+        WHY: WebSocket connections can silently fail. Regular pings
+        help detect dead connections and maintain connection state.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.ping_interval)
+
+                if not self.clients:
+                    continue
+
+                current_time = time.time()
+                disconnected = []
+
+                for sid in list(self.clients):
+                    try:
+                        # Send ping and record time
+                        await self.sio.emit('ping', {'timestamp': current_time}, room=sid)
+                        self.last_ping_times[sid] = current_time
+
+                        # Update connection metrics
+                        if sid not in self.connection_metrics:
+                            self.connection_metrics[sid] = {
+                                'connected_at': current_time,
+                                'reconnects': 0,
+                                'failures': 0,
+                                'last_activity': current_time
+                            }
+                        self.connection_metrics[sid]['last_activity'] = current_time
+
+                    except Exception as e:
+                        self.logger.warning(f"Failed to ping client {sid}: {e}")
+                        disconnected.append(sid)
+
+                # Clean up failed connections
+                for sid in disconnected:
+                    await self._cleanup_stale_connection(sid)
+
+                if self.clients:
+                    self.logger.debug(
+                        f"🏓 Sent pings to {len(self.clients)} clients, "
+                        f"{len(disconnected)} failed"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error in periodic ping: {e}")
+
+    async def _check_stale_connections(self):
+        """Check for and clean up stale connections.
+
+        WHY: Some clients may not properly disconnect, leaving zombie
+        connections that consume resources and prevent proper cleanup.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.stale_check_interval)
+
+                current_time = time.time()
+                stale_threshold = current_time - (self.ping_timeout + self.ping_interval)
+                stale_sids = []
+
+                for sid in list(self.clients):
+                    last_ping = self.last_ping_times.get(sid, 0)
+
+                    if last_ping < stale_threshold:
+                        stale_sids.append(sid)
+                        self.logger.warning(
+                            f"🧟 Detected stale connection {sid} "
+                            f"(last ping: {current_time - last_ping:.1f}s ago)"
+                        )
+
+                # Clean up stale connections
+                for sid in stale_sids:
+                    await self._cleanup_stale_connection(sid)
+
+                if stale_sids:
+                    self.logger.info(
+                        f"🧹 Cleaned up {len(stale_sids)} stale connections"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error checking stale connections: {e}")
+
+    async def _cleanup_stale_connection(self, sid: str):
+        """Clean up a stale or dead connection.
+
+        WHY: Proper cleanup prevents memory leaks and ensures
+        accurate connection tracking.
+        """
+        try:
+            if sid in self.clients:
+                self.clients.remove(sid)
+
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+
+            if sid in self.connection_metrics:
+                metrics = self.connection_metrics[sid]
+                uptime = time.time() - metrics.get('connected_at', 0)
+                self.logger.info(
+                    f"📊 Connection {sid} stats - uptime: {uptime:.1f}s, "
+                    f"reconnects: {metrics.get('reconnects', 0)}, "
+                    f"failures: {metrics.get('failures', 0)}"
+                )
+                del self.connection_metrics[sid]
+
+            # Force disconnect if still connected
+            try:
+                await self.sio.disconnect(sid)
+            except:
+                pass  # Already disconnected
+
+            self.logger.info(f"🔌 Cleaned up stale connection: {sid}")
+
+        except Exception as e:
+            self.logger.error(f"Error cleaning up connection {sid}: {e}")
+
     def register_events(self) -> None:
         """Register connection-related event handlers."""
+
+        # Start health monitoring now that we're registering events
+        self._start_health_monitoring()

         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def connect(sid, environ, *args):
             """Handle client connection.

@@ -72,6 +324,7 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.log_error(f"sending welcome to client {sid}", e)

         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def disconnect(sid):
             """Handle client disconnection.

@@ -86,8 +339,15 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.logger.warning(
                     f"⚠️ Attempted to disconnect unknown client: {sid}"
                 )
+
+            # Clean up health tracking
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+            if sid in self.connection_metrics:
+                del self.connection_metrics[sid]

         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def get_status(sid):
             """Handle status request.

@@ -105,6 +365,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "status", status_data)

         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def get_history(sid, data=None):
             """Handle history request.

@@ -118,6 +379,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)

         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def request_history(sid, data=None):
             """Handle legacy history request (for client compatibility).

@@ -131,6 +393,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)

         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def subscribe(sid, data=None):
             """Handle subscription request.

@@ -141,6 +404,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "subscribed", {"channels": channels})

         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def claude_event(sid, data):
             """Handle events from client proxies.

@@ -198,6 +462,25 @@ class ConnectionEventHandler(BaseEventHandler):
             self.logger.info(f"📡 Broadcasting claude_event to all clients except {sid}")
             await self.broadcast_event("claude_event", data, skip_sid=sid)
             self.logger.info(f"✅ Broadcast complete")
+
+        @self.sio.event
+        async def pong(sid, data=None):
+            """Handle pong response from client.
+
+            WHY: Clients respond to our pings with pongs, confirming
+            they're still alive and the connection is healthy.
+            """
+            current_time = time.time()
+
+            # Update last activity time
+            if sid in self.connection_metrics:
+                self.connection_metrics[sid]['last_activity'] = current_time
+
+            # Calculate round-trip time if timestamp provided
+            if data and 'timestamp' in data:
+                rtt = current_time - data['timestamp']
+                if rtt < 10:  # Reasonable RTT
+                    self.logger.debug(f"🏓 Pong from {sid}, RTT: {rtt*1000:.1f}ms")

     def _normalize_event(self, event_data: Dict[str, Any]) -> Dict[str, Any]:
         """Normalize event format to ensure consistency.
```
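The `timeout_handler` decorator above is the full implementation; the reduced sketch below shows the core pattern in isolation, so the later hunks that merely add `@timeout_handler(...)` above each handler are easier to read. It is a simplified stand-in, not the shipped decorator: it wraps the handler in `asyncio.wait_for` and turns a hang into a logged timeout.

```python
import asyncio
import functools
import logging

logger = logging.getLogger(__name__)

def with_timeout(timeout_seconds: float = 5.0):
    """Simplified version of the timeout_handler pattern shown in the diff."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
            except asyncio.TimeoutError:
                logger.error("%s timed out after %.1fs", func.__name__, timeout_seconds)
                return None
        return wrapper
    return decorator

@with_timeout(timeout_seconds=1.0)
async def slow_handler(sid: str) -> str:
    await asyncio.sleep(5)          # simulates a hung network operation
    return f"done for {sid}"

if __name__ == "__main__":
    print(asyncio.run(slow_handler("client-1")))   # prints None after ~1s instead of hanging
```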
claude_mpm/services/socketio/server/broadcaster.py

```diff
@@ -10,12 +10,144 @@ to create focused, testable modules with single responsibilities.
 """

 import asyncio
+import time
+from collections import deque
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Deque, Dict, List, Optional, Set

 from ....core.logging_config import get_logger


+@dataclass
+class RetryableEvent:
+    """Represents an event that can be retried on failure.
+
+    WHY: Network failures are common and transient. By tracking retry
+    attempts, we can recover from temporary issues while avoiding
+    infinite retry loops.
+    """
+    event_type: str
+    data: Dict[str, Any]
+    attempt_count: int = 0
+    max_retries: int = 3
+    created_at: float = None
+    last_attempt: float = None
+    skip_sid: Optional[str] = None
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = time.time()
+        if self.last_attempt is None:
+            self.last_attempt = time.time()
+
+    def should_retry(self) -> bool:
+        """Check if this event should be retried.
+
+        WHY: We need to balance reliability with resource usage.
+        Events older than 30 seconds or with too many attempts
+        should be abandoned.
+        """
+        if self.attempt_count >= self.max_retries:
+            return False
+
+        # Don't retry events older than 30 seconds
+        if time.time() - self.created_at > 30:
+            return False
+
+        return True
+
+    def get_backoff_delay(self) -> float:
+        """Calculate exponential backoff delay.
+
+        WHY: Exponential backoff prevents overwhelming the system
+        during recovery from failures.
+        """
+        base_delay = 1.0  # 1 second
+        max_delay = 8.0  # 8 seconds max
+
+        delay = min(base_delay * (2 ** self.attempt_count), max_delay)
+        return delay
+
+
+class RetryQueue:
+    """Manages retry queue for failed event broadcasts.
+
+    WHY: Transient network issues shouldn't cause event loss.
+    This queue provides resilient event delivery with backoff.
+    """
+
+    def __init__(self, max_size: int = 1000):
+        self.queue: Deque[RetryableEvent] = deque(maxlen=max_size)
+        self.lock = asyncio.Lock()
+        self.stats = {
+            'queued': 0,
+            'retried': 0,
+            'succeeded': 0,
+            'abandoned': 0
+        }
+
+    async def add(self, event: RetryableEvent) -> None:
+        """Add an event to the retry queue."""
+        async with self.lock:
+            self.queue.append(event)
+            self.stats['queued'] += 1
+
+    async def get_ready_events(self) -> List[RetryableEvent]:
+        """Get events that are ready for retry.
+
+        WHY: We need to respect backoff delays to avoid
+        overwhelming the system during recovery.
+        """
+        async with self.lock:
+            current_time = time.time()
+            ready = []
+
+            # Check each event in queue
+            remaining = []
+            for event in self.queue:
+                if not event.should_retry():
+                    self.stats['abandoned'] += 1
+                    continue
+
+                # First attempt (attempt_count == 0) should be immediate
+                if event.attempt_count == 0:
+                    ready.append(event)
+                else:
+                    # For retries, check backoff delay
+                    time_since_attempt = current_time - event.last_attempt
+                    if time_since_attempt >= event.get_backoff_delay():
+                        ready.append(event)
+                    else:
+                        remaining.append(event)
+
+            # Update queue with events not ready yet
+            self.queue.clear()
+            self.queue.extend(remaining)
+
+            return ready
+
+    async def mark_success(self, event: RetryableEvent) -> None:
+        """Mark an event as successfully sent."""
+        self.stats['succeeded'] += 1
+
+    async def mark_retry(self, event: RetryableEvent) -> None:
+        """Mark an event for retry."""
+        event.attempt_count += 1
+        event.last_attempt = time.time()
+        self.stats['retried'] += 1
+
+        if event.should_retry():
+            await self.add(event)
+
+    def get_stats(self) -> Dict[str, int]:
+        """Get retry queue statistics."""
+        return {
+            **self.stats,
+            'queue_size': len(self.queue)
+        }
+
+
 class SocketIOEventBroadcaster:
     """Handles broadcasting events to connected Socket.IO clients.

@@ -41,9 +173,113 @@ class SocketIOEventBroadcaster:
         self.logger = logger
         self.loop = None  # Will be set by main server
         self.server = server  # Reference to main server for event history
-
-
-
+
+        # Initialize retry queue for resilient delivery
+        self.retry_queue = RetryQueue(max_size=1000)
+        self.retry_task = None
+        self.retry_interval = 2.0  # Process retry queue every 2 seconds
+
+    def start_retry_processor(self):
+        """Start the background retry processor.
+
+        WHY: Failed broadcasts need to be retried automatically
+        to ensure reliable event delivery.
+        """
+        if self.loop and not self.retry_task:
+            self.retry_task = asyncio.create_task(self._process_retry_queue())
+            self.logger.info("🔄 Started retry queue processor")
+
+    def stop_retry_processor(self):
+        """Stop the background retry processor."""
+        if self.retry_task:
+            self.retry_task.cancel()
+            self.retry_task = None
+            self.logger.info("🚫 Stopped retry queue processor")
+
+    async def _process_retry_queue(self):
+        """Process the retry queue periodically.
+
+        WHY: Regular processing ensures failed events are retried
+        with appropriate backoff delays.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.retry_interval)
+
+                # Get events ready for retry
+                ready_events = await self.retry_queue.get_ready_events()
+
+                if ready_events:
+                    self.logger.debug(
+                        f"🔄 Processing {len(ready_events)} events from retry queue"
+                    )
+
+                    for event in ready_events:
+                        success = await self._retry_broadcast(event)
+
+                        if success:
+                            await self.retry_queue.mark_success(event)
+                        else:
+                            await self.retry_queue.mark_retry(event)
+
+                # Log stats periodically
+                stats = self.retry_queue.get_stats()
+                if stats['retried'] > 0 or stats['abandoned'] > 0:
+                    self.logger.info(
+                        f"📊 Retry queue stats - "
+                        f"queued: {stats['queued']}, "
+                        f"retried: {stats['retried']}, "
+                        f"succeeded: {stats['succeeded']}, "
+                        f"abandoned: {stats['abandoned']}, "
+                        f"current size: {stats['queue_size']}"
+                    )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Error processing retry queue: {e}")
+
+    async def _retry_broadcast(self, event: RetryableEvent) -> bool:
+        """Retry broadcasting a failed event.
+
+        WHY: Isolated retry logic allows for special handling
+        and metrics tracking of retry attempts.
+        """
+        try:
+            self.logger.debug(
+                f"🔄 Retrying {event.event_type} (attempt {event.attempt_count + 1}/{event.max_retries})"
+            )
+
+            # Reconstruct the full event
+            full_event = {
+                "type": event.event_type,
+                "timestamp": datetime.now().isoformat(),
+                "data": event.data,
+                "retry_attempt": event.attempt_count + 1
+            }
+
+            # Attempt broadcast
+            if event.skip_sid:
+                await self.sio.emit("claude_event", full_event, skip_sid=event.skip_sid)
+            else:
+                await self.sio.emit("claude_event", full_event)
+
+            self.logger.debug(f"✅ Successfully retried {event.event_type}")
+            return True
+
+        except Exception as e:
+            self.logger.warning(
+                f"⚠️ Retry failed for {event.event_type} "
+                f"(attempt {event.attempt_count + 1}): {e}"
+            )
+            return False
+
+    def broadcast_event(self, event_type: str, data: Dict[str, Any], skip_sid: Optional[str] = None):
+        """Broadcast an event to all connected clients with retry support.
+
+        WHY: Enhanced with retry queue to ensure reliable delivery
+        even during transient network issues.
+        """
         if not self.sio:
             return

@@ -65,15 +301,27 @@ class SocketIOEventBroadcaster:
             self.logger.debug(f"Added {event_type} to history (total: {len(self.server.event_history)})")

         # Broadcast to all connected clients
+        broadcast_success = False
         try:
             # Use run_coroutine_threadsafe to safely call from any thread
             if hasattr(self, "loop") and self.loop and not self.loop.is_closed():
-
-
-
-
-
-
+                # Create broadcast coroutine
+                if skip_sid:
+                    coro = self.sio.emit("claude_event", event, skip_sid=skip_sid)
+                else:
+                    coro = self.sio.emit("claude_event", event)
+
+                future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+
+                # Wait briefly to see if broadcast succeeds
+                try:
+                    future.result(timeout=0.5)  # 500ms timeout
+                    broadcast_success = True
+                    self.stats["events_sent"] += 1
+                    self.logger.debug(f"Broadcasted event: {event_type}")
+                except:
+                    # Will be added to retry queue below
+                    pass
             else:
                 self.logger.warning(
                     f"Cannot broadcast {event_type}: server loop not available"
@@ -81,6 +329,24 @@ class SocketIOEventBroadcaster:

         except Exception as e:
             self.logger.error(f"Failed to broadcast event {event_type}: {e}")
+
+        # Add to retry queue if broadcast failed
+        if not broadcast_success and self.loop:
+            retryable_event = RetryableEvent(
+                event_type=event_type,
+                data=data,
+                skip_sid=skip_sid
+            )
+
+            # Queue for retry
+            asyncio.run_coroutine_threadsafe(
+                self.retry_queue.add(retryable_event),
+                self.loop
+            )
+
+            self.logger.warning(
+                f"⚠️ Queued {event_type} for retry (queue size: {len(self.retry_queue.queue)})"
+            )

     def session_started(self, session_id: str, launch_method: str, working_dir: str):
         """Notify that a session has started."""
```
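The retry policy in `RetryableEvent` is worth spelling out: the delay doubles per attempt from 1s and is capped at 8s, and an event is abandoned after 3 attempts or once it is older than 30 seconds. A quick sketch of that schedule, with the constants taken from the class above:

```python
def backoff_delay(attempt_count: int, base_delay: float = 1.0, max_delay: float = 8.0) -> float:
    """Same formula as RetryableEvent.get_backoff_delay()."""
    return min(base_delay * (2 ** attempt_count), max_delay)

# Attempts 0..3 wait 1s, 2s, 4s, 8s between tries; with max_retries=3 an event is
# retried at most three times and dropped outright once it is 30s old.
print([backoff_delay(n) for n in range(4)])   # [1.0, 2.0, 4.0, 8.0]
```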
claude_mpm/services/socketio/server/main.py

```diff
@@ -119,6 +119,9 @@ class SocketIOServer(SocketIOServiceInterface):

         # Set the loop reference for broadcaster
         self.broadcaster.loop = self.core.loop
+
+        # Start the retry processor for resilient event delivery
+        self.broadcaster.start_retry_processor()

         # Register events
         self._register_events()
@@ -128,11 +131,22 @@ class SocketIOServer(SocketIOServiceInterface):
         self.stats["start_time"] = self.core.stats["start_time"]

         self.logger.info(
-            f"SocketIO server started successfully on {self.host}:{self.port}"
+            f"SocketIO server started successfully on {self.host}:{self.port} with retry queue enabled"
         )

     def stop_sync(self):
         """Stop the Socket.IO server (synchronous version)."""
+        # Stop the retry processor if running
+        if self.broadcaster:
+            self.broadcaster.stop_retry_processor()
+
+        # Stop health monitoring in connection handler
+        if self.event_registry:
+            from ..handlers import ConnectionEventHandler
+            conn_handler = self.event_registry.get_handler(ConnectionEventHandler)
+            if conn_handler and hasattr(conn_handler, 'stop_health_monitoring'):
+                conn_handler.stop_health_monitoring()
+
         self.core.stop_sync()
         self.running = False

```
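The ordering this diff establishes is: on start, hand the event loop to the broadcaster, start the retry processor, then register events (which also kicks off ping monitoring); on stop, shut those background tasks down before stopping the core. A condensed, hypothetical sketch of that lifecycle (method names follow the diff, the surrounding class is purely illustrative):

```python
class ServerLifecycleSketch:
    """Illustrative model of the start/stop ordering from the diff, not the real server."""

    def __init__(self, core, broadcaster, connection_handler):
        self.core = core
        self.broadcaster = broadcaster
        self.connection_handler = connection_handler

    def start(self):
        self.broadcaster.loop = self.core.loop          # loop first, so retries can be scheduled
        self.broadcaster.start_retry_processor()        # background retry queue
        self.connection_handler.register_events()       # also starts ping/stale-check tasks

    def stop(self):
        self.broadcaster.stop_retry_processor()         # stop retries before the loop goes away
        self.connection_handler.stop_health_monitoring()
        self.core.stop_sync()
```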
{claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/RECORD

```diff
@@ -106,7 +106,7 @@ claude_mpm/core/framework_loader.py,sha256=E_l5hrqwrGvAWCE-CHwjqAQErjxdzwgJ7AY31
 claude_mpm/core/hook_manager.py,sha256=N-54QwNsyrUZSrH533HAo6hlJKPFbgMeDirg92H8en8,11014
 claude_mpm/core/hook_performance_config.py,sha256=Cqf-89mjiLlFJ-v4PhxfzsaOVovYYQa5pa-RMm42Ki8,5748
 claude_mpm/core/injectable_service.py,sha256=HXuviYX-h0sNxOFzgZiYjf-gLDUnQIeG-0-KuWdfCzk,7397
-claude_mpm/core/interactive_session.py,sha256=
+claude_mpm/core/interactive_session.py,sha256=bYrQDEYUvKAoQEEU8CVBlqE2nkDhLqJQCijbGJQnHxw,19002
 claude_mpm/core/interfaces.py,sha256=unClq54EgOIamVICS0G_LJzUa4gLXixqfDxiAvNjEkk,26914
 claude_mpm/core/lazy.py,sha256=to0DBciTUnWL9UO8o5bCVSgFCRHE1VtSrgX_B8qYg0Y,14724
 claude_mpm/core/logger.py,sha256=lzwREb0SBEXL-IkCRnJHK_GKuRaegNOx4CJltfAsNAE,15774
@@ -162,7 +162,7 @@ claude_mpm/dashboard/static/dist/components/ui-state-manager.js,sha256=PtZs6sxNh
 claude_mpm/dashboard/static/dist/components/working-directory.js,sha256=7V1pBeuxWOxZVlDp3kZvivTHAfzucv2ceBXWP6SlZTY,15339
 claude_mpm/dashboard/static/js/dashboard.js,sha256=4u-oV6k_AUj8PneNXbCoDnYiGG2y6e6eHr5JIisfTXg,60178
 claude_mpm/dashboard/static/js/extension-error-handler.js,sha256=DZHrJ3gbfv4nsjmZpNMj-Sc3GKjVJ5ds8lgoaLRnq5I,6274
-claude_mpm/dashboard/static/js/socket-client.js,sha256=
+claude_mpm/dashboard/static/js/socket-client.js,sha256=miNfAKfJr5z4br5eOpZMVoKLS5LBYO0Rp2q6I0xroW4,32437
 claude_mpm/dashboard/static/js/components/agent-inference.js,sha256=BDqgr1o0DlcgzNa5KwDsNODelQvXA-tab0O1PeAnVtM,27269
 claude_mpm/dashboard/static/js/components/event-processor.js,sha256=_GnAz8pxN1iyXw0O4AIR482QFyQAigEKO9IDUOUbGqc,24844
 claude_mpm/dashboard/static/js/components/event-viewer.js,sha256=aHF__HZHZsfSkpELb4GAIGxNrFGAiBcizrWW00bTG9c,33105
@@ -188,8 +188,8 @@ claude_mpm/hooks/validation_hooks.py,sha256=w3bSApxZE6hfLnBLVEs62DpJ6grxOr1-JZn1
 claude_mpm/hooks/claude_hooks/__init__.py,sha256=b4mud_g3S-3itHY_Dzpbb_SmdMEcJwtUU8fTcqpLqqs,130
 claude_mpm/hooks/claude_hooks/connection_pool.py,sha256=8bluFsgiM6HUJuNw-Dl7RkuzLR1vVMWGdEwucp9r3gc,6508
 claude_mpm/hooks/claude_hooks/event_handlers.py,sha256=rLaKV_pV8NOCwbJQtg7At0xYNfYcCDiajSge7TiiuPk,30081
-claude_mpm/hooks/claude_hooks/hook_handler.py,sha256=
-claude_mpm/hooks/claude_hooks/hook_wrapper.sh,sha256=
+claude_mpm/hooks/claude_hooks/hook_handler.py,sha256=quEOyJ6pKwxPdzoFt9RaFHa9u88nNdLN7mvg6_a-5F0,38649
+claude_mpm/hooks/claude_hooks/hook_wrapper.sh,sha256=xcNT3yIx9srBSd_9aosBp9w45JwaZHDlsp5ocio9cM8,2405
 claude_mpm/hooks/claude_hooks/memory_integration.py,sha256=fxkawPwMkMCGxJ5Ki_PJ7blN1vtQXEYqGxxQPJytOq4,8801
 claude_mpm/hooks/claude_hooks/response_tracking.py,sha256=Cm5uyWgMLC5HZF0FA0xhP7bhQ1vC1H3Cak8wfzzW5MU,13887
 claude_mpm/hooks/claude_hooks/tool_analysis.py,sha256=z75UOImtbECItB0pleRUSeNXqjh0jk3dMWq6eekPQw8,7981
@@ -292,7 +292,7 @@ claude_mpm/services/agents/management/__init__.py,sha256=dZcFI9EBajH2KbCoNQWAFQ0
 claude_mpm/services/agents/management/agent_capabilities_generator.py,sha256=2j5yoyCXhsK41Bpr8gDAS6jXKhTI_lyYPkTa2djPls4,7002
 claude_mpm/services/agents/management/agent_management_service.py,sha256=ek8qGMHqyAb2kks4GG0GpS123dl-zyhRas9pnn6fUy8,23027
 claude_mpm/services/agents/memory/__init__.py,sha256=AcO-bZajiUBjnp1UNNMJyKFR3F2CDTgn5b0NEjN9Oiw,667
-claude_mpm/services/agents/memory/agent_memory_manager.py,sha256=
+claude_mpm/services/agents/memory/agent_memory_manager.py,sha256=ITLrLTTvJcw5XOrHQOYhNgQsuqax6XsiTeSEv3zBi6M,23914
 claude_mpm/services/agents/memory/agent_persistence_service.py,sha256=mx67hEZtpMG-fom_qUvOL0U8HSWam1F55LJj4g4k3nw,2783
 claude_mpm/services/agents/memory/analyzer.py,sha256=dDpOquis1OjaHKr5IlHS125BpCFjxh8hng6kN-6eHFw,16973
 claude_mpm/services/agents/memory/content_manager.py,sha256=LifCY4VLFtWxwOPd8pKXWCTo2_p6xlGbHocSS9s6THo,13103
@@ -371,7 +371,7 @@ claude_mpm/services/socketio/__init__.py,sha256=FJ2BYTa1i_E9FUdjEuJ1FV1Joq9I5rNx
 claude_mpm/services/socketio/client_proxy.py,sha256=ISLDS0fXOi2a4txcZ9KTEk2Ywu4wrG08CmdYUlIezvU,6654
 claude_mpm/services/socketio/handlers/__init__.py,sha256=-lTRs1zdzDEf_QhT5ajE1_Z27MxKaFC4CzFtWxjK5ss,789
 claude_mpm/services/socketio/handlers/base.py,sha256=YaidwKRhDNhntQvfGTMX0XVkwdNfbYASopRJ9sFLbcA,4858
-claude_mpm/services/socketio/handlers/connection.py,sha256=
+claude_mpm/services/socketio/handlers/connection.py,sha256=6GCG_wWbxQrKYp1nNPhO2AGZzP5yYW5RtYPfUv_nB7E,24646
 claude_mpm/services/socketio/handlers/file.py,sha256=xYrzaZCQLkNvWvt3Ebnd7w66eaMhTr5ClxAvFMFDEIg,8431
 claude_mpm/services/socketio/handlers/git.py,sha256=sVmPKDC5VQb5ukzFelgTYu52M-wZntnANNNxrTewhVY,37861
 claude_mpm/services/socketio/handlers/hook.py,sha256=HrrcnRu7FfbbOFwLRTBxZEtXhVBBJatJlSTMvhhqiso,7658
@@ -379,9 +379,9 @@ claude_mpm/services/socketio/handlers/memory.py,sha256=0RltTwhXPhBZEpqSYFoXmVuSJ
 claude_mpm/services/socketio/handlers/project.py,sha256=_SbSRZ54IivtHWtfl4sKfbgRQL6sKaqVM5Rsxvql5JQ,789
 claude_mpm/services/socketio/handlers/registry.py,sha256=LiPfkRLGj0LLicWapRTnDzoqpPLr8UeE_TrXhZONqaU,6002
 claude_mpm/services/socketio/server/__init__.py,sha256=a5SYqKHCER3ger9dv4_P-RmLtND6pHd-2VFSrP04Gqo,640
-claude_mpm/services/socketio/server/broadcaster.py,sha256=
+claude_mpm/services/socketio/server/broadcaster.py,sha256=xRlEQBtIHxgnE2x7t0WN9PlPDaThudS-5FcZl2rFOc4,19063
 claude_mpm/services/socketio/server/core.py,sha256=mvIIV80WLOPWhs16JM17nA9Nwoo9j8anBc-sjaalp4c,14433
-claude_mpm/services/socketio/server/main.py,sha256=
+claude_mpm/services/socketio/server/main.py,sha256=74tMkSW0kC3kgJjYhW89kjiwfYT6liIuUg78PbmH08M,12654
 claude_mpm/services/version_control/__init__.py,sha256=xj13GwrNw4t1KieGlmw62hj14HPj2BWP2Tlv-9E_tTE,1510
 claude_mpm/services/version_control/branch_strategy.py,sha256=XsmXwB98-zoTGKsM27fPS813tcKaJwgB7DWilgKKhio,23232
 claude_mpm/services/version_control/conflict_resolution.py,sha256=RM5cFxlEmvqS3SeCwMYAkhTu5UTIGlhI2Kgh2Yw7VO0,26381
@@ -410,9 +410,9 @@ claude_mpm/utils/subprocess_utils.py,sha256=e74VlIPozCljZss_0SwLO3J1ZuIKRT9FrrFi
 claude_mpm/validation/__init__.py,sha256=YZhwE3mhit-lslvRLuwfX82xJ_k4haZeKmh4IWaVwtk,156
 claude_mpm/validation/agent_validator.py,sha256=szbK9d29v_T6xE_KvW845WLKXbS_yYpXQscrSrSeldI,20937
 claude_mpm/validation/frontmatter_validator.py,sha256=Q_aTyjigfKZQG7eSwLtwMfd0h_eyS9FQAm59ZFf5VYA,7036
-claude_mpm-4.0.
-claude_mpm-4.0.
-claude_mpm-4.0.
-claude_mpm-4.0.
-claude_mpm-4.0.
-claude_mpm-4.0.
+claude_mpm-4.0.11.dist-info/licenses/LICENSE,sha256=lpaivOlPuBZW1ds05uQLJJswy8Rp_HMNieJEbFlqvLk,1072
+claude_mpm-4.0.11.dist-info/METADATA,sha256=cgnjym6qmxFMd0WUk4-n3OLlkkGGnzOk3ec6NJ1--cE,11819
+claude_mpm-4.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+claude_mpm-4.0.11.dist-info/entry_points.txt,sha256=uafLVeHm4AXrzvdR77fXO3a2MmvvfGtmVU2iY8uIPt8,403
+claude_mpm-4.0.11.dist-info/top_level.txt,sha256=1nUg3FEaBySgm8t-s54jK5zoPnu3_eY6EP6IOlekyHA,11
+claude_mpm-4.0.11.dist-info/RECORD,,
```

{claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/WHEEL: file without changes
{claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/entry_points.txt: file without changes
{claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/licenses/LICENSE: file without changes
{claude_mpm-4.0.10.dist-info → claude_mpm-4.0.11.dist-info}/top_level.txt: file without changes