jarviscore_framework-0.1.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in the supported public registries, and is provided for informational purposes only.
- examples/calculator_agent_example.py +77 -0
- examples/multi_agent_workflow.py +132 -0
- examples/research_agent_example.py +76 -0
- jarviscore/__init__.py +54 -0
- jarviscore/cli/__init__.py +7 -0
- jarviscore/cli/__main__.py +33 -0
- jarviscore/cli/check.py +404 -0
- jarviscore/cli/smoketest.py +371 -0
- jarviscore/config/__init__.py +7 -0
- jarviscore/config/settings.py +128 -0
- jarviscore/core/__init__.py +7 -0
- jarviscore/core/agent.py +163 -0
- jarviscore/core/mesh.py +463 -0
- jarviscore/core/profile.py +64 -0
- jarviscore/docs/API_REFERENCE.md +932 -0
- jarviscore/docs/CONFIGURATION.md +753 -0
- jarviscore/docs/GETTING_STARTED.md +600 -0
- jarviscore/docs/TROUBLESHOOTING.md +424 -0
- jarviscore/docs/USER_GUIDE.md +983 -0
- jarviscore/execution/__init__.py +94 -0
- jarviscore/execution/code_registry.py +298 -0
- jarviscore/execution/generator.py +268 -0
- jarviscore/execution/llm.py +430 -0
- jarviscore/execution/repair.py +283 -0
- jarviscore/execution/result_handler.py +332 -0
- jarviscore/execution/sandbox.py +555 -0
- jarviscore/execution/search.py +281 -0
- jarviscore/orchestration/__init__.py +18 -0
- jarviscore/orchestration/claimer.py +101 -0
- jarviscore/orchestration/dependency.py +143 -0
- jarviscore/orchestration/engine.py +292 -0
- jarviscore/orchestration/status.py +96 -0
- jarviscore/p2p/__init__.py +23 -0
- jarviscore/p2p/broadcaster.py +353 -0
- jarviscore/p2p/coordinator.py +364 -0
- jarviscore/p2p/keepalive.py +361 -0
- jarviscore/p2p/swim_manager.py +290 -0
- jarviscore/profiles/__init__.py +6 -0
- jarviscore/profiles/autoagent.py +264 -0
- jarviscore/profiles/customagent.py +137 -0
- jarviscore_framework-0.1.0.dist-info/METADATA +136 -0
- jarviscore_framework-0.1.0.dist-info/RECORD +55 -0
- jarviscore_framework-0.1.0.dist-info/WHEEL +5 -0
- jarviscore_framework-0.1.0.dist-info/licenses/LICENSE +21 -0
- jarviscore_framework-0.1.0.dist-info/top_level.txt +3 -0
- tests/conftest.py +44 -0
- tests/test_agent.py +165 -0
- tests/test_autoagent.py +140 -0
- tests/test_autoagent_day4.py +186 -0
- tests/test_customagent.py +248 -0
- tests/test_integration.py +293 -0
- tests/test_llm_fallback.py +185 -0
- tests/test_mesh.py +356 -0
- tests/test_p2p_integration.py +375 -0
- tests/test_remote_sandbox.py +116 -0

jarviscore/p2p/keepalive.py (new file)
@@ -0,0 +1,361 @@
"""
P2P Keepalive Manager for maintaining active ZMQ connections in agent mesh.

Prevents idle connection closure by sending periodic keepalive messages
while intelligently suppressing when real workflow traffic exists.
"""

import asyncio
import time
import logging
from typing import Optional, Dict, Any, Callable
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)


class CircuitState(Enum):
    """Circuit breaker states for connection health."""
    CLOSED = "CLOSED"
    HALF_OPEN = "HALF_OPEN"
    OPEN = "OPEN"
    UNKNOWN = "UNKNOWN"


@dataclass
class KeepaliveMetrics:
    """Metrics for P2P keepalive health monitoring."""
    keepalives_sent: int = 0
    keepalives_received: int = 0
    acks_received: int = 0
    timeouts: int = 0
    suppressed_count: int = 0
    last_successful_keepalive: float = 0.0
    last_keepalive_latency: float = 0.0
    circuit_breaker_events: int = 0


class P2PKeepaliveManager:
    """
    Manages P2P keepalive messages to prevent ZMQ connection idle closure.

    Features:
    - Periodic keepalive with configurable interval
    - Smart suppression when recent workflow traffic exists
    - Circuit breaker integration for adaptive behavior
    - Health metrics and observability
    - Bidirectional keepalive + ACK pattern
    """

    def __init__(
        self,
        agent_id: str,
        send_p2p_callback: Callable[[str, str, Dict[str, Any]], bool],
        broadcast_p2p_callback: Optional[Callable[[str, Dict[str, Any]], int]] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize P2P Keepalive Manager.

        Args:
            agent_id: Unique identifier for this agent
            send_p2p_callback: Function to send P2P message to specific peer
            broadcast_p2p_callback: Optional function to broadcast to all peers
            config: Configuration dictionary
        """
        self.agent_id = agent_id
        self.send_p2p_message = send_p2p_callback
        self.broadcast_p2p_message = broadcast_p2p_callback

        # Configuration with production defaults
        config = config or {}
        self.enabled = config.get('P2P_KEEPALIVE_ENABLED', True)
        self.interval = config.get('P2P_KEEPALIVE_INTERVAL', 90)  # 90s default
        self.timeout = config.get('P2P_KEEPALIVE_TIMEOUT', 10)  # 10s timeout
        self.activity_suppress_window = config.get('P2P_ACTIVITY_SUPPRESS_WINDOW', 60)  # 60s
        self.circuit_half_open_interval = config.get('P2P_CIRCUIT_HALF_OPEN_INTERVAL', 30)  # 30s aggressive

        # State tracking
        self.last_p2p_activity = time.time()  # Track any P2P activity
        self.last_keepalive_sent = 0.0
        self.pending_keepalives: Dict[str, float] = {}  # peer_id -> sent_time
        self.circuit_state = CircuitState.UNKNOWN

        # Metrics
        self.metrics = KeepaliveMetrics()

        # Control
        self._running = False
        self._keepalive_task: Optional[asyncio.Task] = None

        logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Initialized with interval={self.interval}s, "
                    f"suppress_window={self.activity_suppress_window}s, enabled={self.enabled}")

    async def start(self):
        """Start the keepalive loop."""
        if not self.enabled:
            logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Disabled by configuration")
            return

        if self._running:
            logger.warning(f"P2P_KEEPALIVE ({self.agent_id}): Already running")
            return

        self._running = True
        self._keepalive_task = asyncio.create_task(self._keepalive_loop())
        logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Started keepalive loop")

    async def stop(self):
        """Stop the keepalive loop."""
        self._running = False

        if self._keepalive_task:
            self._keepalive_task.cancel()
            try:
                await self._keepalive_task
            except asyncio.CancelledError:
                pass

        logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Stopped keepalive loop")

    def record_p2p_activity(self):
        """
        Record that P2P activity occurred (workflow message, nudge, broadcast).
        Used for smart suppression of keepalives.
        """
        self.last_p2p_activity = time.time()

    def update_circuit_state(self, state: CircuitState):
        """
        Update circuit breaker state for adaptive keepalive behavior.

        Args:
            state: Current circuit breaker state
        """
        if state != self.circuit_state:
            logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Circuit state changed: "
                        f"{self.circuit_state.value} -> {state.value}")
            self.circuit_state = state
            self.metrics.circuit_breaker_events += 1

    def _should_send_keepalive(self) -> bool:
        """
        Determine if keepalive should be sent based on smart suppression logic.

        Returns:
            True if keepalive should be sent, False if suppressed
        """
        current_time = time.time()

        # Check if recent P2P activity exists
        time_since_activity = current_time - self.last_p2p_activity
        if time_since_activity < self.activity_suppress_window:
            logger.debug(f"P2P_KEEPALIVE ({self.agent_id}): Suppressed - recent activity "
                         f"{time_since_activity:.1f}s ago")
            self.metrics.suppressed_count += 1
            return False

        # Check interval based on circuit state
        interval = self._get_adaptive_interval()
        time_since_last_keepalive = current_time - self.last_keepalive_sent

        if time_since_last_keepalive < interval:
            return False

        return True

    def _get_adaptive_interval(self) -> float:
        """
        Get adaptive keepalive interval based on circuit breaker state.

        Returns:
            Keepalive interval in seconds
        """
        if self.circuit_state == CircuitState.HALF_OPEN:
            # Aggressive keepalives to help circuit recovery
            return self.circuit_half_open_interval
        elif self.circuit_state == CircuitState.OPEN:
            # Try to trigger recovery probes
            return self.circuit_half_open_interval
        else:
            # Normal operation
            return self.interval

    async def _keepalive_loop(self):
        """Main keepalive loop with adaptive timing."""
        logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Keepalive loop started")

        # Initial delay to allow P2P mesh to stabilize
        await asyncio.sleep(30)

        while self._running:
            try:
                if self._should_send_keepalive():
                    await self._send_keepalive()

                # Check for keepalive timeouts
                await self._check_timeouts()

                # Log metrics periodically
                await self._log_metrics()

                # Sleep with adaptive interval
                await asyncio.sleep(10)  # Check every 10s, send based on interval

            except Exception as e:
                logger.error(f"P2P_KEEPALIVE ({self.agent_id}): Error in keepalive loop: {e}",
                             exc_info=True)
                await asyncio.sleep(30)  # Back off on error

        logger.info(f"P2P_KEEPALIVE ({self.agent_id}): Keepalive loop stopped")

    async def _send_keepalive(self):
        """Send keepalive message to all peers."""
        try:
            current_time = time.time()

            payload = {
                'agent_id': self.agent_id,
                'timestamp': current_time,
                'circuit_state': self.circuit_state.value,
                'metrics': {
                    'sent': self.metrics.keepalives_sent,
                    'received': self.metrics.keepalives_received,
                    'acks': self.metrics.acks_received
                }
            }

            # Broadcast keepalive to all peers
            if self.broadcast_p2p_message:
                success_count = await self.broadcast_p2p_message('P2P_KEEPALIVE', payload)

                if success_count > 0:
                    self.last_keepalive_sent = current_time
                    self.metrics.keepalives_sent += 1
                    logger.debug(f"P2P_KEEPALIVE ({self.agent_id}): Sent keepalive to {success_count} peers")
                else:
                    logger.warning(f"P2P_KEEPALIVE ({self.agent_id}): Failed to send keepalive to any peer")
            else:
                logger.warning(f"P2P_KEEPALIVE ({self.agent_id}): No broadcast callback available")

        except Exception as e:
            logger.error(f"P2P_KEEPALIVE ({self.agent_id}): Error sending keepalive: {e}")

    async def handle_keepalive_received(self, sender_id: str, payload: Dict[str, Any]):
        """
        Handle incoming keepalive message from peer.

        Args:
            sender_id: ID of the peer that sent keepalive
            payload: Keepalive message payload
        """
        try:
            self.metrics.keepalives_received += 1
            logger.debug(f"P2P_KEEPALIVE ({self.agent_id}): Received keepalive from {sender_id}")

            # Send ACK back to sender
            ack_payload = {
                'agent_id': self.agent_id,
                'timestamp': time.time(),
                'original_timestamp': payload.get('timestamp')
            }

            # Send ACK using direct message (not broadcast)
            if self.send_p2p_message:
                success = await self.send_p2p_message(sender_id, 'P2P_KEEPALIVE_ACK', ack_payload)
                if success:
                    logger.debug(f"P2P_KEEPALIVE ({self.agent_id}): Sent ACK to {sender_id}")
                else:
                    logger.warning(f"P2P_KEEPALIVE ({self.agent_id}): Failed to send ACK to {sender_id}")

        except Exception as e:
            logger.error(f"P2P_KEEPALIVE ({self.agent_id}): Error handling keepalive: {e}")

    async def handle_keepalive_ack(self, sender_id: str, payload: Dict[str, Any]):
        """
        Handle incoming keepalive ACK from peer.

        Args:
            sender_id: ID of the peer that sent ACK
            payload: ACK message payload
        """
        try:
            self.metrics.acks_received += 1
            current_time = time.time()

            # Calculate latency if original timestamp available
            original_timestamp = payload.get('original_timestamp')
            if original_timestamp:
                latency = current_time - original_timestamp
                self.metrics.last_keepalive_latency = latency
                logger.debug(f"P2P_KEEPALIVE ({self.agent_id}): ACK from {sender_id}, "
                             f"latency={latency*1000:.1f}ms")

            self.metrics.last_successful_keepalive = current_time

            # Remove from pending if tracked
            if sender_id in self.pending_keepalives:
                del self.pending_keepalives[sender_id]

        except Exception as e:
            logger.error(f"P2P_KEEPALIVE ({self.agent_id}): Error handling ACK: {e}")

    async def _check_timeouts(self):
        """Check for keepalive timeouts and clean up pending requests."""
        current_time = time.time()
        timed_out = []

        for peer_id, sent_time in self.pending_keepalives.items():
            if current_time - sent_time > self.timeout:
                timed_out.append(peer_id)

        for peer_id in timed_out:
            del self.pending_keepalives[peer_id]
            self.metrics.timeouts += 1
            logger.warning(f"P2P_KEEPALIVE ({self.agent_id}): Keepalive timeout for peer {peer_id}")

    async def _log_metrics(self):
        """Periodically log keepalive metrics."""
        current_time = time.time()

        # Log every 5 minutes
        if current_time % 300 < 10:
            logger.info(
                f"P2P_KEEPALIVE_METRICS ({self.agent_id}): "
                f"Sent={self.metrics.keepalives_sent}, "
                f"Received={self.metrics.keepalives_received}, "
                f"ACKs={self.metrics.acks_received}, "
                f"Timeouts={self.metrics.timeouts}, "
                f"Suppressed={self.metrics.suppressed_count}, "
                f"Circuit_Events={self.metrics.circuit_breaker_events}, "
                f"Last_Latency={self.metrics.last_keepalive_latency*1000:.1f}ms"
            )

    def get_health_status(self) -> Dict[str, Any]:
        """
        Get current health status and metrics.

        Returns:
            Dictionary with health status and metrics
        """
        current_time = time.time()
        time_since_last_success = current_time - self.metrics.last_successful_keepalive

        return {
            'enabled': self.enabled,
            'running': self._running,
            'circuit_state': self.circuit_state.value,
            'last_activity': current_time - self.last_p2p_activity,
            'last_keepalive': current_time - self.last_keepalive_sent,
            'last_success': time_since_last_success,
            'metrics': {
                'sent': self.metrics.keepalives_sent,
                'received': self.metrics.keepalives_received,
                'acks': self.metrics.acks_received,
                'timeouts': self.metrics.timeouts,
                'suppressed': self.metrics.suppressed_count,
                'circuit_events': self.metrics.circuit_breaker_events,
                'latency_ms': self.metrics.last_keepalive_latency * 1000
            }
        }
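
The sketch below shows how such a manager might be wired up; it is not from the package, and the stub callbacks (`send_to_peer`, `broadcast`) are placeholders for a real transport layer. Note that although the constructor's type hints declare plain `Callable`s returning `bool` and `int`, the manager `await`s both callbacks, so in practice they must be coroutine functions:

```python
import asyncio

from jarviscore.p2p.keepalive import P2PKeepaliveManager, CircuitState


async def send_to_peer(peer_id: str, msg_type: str, payload: dict) -> bool:
    # Stand-in: deliver one message over your ZMQ layer; True on success.
    return True


async def broadcast(msg_type: str, payload: dict) -> int:
    # Stand-in: fan out to every connected peer; return the success count.
    return 1


async def main():
    manager = P2PKeepaliveManager(
        agent_id="agent-1",
        send_p2p_callback=send_to_peer,
        broadcast_p2p_callback=broadcast,
        config={"P2P_KEEPALIVE_INTERVAL": 90, "P2P_ACTIVITY_SUPPRESS_WINDOW": 60},
    )
    await manager.start()
    manager.record_p2p_activity()                    # suppresses keepalives for the next 60s
    manager.update_circuit_state(CircuitState.CLOSED)
    print(manager.get_health_status())
    await manager.stop()


asyncio.run(main())
```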

jarviscore/p2p/swim_manager.py (new file)
@@ -0,0 +1,290 @@
"""
SWIM Thread Manager for JarvisCore Framework
Runs SWIM protocol in dedicated thread to prevent GIL blocking from CPU-bound workflow operations.

Adapted from integration-agent/src/swim_thread_manager.py
- Updated imports to use jarviscore.config
- Kept core functionality identical
"""
import asyncio
import logging
import threading
import time
from typing import Optional

from swim.transport.hybrid import HybridTransport
from swim.protocol.node import Node
from swim.config import get_config as get_swim_config, validate_config as validate_swim_config
from swim.events.dispatcher import EventDispatcher
from swim.integration.agent import ZMQAgentIntegration
from swim.main import SWIMZMQBridge, parse_address as swim_parse_address

logger = logging.getLogger(__name__)


class SWIMThreadManager:
    """Manages SWIM node in a dedicated thread with its own event loop."""

    def __init__(self, config: Optional[dict] = None):
        """
        Initialize SWIM Thread Manager.

        Args:
            config: Configuration dictionary (uses defaults if not provided)
        """
        self.config = config or {}
        self.swim_loop: Optional[asyncio.AbstractEventLoop] = None
        self.swim_thread: Optional[threading.Thread] = None
        self.swim_node = None
        self.zmq_agent = None
        self.swim_zmq_bridge = None
        self.event_dispatcher = None
        self.bind_addr = None  # Store bind address for node_id access
        self._started = False
        self._initialized = threading.Event()
        self._shutdown_event = threading.Event()
        self._init_error: Optional[str] = None

    def start_swim_in_thread_simple(self):
        """
        Start SWIM in dedicated thread using configuration.
        """
        if self._started:
            logger.warning("SWIM thread already started")
            return

        logger.info("Starting SWIM in dedicated thread...")
        self.swim_thread = threading.Thread(
            target=self._run_swim_loop,
            daemon=True,
            name="SWIM-Protocol-Thread"
        )
        self.swim_thread.start()
        self._started = True
        logger.info("SWIM thread started")

    def _run_swim_loop(self):
        """Run SWIM in dedicated event loop (runs in thread)."""
        try:
            # Create new event loop for this thread
            self.swim_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.swim_loop)

            logger.info("SWIM thread event loop created")

            # Initialize SWIM
            self.swim_loop.run_until_complete(self._init_swim())

            if not self.swim_node:
                logger.error("SWIM initialization failed in thread")
                self._init_error = "Failed to create SWIM node"
                self._initialized.set()
                return

            logger.info("✅ SWIM initialized successfully in dedicated thread")
            logger.info(" SWIM will never be blocked by workflow execution!")

            # Signal that initialization is complete
            self._initialized.set()

            # Run event loop until shutdown
            self._run_until_shutdown()

        except Exception as e:
            logger.error(f"Error in SWIM thread: {e}", exc_info=True)
            self._init_error = str(e)
            self._initialized.set()
        finally:
            if self.swim_loop:
                try:
                    self.swim_loop.close()
                except Exception as e:
                    logger.error(f"Error closing SWIM loop: {e}")

    async def _init_swim(self):
        """Initialize SWIM components."""
        try:
            # Get configuration with defaults
            bind_host = self.config.get('bind_host', '127.0.0.1')
            bind_port = self.config.get('bind_port', 7946)
            node_name = self.config.get('node_name', 'jarviscore-node')
            seed_nodes = self.config.get('seed_nodes', '')
            transport_type = self.config.get('transport_type', 'hybrid')
            zmq_port_offset = self.config.get('zmq_port_offset', 1000)

            # Parse bind address
            self.bind_addr = swim_parse_address(f"{bind_host}:{bind_port}")
            logger.info(f"SWIM bind address: {self.bind_addr}")

            # Parse seed nodes
            seed_addrs = []
            if seed_nodes:
                for seed in seed_nodes.split(','):
                    if seed.strip():
                        seed_addrs.append(swim_parse_address(seed.strip()))
            logger.info(f"SWIM seed nodes: {seed_addrs}")

            # Get SWIM config
            swim_config = get_swim_config()
            swim_config.update({
                "NODE_NAME": node_name,
                "ZMQ_ENABLED": True,
                "SEND_ON_JOIN": True,
                "ZMQ_PORT_OFFSET": zmq_port_offset,
                "TRANSPORT_TYPE": transport_type,
                "STABILITY_TIMEOUT_SECONDS": 3.0
            })

            # Validate config
            errors = validate_swim_config(swim_config)
            if errors:
                logger.error(f"SWIM config validation errors: {errors}")
                return

            # Create transport
            transport = HybridTransport(
                udp_max_size=swim_config.get("UDP_MAX_SIZE", 1400),
                tcp_buffer_size=swim_config.get("TCP_BUFFER_SIZE", 65536),
                tcp_max_connections=swim_config.get("TCP_MAX_CONNECTIONS", 128)
            )

            # Create event dispatcher
            self.event_dispatcher = EventDispatcher(
                max_history_size=swim_config.get("EVENT_HISTORY", 1000),
                enable_history=swim_config.get("EVENTS_ENABLED", True)
            )

            # Create SWIM node
            logger.info("Creating SWIM node in dedicated thread...")
            self.swim_node = await Node.create(
                bind_addr=self.bind_addr,
                transport=transport,
                seed_addrs=seed_addrs,
                config=swim_config,
                event_dispatcher=self.event_dispatcher,
                validate_ports=True
            )

            if not self.swim_node:
                logger.error("Failed to create SWIM node")
                return

            logger.info(f"SWIM node created at {self.bind_addr}")

            # Setup ZMQ integration
            zmq_port = self.bind_addr[1] + swim_config.get("ZMQ_PORT_OFFSET", zmq_port_offset)
            zmq_addr = f"{self.bind_addr[0]}:{zmq_port}"
            node_id = f"{self.bind_addr[0]}:{self.bind_addr[1]}"
            logger.info(f"Setting up ZMQ integration at {zmq_addr}")

            self.zmq_agent = ZMQAgentIntegration(
                node_id=node_id,
                bind_address=zmq_addr,
                event_dispatcher=self.event_dispatcher,
                config=swim_config
            )

            # Start ZMQ agent
            logger.info("Starting ZMQ agent...")
            await self.zmq_agent.start()
            logger.info("ZMQ agent started successfully")

            # Setup SWIM-ZMQ Bridge
            logger.info("Setting up SWIM-ZMQ Bridge...")
            self.swim_zmq_bridge = SWIMZMQBridge(self.swim_node, self.zmq_agent, swim_config)
            await self.swim_zmq_bridge.start()
            logger.info("SWIM-ZMQ Bridge started successfully")

            # Start the SWIM protocol
            logger.info("Starting SWIM protocol...")
            await self.swim_node.start()
            logger.info("SWIM node started successfully")

        except Exception as e:
            logger.error(f"Error initializing SWIM: {e}", exc_info=True)
            raise

    def _run_until_shutdown(self):
        """Keep SWIM event loop running until shutdown requested."""
        while not self._shutdown_event.is_set():
            try:
                # Process events with timeout so we can check shutdown flag
                self.swim_loop.run_until_complete(asyncio.sleep(0.5))
            except Exception as e:
                logger.error(f"Error in SWIM event loop: {e}")

        logger.info("SWIM thread shutdown requested")

        # Cleanup SWIM components
        try:
            if self.swim_zmq_bridge and hasattr(self.swim_zmq_bridge, 'stop'):
                self.swim_loop.run_until_complete(self.swim_zmq_bridge.stop())
            if self.zmq_agent:
                self.swim_loop.run_until_complete(self.zmq_agent.stop())
            if self.swim_node:
                self.swim_loop.run_until_complete(self.swim_node.stop())
        except Exception as e:
            logger.error(f"Error during SWIM shutdown: {e}")

    def wait_for_init(self, timeout: float = 20.0) -> bool:
        """
        Wait for SWIM to initialize.

        Args:
            timeout: Maximum time to wait in seconds

        Returns:
            True if initialized successfully, False if timeout or error
        """
        logger.info(f"Waiting for SWIM initialization (timeout: {timeout}s)...")

        if self._initialized.wait(timeout=timeout):
            if self._init_error:
                logger.error(f"SWIM initialization failed: {self._init_error}")
                return False
            if self.swim_node and self.zmq_agent:
                logger.info("SWIM initialization confirmed")
                return True
            else:
                logger.error("SWIM initialization incomplete")
                return False
        else:
            logger.error(f"SWIM initialization timeout after {timeout}s")
            return False

    def is_healthy(self) -> bool:
        """Check if SWIM thread is healthy."""
        return (
            self._started and
            self.swim_thread is not None and
            self.swim_thread.is_alive() and
            self.swim_node is not None and
            self.zmq_agent is not None
        )

    def get_status(self) -> dict:
        """Get SWIM thread status."""
        return {
            'started': self._started,
            'thread_alive': self.swim_thread.is_alive() if self.swim_thread else False,
            'swim_node': self.swim_node is not None,
            'zmq_agent': self.zmq_agent is not None,
            'bridge': self.swim_zmq_bridge is not None,
            'healthy': self.is_healthy()
        }

    def shutdown(self, timeout: float = 10.0):
        """Shutdown SWIM thread gracefully."""
        if not self._started:
            return

        logger.info("Shutting down SWIM thread...")
        self._shutdown_event.set()

        if self.swim_thread:
            self.swim_thread.join(timeout=timeout)
            if self.swim_thread.is_alive():
                logger.warning("SWIM thread did not exit cleanly")

        self._started = False
        logger.info("SWIM thread shutdown complete")