puda-comms 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puda_comms/command_service.py +87 -48
- puda_comms/machine_client.py +142 -61
- puda_comms/models.py +5 -4
- {puda_comms-0.0.2.dist-info → puda_comms-0.0.4.dist-info}/METADATA +34 -6
- puda_comms-0.0.4.dist-info/RECORD +8 -0
- puda_comms-0.0.2.dist-info/RECORD +0 -8
- {puda_comms-0.0.2.dist-info → puda_comms-0.0.4.dist-info}/WHEEL +0 -0
puda_comms/command_service.py
CHANGED
@@ -10,14 +10,13 @@ This service handles:
 import asyncio
 import json
 import logging
-import os
 import signal
 from datetime import datetime, timezone
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional
 import nats
 from nats.js.client import JetStreamContext
 from nats.aio.msg import Msg
-from puda_comms.models import CommandRequest,
+from puda_comms.models import CommandRequest, CommandResponseStatus, NATSMessage, MessageHeader, MessageType

 logger = logging.getLogger(__name__)

@@ -38,7 +37,7 @@ class ResponseHandler:
     def __init__(self, js: JetStreamContext, machine_id: str):
         self.js = js
         self.machine_id = machine_id
-        self._pending_responses: Dict[str,
+        self._pending_responses: Dict[str, Dict[str, Any]] = {}  # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
         self._queue_consumer = None
         self._immediate_consumer = None
         self._initialized = False
@@ -103,8 +102,8 @@ class ResponseHandler:

             # Get the pending response
             pending = self._pending_responses[key]
-            # Store the
-            pending['response'] = message
+            # Store the NATSMessage directly
+            pending['response'] = message
             # Signal that response was received
             # Don't delete here - let get_response() delete it after retrieval
             pending['event'].set()
@@ -153,7 +152,7 @@ class ResponseHandler:
         }
         return event

-    def get_response(self, run_id: str, step_number: int) -> Optional[
+    def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
         """
         Get the response for a pending command.

@@ -162,7 +161,7 @@ class ResponseHandler:
             step_number: Step number for the command

         Returns:
-            The NATSMessage
+            The NATSMessage if available, None otherwise
         """
         key = f"{run_id}:{str(step_number)}"
         if key in self._pending_responses:
@@ -178,8 +177,16 @@ class ResponseHandler:
         if key in self._pending_responses:
             del self._pending_responses[key]

+    def cancel_all_pending(self):
+        """Cancel all pending responses by setting their events. This wakes up any waiting tasks immediately."""
+        for pending in self._pending_responses.values():
+            pending['event'].set()
+
     async def cleanup(self):
         """Clean up subscriptions."""
+        # Cancel all pending responses first to wake up waiting tasks
+        self.cancel_all_pending()
+
         if self._queue_consumer:
             try:
                 await self._queue_consumer.unsubscribe()
@@ -200,11 +207,6 @@ class CommandService:
     Handles connection management, command parsing, and response handling.
     Can send commands to multiple machines.

-    Supports async context manager usage for automatic cleanup:
-        async with CommandService() as service:
-            await service.send_queue_command(...)
-            # Automatically disconnects on exit
-
     Automatically registers signal handlers (SIGTERM, SIGINT) for graceful shutdown.
     """

@@ -212,20 +214,19 @@ class CommandService:

     def __init__(
         self,
-        servers:
+        servers: list[str]
     ):
         """
         Initialize NATS service.

         Args:
-            servers: List of NATS server URLs.
+            servers: List of NATS server URLs. Must be a non-empty list.
+
+        Raises:
+            ValueError: If servers is None or empty.
         """
-        if servers is None:
-            nats_servers_env = os.getenv(
-                "NATS_SERVERS",
-                "nats://192.168.50.201:4222,nats://192.168.50.201:4223,nats://192.168.50.201:4224"
-            )
-            servers = [s.strip() for s in nats_servers_env.split(",")]
+        if servers is None or len(servers) == 0:
+            raise ValueError("Please provide a non-empty list of NATS server URLs")

         self.servers = servers
         self.nc: Optional[nats.NATS] = None
@@ -254,24 +255,50 @@ class CommandService:
         """
         Connect to NATS servers.

+        Limits connection attempts to 3. After 3 failed attempts, gives up and logs error.
+
         Returns:
             True if connected successfully, False otherwise
         """
         if self._connected:
             return True

-
-
-
-
-
-
-
-
-
-
-
-
+        max_attempts = 3
+        connect_timeout = 3  # 3 seconds timeout per connection attempt
+
+        for attempt in range(1, max_attempts + 1):
+            try:
+                logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
+                self.nc = await asyncio.wait_for(
+                    nats.connect(
+                        servers=self.servers,
+                        connect_timeout=connect_timeout,
+                        reconnect_time_wait=2,
+                        max_reconnect_attempts=0  # No reconnection during initial connection
+                    ),
+                    timeout=connect_timeout + 1  # Slightly longer timeout for the wait_for
+                )
+                self.js = self.nc.jetstream()
+
+                self._connected = True
+                logger.info("Connected to NATS servers")
+                return True
+
+            except asyncio.TimeoutError:
+                logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
+                if attempt < max_attempts:
+                    logger.info("Retrying connection...")
+                else:
+                    logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
+            except Exception as e:
+                logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
+                if attempt < max_attempts:
+                    logger.info("Retrying connection...")
+                else:
+                    logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
+
+        self._connected = False
+        return False

     async def _get_response_handler(self, machine_id: str) -> ResponseHandler:
         """
@@ -316,6 +343,8 @@ class CommandService:
         request: CommandRequest,
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
@@ -325,6 +354,8 @@ class CommandService:
             request: CommandRequest model containing command details
             machine_id: Machine ID to send the command to
             run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
             timeout: Maximum time to wait for response in seconds

         Returns:
@@ -337,8 +368,8 @@ class CommandService:
         subject = f"{NAMESPACE}.{machine_id}.cmd.queue"

         logger.info(
-            "Sending queue command:
-
+            "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
+            subject, request.name, run_id, request.step_number
         )

         # Get or create response handler for this machine
@@ -347,7 +378,7 @@ class CommandService:
         response_event = response_handler.register_pending(run_id, request.step_number)

         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id)
+        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)

         try:
             # Publish to JetStream
@@ -370,11 +401,7 @@ class CommandService:
                 await asyncio.sleep(0.1)

             # Get the response
-
-            if response_data is None:
-                return None
-
-            return NATSMessage.model_validate(response_data)
+            return response_handler.get_response(run_id, request.step_number)

         except Exception as e:
             logger.error("Error sending queue command: %s", e)
@@ -387,6 +414,8 @@ class CommandService:
         requests: list[CommandRequest],
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
@@ -400,6 +429,8 @@ class CommandService:
             requests: List of CommandRequest models to send sequentially
             machine_id: Machine ID to send the commands to
             run_id: Run ID for all commands
+            user_id: User ID who initiated the commands
+            username: Username who initiated the commands
             timeout: Maximum time to wait for each response in seconds

         Returns:
@@ -435,6 +466,8 @@ class CommandService:
                 request=request,
                 machine_id=machine_id,
                 run_id=run_id,
+                user_id=user_id,
+                username=username,
                 timeout=timeout
             )

@@ -495,6 +528,8 @@ class CommandService:
         request: CommandRequest,
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
@@ -504,6 +539,8 @@ class CommandService:
             request: CommandRequest model containing command details
             machine_id: Machine ID to send the command to
             run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
             timeout: Maximum time to wait for response in seconds

         Returns:
@@ -528,7 +565,7 @@ class CommandService:
         response_received = response_handler.register_pending(run_id, request.step_number)

         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id)
+        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)

         try:
             # Publish to JetStream
@@ -551,11 +588,7 @@ class CommandService:
                 await asyncio.sleep(0.1)

             # Get the response
-
-            if response_data is None:
-                return None
-
-            return NATSMessage.model_validate(response_data)
+            return response_handler.get_response(run_id, request.step_number)

         except Exception as e:
             logger.error("Error sending immediate command: %s", e)
@@ -608,7 +641,9 @@ class CommandService:
         self,
         command_request: CommandRequest,
         machine_id: str,
-        run_id: str
+        run_id: str,
+        user_id: str,
+        username: str
     ) -> NATSMessage:
         """
         Build a command payload in the expected format.
@@ -617,6 +652,8 @@ class CommandService:
             command_request: CommandRequest model containing command details
             machine_id: Machine ID for the command
             run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command

         Returns:
             NATSMessage object ready for NATS transmission
@@ -625,6 +662,8 @@ class CommandService:
             message_type=MessageType.COMMAND,
             version="1.0",
             timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
+            user_id=user_id,
+            username=username,
             machine_id=machine_id,
             run_id=run_id
         )
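Taken together, the command_service.py changes drop the `NATS_SERVERS` environment-variable fallback, cap the connection logic at three attempts, and thread `user_id`/`username` through every send path. A minimal sketch of the 0.0.4 call pattern follows; the `connect()` method name and the exact `CommandRequest` fields beyond `name` and `step_number` are assumptions, not confirmed by this diff:

```python
import asyncio

from puda_comms.command_service import CommandService
from puda_comms.models import CommandRequest

async def main() -> None:
    # 0.0.4: servers must be passed explicitly; None or an empty list raises ValueError.
    service = CommandService(servers=["nats://192.168.50.201:4222"])

    if not await service.connect():  # assumed method name; retries up to 3 times, then returns False
        return

    # 0.0.4: user_id and username are now required arguments.
    reply = await service.send_queue_command(
        request=CommandRequest(name="home", step_number=1),  # hypothetical command fields
        machine_id="first",
        run_id="092073e6-13d0-4756-8d99-eff1612a5a72",
        user_id="user123",
        username="John Doe",
        timeout=60,
    )
    print(reply)  # Optional[NATSMessage]; None on timeout or publish error

asyncio.run(main())
```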
puda_comms/machine_client.py
CHANGED
@@ -20,7 +20,7 @@ from puda_comms.models import (
     ImmediateCommand,
 )
 from nats.js.client import JetStreamContext
-from nats.js.api import StreamConfig
+from nats.js.api import StreamConfig, ConsumerConfig
 from nats.js.errors import NotFoundError
 from nats.aio.msg import Msg

@@ -69,11 +69,13 @@ class MachineClient:

         # Default subscriptions
         self._cmd_queue_sub = None
+        self._cmd_queue_task = None  # Background task for pull consumer
         self._cmd_immediate_sub = None

         # Connection state
         self._is_connected = False
-        self.
+        self._queue_handler = None
+        self._immediate_handler = None

         # Queue control state
         self._pause_lock = asyncio.Lock()
@@ -184,30 +186,22 @@ class MachineClient:
             logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
             raise

-    async def
-        """Ensure
+    async def _ensure_all_streams(self):
+        """Ensure all required streams exist with correct retention policies."""
         await self._ensure_stream(
             self.STREAM_COMMAND_QUEUE,
-            f"{self.NAMESPACE}.*.cmd.queue"
+            f"{self.NAMESPACE}.*.cmd.queue",
+            retention='workqueue'
         )
-
-    async def _ensure_command_immediate_stream(self):
-        """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
         await self._ensure_stream(
             self.STREAM_COMMAND_IMMEDIATE,
             f"{self.NAMESPACE}.*.cmd.immediate"
         )
-
-    async def _ensure_response_queue_stream(self):
-        """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
         await self._ensure_stream(
             self.STREAM_RESPONSE_QUEUE,
             f"{self.NAMESPACE}.*.cmd.response.queue",
             retention='interest'
         )
-
-    async def _ensure_response_immediate_stream(self):
-        """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
         await self._ensure_stream(
             self.STREAM_RESPONSE_IMMEDIATE,
             f"{self.NAMESPACE}.*.cmd.response.immediate",
@@ -230,7 +224,17 @@ class MachineClient:

     async def _cleanup_subscriptions(self):
         """Unsubscribe from all subscriptions."""
-        # Clean up
+        # Clean up queue subscription (pull consumer)
+        if self._cmd_queue_task:
+            try:
+                self._cmd_queue_task.cancel()
+                await self._cmd_queue_task
+            except asyncio.CancelledError:
+                pass
+            except Exception:
+                pass
+            self._cmd_queue_task = None
+
         if self._cmd_queue_sub:
             try:
                 await self._cmd_queue_sub.unsubscribe()
@@ -252,6 +256,7 @@ class MachineClient:
         self.kv = None
         # Subscriptions will be recreated on reconnection
         self._cmd_queue_sub = None
+        self._cmd_queue_task = None
         self._cmd_immediate_sub = None

     # ==================== CONNECTION MANAGEMENT ====================
@@ -261,6 +266,7 @@ class MachineClient:
         try:
             self.nc = await nats.connect(
                 servers=self.servers,
+                connect_timeout=10,  # 10 seconds timeout for initial connection
                 reconnect_time_wait=2,
                 max_reconnect_attempts=-1,
                 error_cb=self._error_callback,
@@ -269,10 +275,7 @@ class MachineClient:
                 closed_cb=self._closed_callback
             )
             self.js = self.nc.jetstream()
-            await self.
-            await self._ensure_command_immediate_stream()
-            await self._ensure_response_queue_stream()
-            await self._ensure_response_immediate_stream()
+            await self._ensure_all_streams()
             self.kv = await self._get_or_create_kv_bucket()
             self._is_connected = True
             logger.info("Connected to NATS servers: %s", self.servers)
@@ -298,32 +301,16 @@ class MachineClient:

         if self.nc:
             self.js = self.nc.jetstream()
-            await self.
-            await self._ensure_command_immediate_stream()
-            await self._ensure_response_queue_stream()
-            await self._ensure_response_immediate_stream()
+            await self._ensure_all_streams()
             self.kv = await self._get_or_create_kv_bucket()
             await self._resubscribe_handlers()

     async def _resubscribe_handlers(self):
         """Re-subscribe to all handlers after reconnection."""
-
-
-
-
-
-        for handler_info in self._reconnect_handlers:
-            try:
-                handler_type = handler_info['type']
-                handler = handler_info['handler']
-                subscribe_method = subscribe_methods.get(handler_type)
-
-                if subscribe_method:
-                    await subscribe_method(handler)
-                else:
-                    logger.warning("Unknown handler type: %s", handler_type)
-            except Exception as e:
-                logger.error("Failed to re-subscribe %s: %s", handler_type, e)
+        if self._queue_handler:
+            await self.subscribe_queue(self._queue_handler)
+        if self._immediate_handler:
+            await self.subscribe_immediate(self._immediate_handler)

     async def _closed_callback(self):
         """Callback when connection is closed."""
@@ -438,7 +425,7 @@ class MachineClient:
     async def process_queue_cmd(
         self,
         msg: Msg,
-        handler: Callable[[
+        handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
     ) -> None:
         """
         Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
@@ -660,9 +647,54 @@ class MachineClient:
         )
         await self.publish_state({'state': 'error', 'run_id': None})

+    async def _verify_or_recreate_consumer(self, durable_name: str):
+        """
+        Check if consumer exists and verify/update its configuration.
+        Deletes and recreates the consumer if configuration doesn't match.
+
+        Args:
+            durable_name: Name of the durable consumer to verify
+        """
+        # Check if consumer exists and verify/update its configuration
+        try:
+            consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
+            logger.debug("Durable consumer %s already exists", durable_name)
+
+            # Check if consumer config matches what we need
+            config = consumer_info.config
+            needs_recreate = False
+            if getattr(config, 'filter_subject', None) != self.cmd_queue:
+                logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
+                               self.cmd_queue, getattr(config, 'filter_subject', None))
+                needs_recreate = True
+            if getattr(config, 'ack_policy', None) != 'explicit':
+                logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
+                               getattr(config, 'ack_policy', None))
+                needs_recreate = True
+            if getattr(config, 'deliver_policy', None) != 'all':
+                logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
+                               getattr(config, 'deliver_policy', None))
+                needs_recreate = True
+
+            if needs_recreate:
+                # Consumer exists but config doesn't match - delete and recreate
+                logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
+                try:
+                    await self.js.delete_consumer(self.STREAM_COMMAND_QUEUE, durable_name)
+                except Exception as e:
+                    logger.warning("Error deleting consumer: %s", e)
+            else:
+                # Log consumer state for diagnostics
+                logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
+                            consumer_info.num_pending, consumer_info.delivered.consumer_seq,
+                            consumer_info.num_ack_pending)
+        except NotFoundError:
+            # Consumer doesn't exist, will be created by pull_subscribe
+            logger.debug("Durable consumer %s does not exist, will be created", durable_name)
+
     async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
         """
-        Subscribe to queue commands with
+        Subscribe to queue commands with pull consumer.

         Args:
             handler: Async function that processes command payloads and returns CommandResponse
@@ -672,19 +704,65 @@ class MachineClient:
             return

         # Ensure stream exists before attempting to subscribe
-        await self.
+        await self._ensure_all_streams()

         try:
-
-
-
-
-
+            durable_name = f"cmd_queue_{self.machine_id}"
+
+            await self._verify_or_recreate_consumer(durable_name)
+
+            # Create pull subscription - this will create the consumer if it doesn't exist
+            # Pass config directly to ensure correct consumer configuration
+            consumer_config = ConsumerConfig(
+                durable_name=durable_name,
+                filter_subject=self.cmd_queue,
+                ack_policy="explicit",
+                deliver_policy="all",  # Required for WorkQueue: deliver all messages from the beginning
+            )
+
+            self._cmd_queue_sub = await self.js.pull_subscribe(
                 subject=self.cmd_queue,
+                durable=durable_name,
                 stream=self.STREAM_COMMAND_QUEUE,
-
-                cb=message_handler
+                config=consumer_config
             )
+
+            # Log final consumer info for diagnostics
+            try:
+                consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
+                logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
+                            self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
+                            consumer_info.num_pending, consumer_info.num_ack_pending)
+            except Exception as e:
+                logger.warning("Could not get consumer info after subscription: %s", e)
+                logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
+                            self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
+
+            # Start background task to pull and process messages
+            async def pull_messages():
+                """Continuously pull messages from the queue."""
+                try:
+                    while True:
+                        try:
+                            # Fetch messages (batch of 1, timeout 1 second)
+                            msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
+                            if msgs:
+                                logger.debug("Pulled %d message(s) from queue", len(msgs))
+                                for msg in msgs:
+                                    await self.process_queue_cmd(msg, handler)
+                        except asyncio.TimeoutError:
+                            # Timeout is expected when no messages are available
+                            continue
+                        except Exception as e:
+                            logger.error("Error pulling queue messages: %s", e, exc_info=True)
+                            await asyncio.sleep(1)  # Wait before retrying
+                except asyncio.CancelledError:
+                    logger.debug("Queue pull task cancelled")
+                    raise
+
+            self._cmd_queue_task = asyncio.create_task(pull_messages())
+            logger.info("Started background task for pulling queue messages")
+
         except NotFoundError:
             # Stream still not found after ensuring it exists - this shouldn't happen
             # but handle it gracefully with detailed diagnostics
@@ -702,10 +780,9 @@ class MachineClient:
             logger.error("  Stream verification failed: %s", stream_check_error)
             raise

-        #
-
-
-        logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
+        # Store handler for reconnection
+        self._queue_handler = handler
+        logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
                     self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)

     async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
@@ -719,19 +796,26 @@ class MachineClient:
             logger.error("JetStream not available for immediate subscription")
             return

+        # Store handler for use in callback and reconnection
+        self._immediate_handler = handler
+
         async def message_handler(msg: Msg):
-            """
-            await self.process_immediate_cmd(msg,
+            """Process immediate messages using stored handler."""
+            await self.process_immediate_cmd(msg, self._immediate_handler)

         # Ensure stream exists before attempting to subscribe
-        await self.
+        await self._ensure_stream(
+            self.STREAM_COMMAND_IMMEDIATE,
+            f"{self.NAMESPACE}.*.cmd.immediate",
+            retention='workqueue'
+        )

         try:
             self._cmd_immediate_sub = await self.js.subscribe(
                 subject=self.cmd_immediate,
                 stream=self.STREAM_COMMAND_IMMEDIATE,
                 durable=f"cmd_immed_{self.machine_id}",
-                cb=message_handler
+                cb=message_handler  # required for push consumer to handle messages
             )
         except NotFoundError:
             # Stream still not found after ensuring it exists - this shouldn't happen
@@ -740,9 +824,6 @@ class MachineClient:
                         self.STREAM_COMMAND_IMMEDIATE)
             raise

-        # Register handler for reconnection
-        if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
-            self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
         logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
                     self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
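On the machine side, machine_client.py switches queue-command consumption from a push subscription to a durable pull consumer (`cmd_queue_<machine_id>`) whose configuration is verified and, if mismatched, recreated, with a background task fetching one message at a time. A sketch of wiring up a handler follows; the `MachineClient` constructor arguments, the `connect()` method name, and the `CommandResponse` fields are illustrative assumptions, not confirmed by this diff:

```python
import asyncio

from puda_comms.machine_client import MachineClient
from puda_comms.models import NATSMessage, CommandResponse

async def handle_queue_command(message: NATSMessage) -> CommandResponse:
    # The handler receives the parsed NATSMessage and must return a CommandResponse.
    print(f"run_id={message.header.run_id} user={message.header.username}")
    return CommandResponse(status="success")  # hypothetical field/value

async def main() -> None:
    # Constructor arguments and connect() are assumed names.
    client = MachineClient(machine_id="first", servers=["nats://192.168.50.201:4222"])
    await client.connect()

    # 0.0.4: verifies/creates the durable pull consumer and starts the background
    # fetch task that feeds each pulled message to process_queue_cmd().
    await client.subscribe_queue(handle_queue_command)

    await asyncio.Event().wait()  # keep the client alive until cancelled

asyncio.run(main())
```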
puda_comms/models.py
CHANGED
@@ -68,18 +68,19 @@ class CommandResponse(BaseModel):

 class MessageHeader(BaseModel):
     """Header for NATS messages."""
-    message_type: MessageType = Field(description="Type of message")
     version: str = Field(default="1.0", description="Message version")
-
+    message_type: MessageType = Field(description="Type of message")
+    user_id: str = Field(description="User ID")
+    username: str = Field(description="User name")
     machine_id: str = Field(description="Machine ID")
     run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
-
+    timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
 class NATSMessage(BaseModel):
     """
     Complete NATS message structure.

     Structure:
-    - header: MessageHeader with message_type, version, timestamp, machine_id, run_id
+    - header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
     - command: Optional CommandRequest (for command messages)
     - response: Optional CommandResponse data (for response messages)
     """
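The models.py change makes `user_id` and `username` required header fields and moves `timestamp` to a default factory, so only the identifying fields need to be supplied. A small construction sketch, assuming Pydantic v2 (the existing code already uses `model_validate`); values are illustrative:

```python
from puda_comms.models import MessageHeader, MessageType

header = MessageHeader(
    message_type=MessageType.COMMAND,
    user_id="user123",    # new required field in 0.0.4
    username="John Doe",  # new required field in 0.0.4
    machine_id="first",
    run_id="092073e6-13d0-4756-8d99-eff1612a5a72",
    # version defaults to "1.0"; timestamp is auto-set via default_factory
)
print(header.model_dump())
```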
{puda_comms-0.0.2.dist-info → puda_comms-0.0.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: puda-comms
-Version: 0.0.2
+Version: 0.0.4
 Summary: Communication library for the PUDA platform.
 Author: zhao
 Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
 - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
 - `version` (str): Message version (default: "1.0")
 - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
+- `user_id` (str): User ID who initiated the command
+- `username` (str): Username who initiated the command
 - `machine_id` (str): Identifier for the target machine
 - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow

@@ -130,6 +132,8 @@ header = MessageHeader(
     message_type=MessageType.RESPONSE,
     version="1.0",
     timestamp="2026-01-20T02:00:46Z",
+    user_id="user123",
+    username="John Doe",
     machine_id="first",
     run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
 )
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
     "message_type": "response",
     "version": "1.0",
     "timestamp": "2026-01-20T02:00:46Z",
+    "user_id": "user123",
+    "username": "John Doe",
     "machine_id": "first",
     "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
 },
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
     request=request,
     machine_id="first",
     run_id=run_id,
+    user_id="user123",
+    username="John Doe",
     timeout=60  # Wait up to 60 seconds
 )

@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
     requests=commands,
     machine_id="first",
     run_id=run_id,
+    user_id="user123",
+    username="John Doe",
     timeout=60  # Wait up to 60 seconds per command
 )
 ```
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
 reply: NATSMessage = await service.send_queue_command(
     request=request,
     machine_id="first",
-    run_id=run_id
+    run_id=run_id,
+    user_id="user123",
+    username="John Doe"
 )

 if reply is None:
@@ -292,14 +304,30 @@ else:

 ### Configuration

-
+#### NATS Server Configuration
+
+The `CommandService` requires NATS server URLs to be specified explicitly. There are no default values. You must provide servers in one of two ways:
+
+**Option 1: Via environment variable (comma-separated string)**
+
+Set the `NATS_SERVERS` environment variable with comma-separated server URLs:
+
+```bash
+export NATS_SERVERS="nats://192.168.50.201:4222,nats://192.168.50.201:4223,nats://192.168.50.201:4224"
 ```
-
+
+Then parse it when creating a `CommandService`:
+```python
+import os
+nats_servers = [s.strip() for s in os.getenv("NATS_SERVERS", "").split(",") if s.strip()]
+service = CommandService(servers=nats_servers)
 ```

-
+**Option 2: Directly as a list**
+
+Specify servers directly when creating a `CommandService`:
 ```python
-service = CommandService(servers=["nats://
+service = CommandService(servers=["nats://192.168.50.201:4222", "nats://192.168.50.201:4223", "nats://192.168.50.201:4224"])
 ```
 ## Validation

puda_comms-0.0.4.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
+puda_comms/command_service.py,sha256=KFremcEGfsTeUVQMIhyk1knYmUCvRYQ12vS_jy_14wA,25193
+puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
+puda_comms/machine_client.py,sha256=wj6t_QHGs7l1Oc8JQ6hq2hqBd5C14TCPA_dTU9qOLzw,37430
+puda_comms/models.py,sha256=9ZGX0PR7SgMBOL5zVLrPuSUhZqutQU96PubyjyQLhf8,3617
+puda_comms-0.0.4.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
+puda_comms-0.0.4.dist-info/METADATA,sha256=0cMHDub_3NZt7Cj5U1jzrQXI8atQqpMM-i3vSMrT5lo,11512
+puda_comms-0.0.4.dist-info/RECORD,,

puda_comms-0.0.2.dist-info/RECORD
REMOVED
@@ -1,8 +0,0 @@
-puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
-puda_comms/command_service.py,sha256=B4fKiQNF0slvGS1fXVoh5UZax_-xk4IS-KT96teSRfg,23272
-puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
-puda_comms/machine_client.py,sha256=F2i0BYBuOLjKAnfZAblNrb3Lzs0yhEO1d4XA-k_dkIU,33039
-puda_comms/models.py,sha256=cVH5uKzyLmjzPeBcm3RIJMTkoynmxqe_P26GtZwlIN8,3500
-puda_comms-0.0.2.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
-puda_comms-0.0.2.dist-info/METADATA,sha256=jHHcSSmdWOykobTsieX2bqDeRtqSaqdUd-xZeeWxJZ8,10585
-puda_comms-0.0.2.dist-info/RECORD,,

{puda_comms-0.0.2.dist-info → puda_comms-0.0.4.dist-info}/WHEEL
File without changes