puda-comms 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puda_comms/command_service.py +233 -86
- puda_comms/machine_client.py +269 -137
- puda_comms/models.py +8 -4
- puda_comms/run_manager.py +112 -0
- {puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/METADATA +14 -2
- puda_comms-0.0.5.dist-info/RECORD +9 -0
- puda_comms-0.0.3.dist-info/RECORD +0 -8
- {puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/WHEEL +0 -0
puda_comms/command_service.py
CHANGED
|
@@ -12,11 +12,17 @@ import json
|
|
|
12
12
|
import logging
|
|
13
13
|
import signal
|
|
14
14
|
from datetime import datetime, timezone
|
|
15
|
-
from typing import Dict, Any, Optional
|
|
15
|
+
from typing import Dict, Any, Optional
|
|
16
16
|
import nats
|
|
17
17
|
from nats.js.client import JetStreamContext
|
|
18
18
|
from nats.aio.msg import Msg
|
|
19
|
-
from puda_comms.models import
|
|
19
|
+
from puda_comms.models import (
|
|
20
|
+
CommandRequest,
|
|
21
|
+
CommandResponseStatus,
|
|
22
|
+
NATSMessage,
|
|
23
|
+
MessageHeader,
|
|
24
|
+
MessageType,
|
|
25
|
+
)
|
|
20
26
|
|
|
21
27
|
logger = logging.getLogger(__name__)
|
|
22
28
|
|
|
@@ -37,7 +43,7 @@ class ResponseHandler:
|
|
|
37
43
|
def __init__(self, js: JetStreamContext, machine_id: str):
|
|
38
44
|
self.js = js
|
|
39
45
|
self.machine_id = machine_id
|
|
40
|
-
self._pending_responses: Dict[str,
|
|
46
|
+
self._pending_responses: Dict[str, Dict[str, Any]] = {} # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
|
|
41
47
|
self._queue_consumer = None
|
|
42
48
|
self._immediate_consumer = None
|
|
43
49
|
self._initialized = False
|
|
@@ -102,8 +108,8 @@ class ResponseHandler:
|
|
|
102
108
|
|
|
103
109
|
# Get the pending response
|
|
104
110
|
pending = self._pending_responses[key]
|
|
105
|
-
# Store the
|
|
106
|
-
pending['response'] = message
|
|
111
|
+
# Store the NATSMessage directly
|
|
112
|
+
pending['response'] = message
|
|
107
113
|
# Signal that response was received
|
|
108
114
|
# Don't delete here - let get_response() delete it after retrieval
|
|
109
115
|
pending['event'].set()
|
|
@@ -152,7 +158,7 @@ class ResponseHandler:
|
|
|
152
158
|
}
|
|
153
159
|
return event
|
|
154
160
|
|
|
155
|
-
def get_response(self, run_id: str, step_number: int) -> Optional[
|
|
161
|
+
def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
|
|
156
162
|
"""
|
|
157
163
|
Get the response for a pending command.
|
|
158
164
|
|
|
@@ -161,7 +167,7 @@ class ResponseHandler:
|
|
|
161
167
|
step_number: Step number for the command
|
|
162
168
|
|
|
163
169
|
Returns:
|
|
164
|
-
The NATSMessage
|
|
170
|
+
The NATSMessage if available, None otherwise
|
|
165
171
|
"""
|
|
166
172
|
key = f"{run_id}:{str(step_number)}"
|
|
167
173
|
if key in self._pending_responses:
|
|
@@ -266,9 +272,9 @@ class CommandService:
|
|
|
266
272
|
max_attempts = 3
|
|
267
273
|
connect_timeout = 3 # 3 seconds timeout per connection attempt
|
|
268
274
|
|
|
269
|
-
for attempt in range(
|
|
275
|
+
for attempt in range(max_attempts):
|
|
270
276
|
try:
|
|
271
|
-
logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
|
|
277
|
+
logger.info("Connection attempt %d/%d to NATS servers: %s", attempt + 1, max_attempts, self.servers)
|
|
272
278
|
self.nc = await asyncio.wait_for(
|
|
273
279
|
nats.connect(
|
|
274
280
|
servers=self.servers,
|
|
@@ -285,14 +291,14 @@ class CommandService:
|
|
|
285
291
|
return True
|
|
286
292
|
|
|
287
293
|
except asyncio.TimeoutError:
|
|
288
|
-
logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
|
|
289
|
-
if attempt < max_attempts:
|
|
294
|
+
logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt + 1, max_attempts, connect_timeout)
|
|
295
|
+
if attempt < max_attempts - 1:
|
|
290
296
|
logger.info("Retrying connection...")
|
|
291
297
|
else:
|
|
292
298
|
logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
|
|
293
299
|
except Exception as e:
|
|
294
|
-
logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
|
|
295
|
-
if attempt < max_attempts:
|
|
300
|
+
logger.warning("Connection attempt %d/%d failed: %s", attempt + 1, max_attempts, e)
|
|
301
|
+
if attempt < max_attempts - 1:
|
|
296
302
|
logger.info("Retrying connection...")
|
|
297
303
|
else:
|
|
298
304
|
logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
|
|
@@ -343,6 +349,8 @@ class CommandService:
|
|
|
343
349
|
request: CommandRequest,
|
|
344
350
|
machine_id: str,
|
|
345
351
|
run_id: str,
|
|
352
|
+
user_id: str,
|
|
353
|
+
username: str,
|
|
346
354
|
timeout: int = 120
|
|
347
355
|
) -> Optional[NATSMessage]:
|
|
348
356
|
"""
|
|
@@ -352,6 +360,8 @@ class CommandService:
|
|
|
352
360
|
request: CommandRequest model containing command details
|
|
353
361
|
machine_id: Machine ID to send the command to
|
|
354
362
|
run_id: Run ID for the command
|
|
363
|
+
user_id: User ID who initiated the command
|
|
364
|
+
username: Username who initiated the command
|
|
355
365
|
timeout: Maximum time to wait for response in seconds
|
|
356
366
|
|
|
357
367
|
Returns:
|
|
@@ -364,8 +374,8 @@ class CommandService:
|
|
|
364
374
|
subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
|
|
365
375
|
|
|
366
376
|
logger.info(
|
|
367
|
-
"Sending queue command:
|
|
368
|
-
|
|
377
|
+
"Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
|
|
378
|
+
subject, request.name, run_id, request.step_number
|
|
369
379
|
)
|
|
370
380
|
|
|
371
381
|
# Get or create response handler for this machine
|
|
@@ -374,7 +384,7 @@ class CommandService:
|
|
|
374
384
|
response_event = response_handler.register_pending(run_id, request.step_number)
|
|
375
385
|
|
|
376
386
|
# Build payload
|
|
377
|
-
payload = self._build_command_payload(request, machine_id, run_id)
|
|
387
|
+
payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
|
|
378
388
|
|
|
379
389
|
try:
|
|
380
390
|
# Publish to JetStream
|
|
@@ -397,36 +407,107 @@ class CommandService:
|
|
|
397
407
|
await asyncio.sleep(0.1)
|
|
398
408
|
|
|
399
409
|
# Get the response
|
|
400
|
-
|
|
401
|
-
if response_data is None:
|
|
402
|
-
return None
|
|
403
|
-
|
|
404
|
-
return NATSMessage.model_validate(response_data)
|
|
410
|
+
return response_handler.get_response(run_id, request.step_number)
|
|
405
411
|
|
|
406
412
|
except Exception as e:
|
|
407
413
|
logger.error("Error sending queue command: %s", e)
|
|
408
414
|
response_handler.remove_pending(run_id, request.step_number)
|
|
409
415
|
return None
|
|
410
416
|
|
|
417
|
+
async def start_run(
|
|
418
|
+
self,
|
|
419
|
+
machine_id: str,
|
|
420
|
+
run_id: str,
|
|
421
|
+
user_id: str,
|
|
422
|
+
username: str,
|
|
423
|
+
timeout: int = 120
|
|
424
|
+
) -> Optional[NATSMessage]:
|
|
425
|
+
"""
|
|
426
|
+
Send START immediate command to begin a run.
|
|
427
|
+
|
|
428
|
+
Args:
|
|
429
|
+
machine_id: Machine ID to send the command to
|
|
430
|
+
run_id: Run ID for the command
|
|
431
|
+
user_id: User ID who initiated the command
|
|
432
|
+
username: Username who initiated the command
|
|
433
|
+
timeout: Maximum time to wait for response in seconds
|
|
434
|
+
|
|
435
|
+
Returns:
|
|
436
|
+
NATSMessage if successful, None if failed or timeout
|
|
437
|
+
"""
|
|
438
|
+
request = CommandRequest(
|
|
439
|
+
name="start",
|
|
440
|
+
params={},
|
|
441
|
+
step_number=0
|
|
442
|
+
)
|
|
443
|
+
return await self.send_immediate_command(
|
|
444
|
+
request=request,
|
|
445
|
+
machine_id=machine_id,
|
|
446
|
+
run_id=run_id,
|
|
447
|
+
user_id=user_id,
|
|
448
|
+
username=username,
|
|
449
|
+
timeout=timeout
|
|
450
|
+
)
|
|
451
|
+
|
|
452
|
+
async def complete_run(
|
|
453
|
+
self,
|
|
454
|
+
machine_id: str,
|
|
455
|
+
run_id: str,
|
|
456
|
+
user_id: str,
|
|
457
|
+
username: str,
|
|
458
|
+
timeout: int = 120
|
|
459
|
+
) -> Optional[NATSMessage]:
|
|
460
|
+
"""
|
|
461
|
+
Send COMPLETE immediate command to end a run.
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
machine_id: Machine ID to send the command to
|
|
465
|
+
run_id: Run ID for the command
|
|
466
|
+
user_id: User ID who initiated the command
|
|
467
|
+
username: Username who initiated the command
|
|
468
|
+
timeout: Maximum time to wait for response in seconds
|
|
469
|
+
|
|
470
|
+
Returns:
|
|
471
|
+
NATSMessage if successful, None if failed or timeout
|
|
472
|
+
"""
|
|
473
|
+
request = CommandRequest(
|
|
474
|
+
name="complete",
|
|
475
|
+
params={},
|
|
476
|
+
step_number=0
|
|
477
|
+
)
|
|
478
|
+
return await self.send_immediate_command(
|
|
479
|
+
request=request,
|
|
480
|
+
machine_id=machine_id,
|
|
481
|
+
run_id=run_id,
|
|
482
|
+
user_id=user_id,
|
|
483
|
+
username=username,
|
|
484
|
+
timeout=timeout
|
|
485
|
+
)
|
|
486
|
+
|
|
411
487
|
async def send_queue_commands(
|
|
412
488
|
self,
|
|
413
489
|
*,
|
|
414
490
|
requests: list[CommandRequest],
|
|
415
491
|
machine_id: str,
|
|
416
492
|
run_id: str,
|
|
493
|
+
user_id: str,
|
|
494
|
+
username: str,
|
|
417
495
|
timeout: int = 120
|
|
418
496
|
) -> Optional[NATSMessage]:
|
|
419
497
|
"""
|
|
420
498
|
Send multiple queue commands sequentially and wait for responses.
|
|
421
499
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
If
|
|
500
|
+
Automatically sends START command before the sequence and COMPLETE command after
|
|
501
|
+
successful completion. Sends commands one by one, waiting for each response before
|
|
502
|
+
sending the next. If any command fails or times out, stops immediately and returns
|
|
503
|
+
the error response. If all commands succeed, returns the last command's response.
|
|
425
504
|
|
|
426
505
|
Args:
|
|
427
506
|
requests: List of CommandRequest models to send sequentially
|
|
428
507
|
machine_id: Machine ID to send the commands to
|
|
429
508
|
run_id: Run ID for all commands
|
|
509
|
+
user_id: User ID who initiated the commands
|
|
510
|
+
username: Username who initiated the commands
|
|
430
511
|
timeout: Maximum time to wait for each response in seconds
|
|
431
512
|
|
|
432
513
|
Returns:
|
|
@@ -447,74 +528,131 @@ class CommandService:
|
|
|
447
528
|
run_id
|
|
448
529
|
)
|
|
449
530
|
|
|
531
|
+
# Always send START command before sequence
|
|
532
|
+
logger.info("Sending START command before sequence")
|
|
533
|
+
start_response = await self.start_run(
|
|
534
|
+
machine_id=machine_id,
|
|
535
|
+
run_id=run_id,
|
|
536
|
+
user_id=user_id,
|
|
537
|
+
username=username,
|
|
538
|
+
timeout=timeout
|
|
539
|
+
)
|
|
540
|
+
if start_response is None:
|
|
541
|
+
logger.error("START command timed out")
|
|
542
|
+
return None
|
|
543
|
+
if start_response.response and start_response.response.status == CommandResponseStatus.ERROR:
|
|
544
|
+
logger.error("START command failed: %s", start_response.response.message)
|
|
545
|
+
return start_response
|
|
546
|
+
|
|
450
547
|
last_response: Optional[NATSMessage] = None
|
|
451
548
|
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
request
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
request=request,
|
|
463
|
-
machine_id=machine_id,
|
|
464
|
-
run_id=run_id,
|
|
465
|
-
timeout=timeout
|
|
466
|
-
)
|
|
467
|
-
|
|
468
|
-
# Check if command failed (None means timeout or exception)
|
|
469
|
-
if response is None:
|
|
470
|
-
logger.error(
|
|
471
|
-
"Command %d/%d failed or timed out: %s (step %s)",
|
|
549
|
+
try:
|
|
550
|
+
for idx, request in enumerate(requests, start=1):
|
|
551
|
+
# Validate request - convert dict to CommandRequest if needed
|
|
552
|
+
if isinstance(request, dict):
|
|
553
|
+
request = CommandRequest.model_validate(request)
|
|
554
|
+
elif not isinstance(request, CommandRequest):
|
|
555
|
+
raise ValueError(f"Request {idx} must be a CommandRequest or dict, got {type(request)}")
|
|
556
|
+
|
|
557
|
+
logger.info(
|
|
558
|
+
"Sending command %d/%d: %s (step %s)",
|
|
472
559
|
idx,
|
|
473
560
|
len(requests),
|
|
474
561
|
request.name,
|
|
475
562
|
request.step_number
|
|
476
563
|
)
|
|
477
|
-
return None
|
|
478
564
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
565
|
+
response = await self.send_queue_command(
|
|
566
|
+
request=request,
|
|
567
|
+
machine_id=machine_id,
|
|
568
|
+
run_id=run_id,
|
|
569
|
+
user_id=user_id,
|
|
570
|
+
username=username,
|
|
571
|
+
timeout=timeout
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
# Check if command failed (None means timeout or exception)
|
|
575
|
+
if response is None:
|
|
482
576
|
logger.error(
|
|
483
|
-
"Command %d/%d failed
|
|
577
|
+
"Command %d/%d failed or timed out: %s (step %s)",
|
|
578
|
+
idx,
|
|
579
|
+
len(requests),
|
|
580
|
+
request.name,
|
|
581
|
+
request.step_number
|
|
582
|
+
)
|
|
583
|
+
return None
|
|
584
|
+
|
|
585
|
+
# Check if command returned an error status
|
|
586
|
+
if response.response is not None:
|
|
587
|
+
if response.response.status == CommandResponseStatus.ERROR:
|
|
588
|
+
logger.error(
|
|
589
|
+
"Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
|
|
590
|
+
idx,
|
|
591
|
+
len(requests),
|
|
592
|
+
request.name,
|
|
593
|
+
request.step_number,
|
|
594
|
+
response.response.code,
|
|
595
|
+
response.response.message
|
|
596
|
+
)
|
|
597
|
+
return response
|
|
598
|
+
|
|
599
|
+
# Command succeeded, store as last response
|
|
600
|
+
last_response = response
|
|
601
|
+
logger.info(
|
|
602
|
+
"Command %d/%d succeeded: %s (step %s)",
|
|
484
603
|
idx,
|
|
485
604
|
len(requests),
|
|
486
605
|
request.name,
|
|
487
|
-
request.step_number
|
|
488
|
-
|
|
489
|
-
|
|
606
|
+
request.step_number
|
|
607
|
+
)
|
|
608
|
+
else:
|
|
609
|
+
# Response exists but has no response data (shouldn't happen, but handle it)
|
|
610
|
+
logger.warning(
|
|
611
|
+
"Command %d/%d returned response with no response data: %s (step %s)",
|
|
612
|
+
idx,
|
|
613
|
+
len(requests),
|
|
614
|
+
request.name,
|
|
615
|
+
request.step_number
|
|
490
616
|
)
|
|
491
617
|
return response
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
618
|
+
|
|
619
|
+
logger.info(
|
|
620
|
+
"All %d commands completed successfully",
|
|
621
|
+
len(requests)
|
|
622
|
+
)
|
|
623
|
+
|
|
624
|
+
# Always send COMPLETE command after successful sequence
|
|
625
|
+
logger.info("Sending COMPLETE command after successful sequence")
|
|
626
|
+
complete_response = await self.complete_run(
|
|
627
|
+
machine_id=machine_id,
|
|
628
|
+
run_id=run_id,
|
|
629
|
+
user_id=user_id,
|
|
630
|
+
username=username,
|
|
631
|
+
timeout=timeout
|
|
632
|
+
)
|
|
633
|
+
if complete_response is None:
|
|
634
|
+
logger.error("COMPLETE command timed out")
|
|
635
|
+
return None
|
|
636
|
+
if complete_response.response and complete_response.response.status == CommandResponseStatus.ERROR:
|
|
637
|
+
logger.error("COMPLETE command failed: %s", complete_response.response.message)
|
|
638
|
+
return complete_response
|
|
639
|
+
# Return the last command response, not the COMPLETE response
|
|
640
|
+
return last_response
|
|
641
|
+
except Exception as e:
|
|
642
|
+
# If any error occurs during command execution, try to complete the run
|
|
643
|
+
# to clean up state (but don't fail if this also fails)
|
|
644
|
+
logger.warning("Error during command sequence, attempting to complete run: %s", e)
|
|
645
|
+
try:
|
|
646
|
+
await self.complete_run(
|
|
647
|
+
machine_id=machine_id,
|
|
648
|
+
run_id=run_id,
|
|
649
|
+
user_id=user_id,
|
|
650
|
+
username=username,
|
|
651
|
+
timeout=timeout
|
|
510
652
|
)
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
"All %d commands completed successfully",
|
|
515
|
-
len(requests)
|
|
516
|
-
)
|
|
517
|
-
return last_response
|
|
653
|
+
except Exception as cleanup_error:
|
|
654
|
+
logger.error("Failed to complete run during error cleanup: %s", cleanup_error)
|
|
655
|
+
raise
|
|
518
656
|
|
|
519
657
|
async def send_immediate_command(
|
|
520
658
|
self,
|
|
@@ -522,6 +660,8 @@ class CommandService:
|
|
|
522
660
|
request: CommandRequest,
|
|
523
661
|
machine_id: str,
|
|
524
662
|
run_id: str,
|
|
663
|
+
user_id: str,
|
|
664
|
+
username: str,
|
|
525
665
|
timeout: int = 120
|
|
526
666
|
) -> Optional[NATSMessage]:
|
|
527
667
|
"""
|
|
@@ -531,6 +671,8 @@ class CommandService:
|
|
|
531
671
|
request: CommandRequest model containing command details
|
|
532
672
|
machine_id: Machine ID to send the command to
|
|
533
673
|
run_id: Run ID for the command
|
|
674
|
+
user_id: User ID who initiated the command
|
|
675
|
+
username: Username who initiated the command
|
|
534
676
|
timeout: Maximum time to wait for response in seconds
|
|
535
677
|
|
|
536
678
|
Returns:
|
|
@@ -555,7 +697,7 @@ class CommandService:
|
|
|
555
697
|
response_received = response_handler.register_pending(run_id, request.step_number)
|
|
556
698
|
|
|
557
699
|
# Build payload
|
|
558
|
-
payload = self._build_command_payload(request, machine_id, run_id)
|
|
700
|
+
payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
|
|
559
701
|
|
|
560
702
|
try:
|
|
561
703
|
# Publish to JetStream
|
|
@@ -578,11 +720,7 @@ class CommandService:
|
|
|
578
720
|
await asyncio.sleep(0.1)
|
|
579
721
|
|
|
580
722
|
# Get the response
|
|
581
|
-
|
|
582
|
-
if response_data is None:
|
|
583
|
-
return None
|
|
584
|
-
|
|
585
|
-
return NATSMessage.model_validate(response_data)
|
|
723
|
+
return response_handler.get_response(run_id, request.step_number)
|
|
586
724
|
|
|
587
725
|
except Exception as e:
|
|
588
726
|
logger.error("Error sending immediate command: %s", e)
|
|
@@ -635,7 +773,9 @@ class CommandService:
|
|
|
635
773
|
self,
|
|
636
774
|
command_request: CommandRequest,
|
|
637
775
|
machine_id: str,
|
|
638
|
-
run_id: str
|
|
776
|
+
run_id: str,
|
|
777
|
+
user_id: str,
|
|
778
|
+
username: str
|
|
639
779
|
) -> NATSMessage:
|
|
640
780
|
"""
|
|
641
781
|
Build a command payload in the expected format.
|
|
@@ -643,17 +783,24 @@ class CommandService:
|
|
|
643
783
|
Args:
|
|
644
784
|
command_request: CommandRequest model containing command details
|
|
645
785
|
machine_id: Machine ID for the command
|
|
646
|
-
run_id: Run ID for the command
|
|
786
|
+
run_id: Run ID for the command (empty string will be converted to None)
|
|
787
|
+
user_id: User ID who initiated the command
|
|
788
|
+
username: Username who initiated the command
|
|
647
789
|
|
|
648
790
|
Returns:
|
|
649
791
|
NATSMessage object ready for NATS transmission
|
|
650
792
|
"""
|
|
793
|
+
# Convert empty string to None for run_id
|
|
794
|
+
run_id_value = run_id if run_id else None
|
|
795
|
+
|
|
651
796
|
header = MessageHeader(
|
|
652
797
|
message_type=MessageType.COMMAND,
|
|
653
798
|
version="1.0",
|
|
654
799
|
timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
|
|
800
|
+
user_id=user_id,
|
|
801
|
+
username=username,
|
|
655
802
|
machine_id=machine_id,
|
|
656
|
-
run_id=
|
|
803
|
+
run_id=run_id_value
|
|
657
804
|
)
|
|
658
805
|
|
|
659
806
|
return NATSMessage(
|
puda_comms/machine_client.py
CHANGED
|
@@ -19,8 +19,9 @@ from puda_comms.models import (
|
|
|
19
19
|
MessageType,
|
|
20
20
|
ImmediateCommand,
|
|
21
21
|
)
|
|
22
|
+
from puda_comms.run_manager import RunManager
|
|
22
23
|
from nats.js.client import JetStreamContext
|
|
23
|
-
from nats.js.api import StreamConfig
|
|
24
|
+
from nats.js.api import StreamConfig, ConsumerConfig
|
|
24
25
|
from nats.js.errors import NotFoundError
|
|
25
26
|
from nats.aio.msg import Msg
|
|
26
27
|
|
|
@@ -69,16 +70,20 @@ class MachineClient:
|
|
|
69
70
|
|
|
70
71
|
# Default subscriptions
|
|
71
72
|
self._cmd_queue_sub = None
|
|
73
|
+
self._cmd_queue_task = None # Background task for pull consumer
|
|
72
74
|
self._cmd_immediate_sub = None
|
|
73
75
|
|
|
74
76
|
# Connection state
|
|
75
77
|
self._is_connected = False
|
|
76
|
-
self.
|
|
78
|
+
self._queue_handler = None
|
|
79
|
+
self._immediate_handler = None
|
|
77
80
|
|
|
78
81
|
# Queue control state
|
|
79
82
|
self._pause_lock = asyncio.Lock()
|
|
80
83
|
self._is_paused = False
|
|
81
|
-
|
|
84
|
+
|
|
85
|
+
# Run state management
|
|
86
|
+
self.run_manager = RunManager(machine_id=machine_id)
|
|
82
87
|
|
|
83
88
|
def _init_subjects(self):
|
|
84
89
|
"""Initialize all subject and stream names."""
|
|
@@ -184,30 +189,22 @@ class MachineClient:
|
|
|
184
189
|
logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
|
|
185
190
|
raise
|
|
186
191
|
|
|
187
|
-
async def
|
|
188
|
-
"""Ensure
|
|
192
|
+
async def _ensure_all_streams(self):
|
|
193
|
+
"""Ensure all required streams exist with correct retention policies."""
|
|
189
194
|
await self._ensure_stream(
|
|
190
195
|
self.STREAM_COMMAND_QUEUE,
|
|
191
|
-
f"{self.NAMESPACE}.*.cmd.queue"
|
|
196
|
+
f"{self.NAMESPACE}.*.cmd.queue",
|
|
197
|
+
retention='workqueue'
|
|
192
198
|
)
|
|
193
|
-
|
|
194
|
-
async def _ensure_command_immediate_stream(self):
|
|
195
|
-
"""Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
|
|
196
199
|
await self._ensure_stream(
|
|
197
200
|
self.STREAM_COMMAND_IMMEDIATE,
|
|
198
201
|
f"{self.NAMESPACE}.*.cmd.immediate"
|
|
199
202
|
)
|
|
200
|
-
|
|
201
|
-
async def _ensure_response_queue_stream(self):
|
|
202
|
-
"""Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
|
|
203
203
|
await self._ensure_stream(
|
|
204
204
|
self.STREAM_RESPONSE_QUEUE,
|
|
205
205
|
f"{self.NAMESPACE}.*.cmd.response.queue",
|
|
206
206
|
retention='interest'
|
|
207
207
|
)
|
|
208
|
-
|
|
209
|
-
async def _ensure_response_immediate_stream(self):
|
|
210
|
-
"""Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
|
|
211
208
|
await self._ensure_stream(
|
|
212
209
|
self.STREAM_RESPONSE_IMMEDIATE,
|
|
213
210
|
f"{self.NAMESPACE}.*.cmd.response.immediate",
|
|
@@ -230,7 +227,17 @@ class MachineClient:
|
|
|
230
227
|
|
|
231
228
|
async def _cleanup_subscriptions(self):
|
|
232
229
|
"""Unsubscribe from all subscriptions."""
|
|
233
|
-
# Clean up
|
|
230
|
+
# Clean up queue subscription (pull consumer)
|
|
231
|
+
if self._cmd_queue_task:
|
|
232
|
+
try:
|
|
233
|
+
self._cmd_queue_task.cancel()
|
|
234
|
+
await self._cmd_queue_task
|
|
235
|
+
except asyncio.CancelledError:
|
|
236
|
+
pass
|
|
237
|
+
except Exception:
|
|
238
|
+
pass
|
|
239
|
+
self._cmd_queue_task = None
|
|
240
|
+
|
|
234
241
|
if self._cmd_queue_sub:
|
|
235
242
|
try:
|
|
236
243
|
await self._cmd_queue_sub.unsubscribe()
|
|
@@ -252,6 +259,7 @@ class MachineClient:
|
|
|
252
259
|
self.kv = None
|
|
253
260
|
# Subscriptions will be recreated on reconnection
|
|
254
261
|
self._cmd_queue_sub = None
|
|
262
|
+
self._cmd_queue_task = None
|
|
255
263
|
self._cmd_immediate_sub = None
|
|
256
264
|
|
|
257
265
|
# ==================== CONNECTION MANAGEMENT ====================
|
|
@@ -270,10 +278,7 @@ class MachineClient:
|
|
|
270
278
|
closed_cb=self._closed_callback
|
|
271
279
|
)
|
|
272
280
|
self.js = self.nc.jetstream()
|
|
273
|
-
await self.
|
|
274
|
-
await self._ensure_command_immediate_stream()
|
|
275
|
-
await self._ensure_response_queue_stream()
|
|
276
|
-
await self._ensure_response_immediate_stream()
|
|
281
|
+
await self._ensure_all_streams()
|
|
277
282
|
self.kv = await self._get_or_create_kv_bucket()
|
|
278
283
|
self._is_connected = True
|
|
279
284
|
logger.info("Connected to NATS servers: %s", self.servers)
|
|
@@ -299,32 +304,16 @@ class MachineClient:
|
|
|
299
304
|
|
|
300
305
|
if self.nc:
|
|
301
306
|
self.js = self.nc.jetstream()
|
|
302
|
-
await self.
|
|
303
|
-
await self._ensure_command_immediate_stream()
|
|
304
|
-
await self._ensure_response_queue_stream()
|
|
305
|
-
await self._ensure_response_immediate_stream()
|
|
307
|
+
await self._ensure_all_streams()
|
|
306
308
|
self.kv = await self._get_or_create_kv_bucket()
|
|
307
309
|
await self._resubscribe_handlers()
|
|
308
310
|
|
|
309
311
|
async def _resubscribe_handlers(self):
|
|
310
312
|
"""Re-subscribe to all handlers after reconnection."""
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
for handler_info in self._reconnect_handlers:
|
|
317
|
-
try:
|
|
318
|
-
handler_type = handler_info['type']
|
|
319
|
-
handler = handler_info['handler']
|
|
320
|
-
subscribe_method = subscribe_methods.get(handler_type)
|
|
321
|
-
|
|
322
|
-
if subscribe_method:
|
|
323
|
-
await subscribe_method(handler)
|
|
324
|
-
else:
|
|
325
|
-
logger.warning("Unknown handler type: %s", handler_type)
|
|
326
|
-
except Exception as e:
|
|
327
|
-
logger.error("Failed to re-subscribe %s: %s", handler_type, e)
|
|
313
|
+
if self._queue_handler:
|
|
314
|
+
await self.subscribe_queue(self._queue_handler)
|
|
315
|
+
if self._immediate_handler:
|
|
316
|
+
await self.subscribe_immediate(self._immediate_handler)
|
|
328
317
|
|
|
329
318
|
async def _closed_callback(self):
|
|
330
319
|
"""Callback when connection is closed."""
|
|
@@ -437,41 +426,35 @@ class MachineClient:
|
|
|
437
426
|
logger.error("Error publishing command response: %s", e)
|
|
438
427
|
|
|
439
428
|
async def process_queue_cmd(
|
|
440
|
-
self,
|
|
429
|
+
self,
|
|
441
430
|
msg: Msg,
|
|
442
|
-
handler: Callable[[
|
|
431
|
+
handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
|
|
443
432
|
) -> None:
|
|
444
433
|
"""
|
|
445
434
|
Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
|
|
446
435
|
|
|
447
436
|
Args:
|
|
448
437
|
msg: NATS message
|
|
449
|
-
handler: Handler function that processes the message and returns CommandResponse
|
|
438
|
+
handler: Handler function that processes the message and returns a CommandResponse object
|
|
450
439
|
"""
|
|
440
|
+
# Initialize variables for exception handlers
|
|
441
|
+
run_id = None
|
|
442
|
+
step_number = None
|
|
443
|
+
command = None
|
|
444
|
+
|
|
451
445
|
try:
|
|
452
446
|
# Parse message
|
|
453
447
|
message = NATSMessage.model_validate_json(msg.data)
|
|
454
448
|
run_id = message.header.run_id
|
|
455
|
-
step_number = message.command.step_number
|
|
456
|
-
command = message.command.name
|
|
449
|
+
step_number = message.command.step_number if message.command else None
|
|
450
|
+
command = message.command.name if message.command else None
|
|
457
451
|
|
|
458
|
-
#
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
await self._publish_command_response(
|
|
463
|
-
msg=msg,
|
|
464
|
-
response=CommandResponse(
|
|
465
|
-
status=CommandResponseStatus.ERROR,
|
|
466
|
-
code=CommandResponseCode.COMMAND_CANCELLED,
|
|
467
|
-
message='Command cancelled'
|
|
468
|
-
),
|
|
469
|
-
subject=self.response_queue
|
|
470
|
-
)
|
|
471
|
-
# Note: Final state update should be published by the handler with machine-specific data
|
|
472
|
-
return
|
|
452
|
+
# For all commands, continue with normal processing:
|
|
453
|
+
# 1. Check if paused
|
|
454
|
+
# 2. Validate run_id matches active run
|
|
455
|
+
# 3. Execute handler
|
|
473
456
|
|
|
474
|
-
#
|
|
457
|
+
# If machine is paused, publish error response and return
|
|
475
458
|
async with self._pause_lock:
|
|
476
459
|
if self._is_paused:
|
|
477
460
|
await self._publish_command_response(
|
|
@@ -484,24 +467,42 @@ class MachineClient:
|
|
|
484
467
|
subject=self.response_queue
|
|
485
468
|
)
|
|
486
469
|
return
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
if
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
470
|
+
|
|
471
|
+
# Wait while paused (release lock during wait so RESUME can acquire it)
|
|
472
|
+
while True:
|
|
473
|
+
async with self._pause_lock:
|
|
474
|
+
if not self._is_paused:
|
|
475
|
+
break
|
|
476
|
+
# Release lock before sleeping so RESUME can set _is_paused = False
|
|
477
|
+
await msg.in_progress()
|
|
478
|
+
await asyncio.sleep(1)
|
|
479
|
+
|
|
480
|
+
# Validate run_id matches active run (run_id is required)
|
|
481
|
+
if run_id is None:
|
|
482
|
+
await msg.ack()
|
|
483
|
+
await self._publish_command_response(
|
|
484
|
+
msg=msg,
|
|
485
|
+
response=CommandResponse(
|
|
486
|
+
status=CommandResponseStatus.ERROR,
|
|
487
|
+
code=CommandResponseCode.EXECUTION_ERROR,
|
|
488
|
+
message='Command requires run_id'
|
|
489
|
+
),
|
|
490
|
+
subject=self.response_queue
|
|
491
|
+
)
|
|
492
|
+
return
|
|
493
|
+
|
|
494
|
+
if not await self.run_manager.validate_run_id(run_id):
|
|
495
|
+
await msg.ack()
|
|
496
|
+
await self._publish_command_response(
|
|
497
|
+
msg=msg,
|
|
498
|
+
response=CommandResponse(
|
|
499
|
+
status=CommandResponseStatus.ERROR,
|
|
500
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
501
|
+
message=f'Run ID mismatch: expected active run, got {run_id}'
|
|
502
|
+
),
|
|
503
|
+
subject=self.response_queue
|
|
504
|
+
)
|
|
505
|
+
return
|
|
505
506
|
|
|
506
507
|
# Execute handler with auto-heartbeat (task might take a while for machine to complete)
|
|
507
508
|
# The handler should be defined in the machine-specific edge module.
|
|
@@ -553,34 +554,19 @@ class MachineClient:
|
|
|
553
554
|
# This is a rare case - consider if handler should be called with None payload
|
|
554
555
|
|
|
555
556
|
except Exception as e:
|
|
556
|
-
#
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
# Note: Final state update should be published by the handler with machine-specific data
|
|
570
|
-
else:
|
|
571
|
-
# Terminate all errors to prevent infinite redelivery loops
|
|
572
|
-
logger.error("Handler failed (terminating message): %s", e)
|
|
573
|
-
await msg.term()
|
|
574
|
-
await self._publish_command_response(
|
|
575
|
-
msg=msg,
|
|
576
|
-
response=CommandResponse(
|
|
577
|
-
status=CommandResponseStatus.ERROR,
|
|
578
|
-
code=CommandResponseCode.EXECUTION_ERROR,
|
|
579
|
-
message=str(e)
|
|
580
|
-
),
|
|
581
|
-
subject=self.response_queue
|
|
582
|
-
)
|
|
583
|
-
# Note: Final state update should be published by the handler with machine-specific data
|
|
557
|
+
# Terminate all errors to prevent infinite redelivery loops
|
|
558
|
+
logger.error("Handler failed (terminating message): %s", e)
|
|
559
|
+
await msg.term()
|
|
560
|
+
await self._publish_command_response(
|
|
561
|
+
msg=msg,
|
|
562
|
+
response=CommandResponse(
|
|
563
|
+
status=CommandResponseStatus.ERROR,
|
|
564
|
+
code=CommandResponseCode.EXECUTION_ERROR,
|
|
565
|
+
message=str(e)
|
|
566
|
+
),
|
|
567
|
+
subject=self.response_queue
|
|
568
|
+
)
|
|
569
|
+
# Note: Final state update should be published by the handler with machine-specific data
|
|
584
570
|
|
|
585
571
|
async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
|
|
586
572
|
"""Process immediate commands (pause, cancel, resume, etc.)."""
|
|
@@ -595,8 +581,49 @@ class MachineClient:
|
|
|
595
581
|
return
|
|
596
582
|
|
|
597
583
|
command_name = message.command.name.lower()
|
|
584
|
+
run_id = message.header.run_id
|
|
585
|
+
response: CommandResponse
|
|
598
586
|
|
|
599
587
|
match command_name:
|
|
588
|
+
case ImmediateCommand.START:
|
|
589
|
+
if run_id:
|
|
590
|
+
success = await self.run_manager.start_run(run_id)
|
|
591
|
+
if not success:
|
|
592
|
+
# Run already active
|
|
593
|
+
response = CommandResponse(
|
|
594
|
+
status=CommandResponseStatus.ERROR,
|
|
595
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
596
|
+
message='cannot start, another run is currently running'
|
|
597
|
+
)
|
|
598
|
+
else:
|
|
599
|
+
await self.publish_state({'state': 'active', 'run_id': run_id})
|
|
600
|
+
response = CommandResponse(status=CommandResponseStatus.SUCCESS)
|
|
601
|
+
else:
|
|
602
|
+
response = CommandResponse(
|
|
603
|
+
status=CommandResponseStatus.ERROR,
|
|
604
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
605
|
+
message='START command requires RUN_ID'
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
case ImmediateCommand.COMPLETE:
|
|
609
|
+
if not run_id:
|
|
610
|
+
response = CommandResponse(
|
|
611
|
+
status=CommandResponseStatus.ERROR,
|
|
612
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
613
|
+
message='COMPLETE command requires RUN_ID'
|
|
614
|
+
)
|
|
615
|
+
else:
|
|
616
|
+
success = await self.run_manager.complete_run(run_id)
|
|
617
|
+
if success:
|
|
618
|
+
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
619
|
+
response = CommandResponse(status=CommandResponseStatus.SUCCESS)
|
|
620
|
+
else:
|
|
621
|
+
response = CommandResponse(
|
|
622
|
+
status=CommandResponseStatus.ERROR,
|
|
623
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
624
|
+
message=f'Run {run_id} not active'
|
|
625
|
+
)
|
|
626
|
+
|
|
600
627
|
case ImmediateCommand.PAUSE:
|
|
601
628
|
async with self._pause_lock:
|
|
602
629
|
if not self._is_paused:
|
|
@@ -604,7 +631,7 @@ class MachineClient:
|
|
|
604
631
|
logger.info("Queue paused")
|
|
605
632
|
await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
|
|
606
633
|
# Call handler and use its response
|
|
607
|
-
response
|
|
634
|
+
response = await handler(message)
|
|
608
635
|
|
|
609
636
|
case ImmediateCommand.RESUME:
|
|
610
637
|
async with self._pause_lock:
|
|
@@ -613,19 +640,30 @@ class MachineClient:
|
|
|
613
640
|
logger.info("Queue resumed")
|
|
614
641
|
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
615
642
|
# Call handler and use its response
|
|
616
|
-
response
|
|
643
|
+
response = await handler(message)
|
|
617
644
|
|
|
618
645
|
case ImmediateCommand.CANCEL:
|
|
619
|
-
if
|
|
620
|
-
|
|
621
|
-
|
|
646
|
+
if not run_id:
|
|
647
|
+
response = CommandResponse(
|
|
648
|
+
status=CommandResponseStatus.ERROR,
|
|
649
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
650
|
+
message='CANCEL command requires RUN_ID'
|
|
651
|
+
)
|
|
652
|
+
else:
|
|
653
|
+
logger.info("Cancelling all commands with run_id: %s", run_id)
|
|
654
|
+
# Clear the active run_id when cancelling (try to complete, but clear anyway)
|
|
655
|
+
await self.run_manager.complete_run(run_id)
|
|
622
656
|
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
623
|
-
|
|
624
|
-
|
|
657
|
+
# Call handler and use its response
|
|
658
|
+
response = await handler(message)
|
|
625
659
|
|
|
626
660
|
case _:
|
|
627
|
-
#
|
|
628
|
-
response
|
|
661
|
+
# Unknown immediate command
|
|
662
|
+
response = CommandResponse(
|
|
663
|
+
status=CommandResponseStatus.ERROR,
|
|
664
|
+
code=CommandResponseCode.UNKNOWN_COMMAND,
|
|
665
|
+
message=f'Unknown immediate command: {command_name}'
|
|
666
|
+
)
|
|
629
667
|
|
|
630
668
|
await self._publish_command_response(
|
|
631
669
|
msg=msg,
|
|
@@ -661,9 +699,54 @@ class MachineClient:
|
|
|
661
699
|
)
|
|
662
700
|
await self.publish_state({'state': 'error', 'run_id': None})
|
|
663
701
|
|
|
702
|
+
async def _verify_or_recreate_consumer(self, durable_name: str):
|
|
703
|
+
"""
|
|
704
|
+
Check if consumer exists and verify/update its configuration.
|
|
705
|
+
Deletes and recreates the consumer if configuration doesn't match.
|
|
706
|
+
|
|
707
|
+
Args:
|
|
708
|
+
durable_name: Name of the durable consumer to verify
|
|
709
|
+
"""
|
|
710
|
+
# Check if consumer exists and verify/update its configuration
|
|
711
|
+
try:
|
|
712
|
+
consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
|
|
713
|
+
logger.debug("Durable consumer %s already exists", durable_name)
|
|
714
|
+
|
|
715
|
+
# Check if consumer config matches what we need
|
|
716
|
+
config = consumer_info.config
|
|
717
|
+
needs_recreate = False
|
|
718
|
+
if getattr(config, 'filter_subject', None) != self.cmd_queue:
|
|
719
|
+
logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
|
|
720
|
+
self.cmd_queue, getattr(config, 'filter_subject', None))
|
|
721
|
+
needs_recreate = True
|
|
722
|
+
if getattr(config, 'ack_policy', None) != 'explicit':
|
|
723
|
+
logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
|
|
724
|
+
getattr(config, 'ack_policy', None))
|
|
725
|
+
needs_recreate = True
|
|
726
|
+
if getattr(config, 'deliver_policy', None) != 'all':
|
|
727
|
+
logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
|
|
728
|
+
getattr(config, 'deliver_policy', None))
|
|
729
|
+
needs_recreate = True
|
|
730
|
+
|
|
731
|
+
if needs_recreate:
|
|
732
|
+
# Consumer exists but config doesn't match - delete and recreate
|
|
733
|
+
logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
|
|
734
|
+
try:
|
|
735
|
+
await self.js.delete_consumer(self.STREAM_COMMAND_QUEUE, durable_name)
|
|
736
|
+
except Exception as e:
|
|
737
|
+
logger.warning("Error deleting consumer: %s", e)
|
|
738
|
+
else:
|
|
739
|
+
# Log consumer state for diagnostics
|
|
740
|
+
logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
|
|
741
|
+
consumer_info.num_pending, consumer_info.delivered.consumer_seq,
|
|
742
|
+
consumer_info.num_ack_pending)
|
|
743
|
+
except NotFoundError:
|
|
744
|
+
# Consumer doesn't exist, will be created by pull_subscribe
|
|
745
|
+
logger.debug("Durable consumer %s does not exist, will be created", durable_name)
|
|
746
|
+
|
|
664
747
|
async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
|
|
665
748
|
"""
|
|
666
|
-
Subscribe to queue commands with
|
|
749
|
+
Subscribe to queue commands with pull consumer.
|
|
667
750
|
|
|
668
751
|
Args:
|
|
669
752
|
handler: Async function that processes command payloads and returns CommandResponse
|
|
@@ -671,21 +754,69 @@ class MachineClient:
|
|
|
671
754
|
if not self.js:
|
|
672
755
|
logger.error("JetStream not available for queue subscription")
|
|
673
756
|
return
|
|
757
|
+
|
|
758
|
+
# Store handler for reconnection
|
|
759
|
+
self._queue_handler = handler
|
|
674
760
|
|
|
675
761
|
# Ensure stream exists before attempting to subscribe
|
|
676
|
-
await self.
|
|
762
|
+
await self._ensure_all_streams()
|
|
677
763
|
|
|
678
764
|
try:
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
765
|
+
durable_name = f"cmd_queue_{self.machine_id}"
|
|
766
|
+
|
|
767
|
+
await self._verify_or_recreate_consumer(durable_name)
|
|
768
|
+
|
|
769
|
+
# Create pull subscription - this will create the consumer if it doesn't exist
|
|
770
|
+
# Pass config directly to ensure correct consumer configuration
|
|
771
|
+
consumer_config = ConsumerConfig(
|
|
772
|
+
durable_name=durable_name,
|
|
773
|
+
filter_subject=self.cmd_queue,
|
|
774
|
+
ack_policy="explicit",
|
|
775
|
+
deliver_policy="all", # Required for WorkQueue: deliver all messages from the beginning
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
self._cmd_queue_sub = await self.js.pull_subscribe(
|
|
684
779
|
subject=self.cmd_queue,
|
|
780
|
+
durable=durable_name,
|
|
685
781
|
stream=self.STREAM_COMMAND_QUEUE,
|
|
686
|
-
|
|
687
|
-
cb=message_handler
|
|
782
|
+
config=consumer_config
|
|
688
783
|
)
|
|
784
|
+
|
|
785
|
+
# Log final consumer info for diagnostics
|
|
786
|
+
try:
|
|
787
|
+
consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
|
|
788
|
+
logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
|
|
789
|
+
self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
|
|
790
|
+
consumer_info.num_pending, consumer_info.num_ack_pending)
|
|
791
|
+
except Exception as e:
|
|
792
|
+
logger.warning("Could not get consumer info after subscription: %s", e)
|
|
793
|
+
logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
|
|
794
|
+
self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
|
|
795
|
+
|
|
796
|
+
# Start background task to pull and process messages
|
|
797
|
+
async def pull_messages():
|
|
798
|
+
"""Continuously pull messages from the queue."""
|
|
799
|
+
try:
|
|
800
|
+
while True:
|
|
801
|
+
try:
|
|
802
|
+
# Fetch one message (timeout 1 second)
|
|
803
|
+
msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
|
|
804
|
+
if msgs:
|
|
805
|
+
logger.debug("Pulled message from queue")
|
|
806
|
+
await self.process_queue_cmd(msgs[0], handler)
|
|
807
|
+
except asyncio.TimeoutError:
|
|
808
|
+
# Timeout is expected when no messages are available
|
|
809
|
+
continue
|
|
810
|
+
except Exception as e:
|
|
811
|
+
logger.error("Error pulling queue messages: %s", e, exc_info=True)
|
|
812
|
+
await asyncio.sleep(1) # Wait before retrying
|
|
813
|
+
except asyncio.CancelledError:
|
|
814
|
+
logger.debug("Queue pull task cancelled")
|
|
815
|
+
raise
|
|
816
|
+
|
|
817
|
+
self._cmd_queue_task = asyncio.create_task(pull_messages())
|
|
818
|
+
logger.info("Started background task for pulling queue messages")
|
|
819
|
+
|
|
689
820
|
except NotFoundError:
|
|
690
821
|
# Stream still not found after ensuring it exists - this shouldn't happen
|
|
691
822
|
# but handle it gracefully with detailed diagnostics
|
|
@@ -703,10 +834,7 @@ class MachineClient:
|
|
|
703
834
|
logger.error(" Stream verification failed: %s", stream_check_error)
|
|
704
835
|
raise
|
|
705
836
|
|
|
706
|
-
|
|
707
|
-
if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
|
|
708
|
-
self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
|
|
709
|
-
logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
|
|
837
|
+
logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
|
|
710
838
|
self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
|
|
711
839
|
|
|
712
840
|
async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
|
|
@@ -720,19 +848,26 @@ class MachineClient:
|
|
|
720
848
|
logger.error("JetStream not available for immediate subscription")
|
|
721
849
|
return
|
|
722
850
|
|
|
851
|
+
# Store handler for use in callback and reconnection
|
|
852
|
+
self._immediate_handler = handler
|
|
853
|
+
|
|
723
854
|
async def message_handler(msg: Msg):
|
|
724
|
-
"""
|
|
725
|
-
await self.process_immediate_cmd(msg,
|
|
855
|
+
"""Process immediate messages using stored handler."""
|
|
856
|
+
await self.process_immediate_cmd(msg, self._immediate_handler)
|
|
726
857
|
|
|
727
858
|
# Ensure stream exists before attempting to subscribe
|
|
728
|
-
await self.
|
|
859
|
+
await self._ensure_stream(
|
|
860
|
+
self.STREAM_COMMAND_IMMEDIATE,
|
|
861
|
+
f"{self.NAMESPACE}.*.cmd.immediate",
|
|
862
|
+
retention='workqueue'
|
|
863
|
+
)
|
|
729
864
|
|
|
730
865
|
try:
|
|
731
866
|
self._cmd_immediate_sub = await self.js.subscribe(
|
|
732
867
|
subject=self.cmd_immediate,
|
|
733
868
|
stream=self.STREAM_COMMAND_IMMEDIATE,
|
|
734
869
|
durable=f"cmd_immed_{self.machine_id}",
|
|
735
|
-
cb=message_handler
|
|
870
|
+
cb=message_handler # required for push consumer to handle messages
|
|
736
871
|
)
|
|
737
872
|
except NotFoundError:
|
|
738
873
|
# Stream still not found after ensuring it exists - this shouldn't happen
|
|
@@ -741,9 +876,6 @@ class MachineClient:
|
|
|
741
876
|
self.STREAM_COMMAND_IMMEDIATE)
|
|
742
877
|
raise
|
|
743
878
|
|
|
744
|
-
# Register handler for reconnection
|
|
745
|
-
if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
|
|
746
|
-
self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
|
|
747
879
|
logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
|
|
748
880
|
self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
|
|
749
881
|
|
puda_comms/models.py
CHANGED
|
@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
|
|
|
25
25
|
RESUME_ERROR = 'RESUME_ERROR'
|
|
26
26
|
NO_EXECUTION = 'NO_EXECUTION'
|
|
27
27
|
RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
|
|
28
|
+
MISSING_RUN_ID = 'MISSING_RUN_ID'
|
|
28
29
|
CANCEL_ERROR = 'CANCEL_ERROR'
|
|
29
30
|
MACHINE_PAUSED = 'MACHINE_PAUSED'
|
|
30
31
|
|
|
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
|
|
|
40
41
|
|
|
41
42
|
class ImmediateCommand(str, Enum):
|
|
42
43
|
"""Command names for immediate commands."""
|
|
44
|
+
START = 'start'
|
|
45
|
+
COMPLETE = 'complete'
|
|
43
46
|
PAUSE = 'pause'
|
|
44
47
|
RESUME = 'resume'
|
|
45
48
|
CANCEL = 'cancel'
|
|
@@ -68,18 +71,19 @@ class CommandResponse(BaseModel):
|
|
|
68
71
|
|
|
69
72
|
class MessageHeader(BaseModel):
|
|
70
73
|
"""Header for NATS messages."""
|
|
71
|
-
message_type: MessageType = Field(description="Type of message")
|
|
72
74
|
version: str = Field(default="1.0", description="Message version")
|
|
73
|
-
|
|
75
|
+
message_type: MessageType = Field(description="Type of message")
|
|
76
|
+
user_id: str = Field(description="User ID")
|
|
77
|
+
username: str = Field(description="User name")
|
|
74
78
|
machine_id: str = Field(description="Machine ID")
|
|
75
79
|
run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
|
|
76
|
-
|
|
80
|
+
timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
|
|
77
81
|
class NATSMessage(BaseModel):
|
|
78
82
|
"""
|
|
79
83
|
Complete NATS message structure.
|
|
80
84
|
|
|
81
85
|
Structure:
|
|
82
|
-
- header: MessageHeader with message_type, version, timestamp, machine_id, run_id
|
|
86
|
+
- header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
|
|
83
87
|
- command: Optional CommandRequest (for command messages)
|
|
84
88
|
- response: Optional CommandResponse data (for response messages)
|
|
85
89
|
"""
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Run State Management
|
|
3
|
+
Provides thread-safe run state tracking and validation for machine commands.
|
|
4
|
+
"""
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RunManager:
|
|
13
|
+
"""
|
|
14
|
+
Manages run state for a machine.
|
|
15
|
+
|
|
16
|
+
Tracks the active run_id and validates that commands match the active run.
|
|
17
|
+
Provides thread-safe operations for run lifecycle management.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, machine_id: str):
|
|
21
|
+
"""
|
|
22
|
+
Initialize RunManager for a machine.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
machine_id: Machine identifier
|
|
26
|
+
"""
|
|
27
|
+
self.machine_id = machine_id
|
|
28
|
+
self._active_run_id: Optional[str] = None
|
|
29
|
+
self._lock = asyncio.Lock()
|
|
30
|
+
|
|
31
|
+
async def start_run(self, run_id: str) -> bool:
|
|
32
|
+
"""
|
|
33
|
+
Set active run_id. Returns True if successful, False if run already active.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
run_id: Run ID to set as active
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
True if run was started successfully, False if another run is already active
|
|
40
|
+
"""
|
|
41
|
+
async with self._lock:
|
|
42
|
+
if self._active_run_id is not None:
|
|
43
|
+
logger.warning(
|
|
44
|
+
"Cannot start run %s: run %s is already active on machine %s",
|
|
45
|
+
run_id, self._active_run_id, self.machine_id
|
|
46
|
+
)
|
|
47
|
+
return False
|
|
48
|
+
|
|
49
|
+
self._active_run_id = run_id
|
|
50
|
+
logger.info("Started run %s on machine %s", run_id, self.machine_id)
|
|
51
|
+
return True
|
|
52
|
+
|
|
53
|
+
async def complete_run(self, run_id: str) -> bool:
|
|
54
|
+
"""
|
|
55
|
+
Clear run_id if it matches. Returns True if successful.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
run_id: Run ID to complete
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
True if run was completed successfully, False if run_id doesn't match active run
|
|
62
|
+
"""
|
|
63
|
+
async with self._lock:
|
|
64
|
+
if self._active_run_id != run_id:
|
|
65
|
+
logger.warning(
|
|
66
|
+
"Cannot complete run %s: active run is %s on machine %s",
|
|
67
|
+
run_id, self._active_run_id, self.machine_id
|
|
68
|
+
)
|
|
69
|
+
return False
|
|
70
|
+
|
|
71
|
+
self._active_run_id = None
|
|
72
|
+
logger.info("Completed run %s on machine %s", run_id, self.machine_id)
|
|
73
|
+
return True
|
|
74
|
+
|
|
75
|
+
async def validate_run_id(self, run_id: str) -> bool:
|
|
76
|
+
"""
|
|
77
|
+
Check if run_id matches active run. Returns True if valid.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
run_id: Run ID to validate (required)
|
|
81
|
+
|
|
82
|
+
Returns:
|
|
83
|
+
True if run_id matches active run, False otherwise
|
|
84
|
+
"""
|
|
85
|
+
async with self._lock:
|
|
86
|
+
# If no active run, any run_id is invalid
|
|
87
|
+
if self._active_run_id is None:
|
|
88
|
+
logger.warning(
|
|
89
|
+
"Run ID validation failed: no active run, got %s on machine %s",
|
|
90
|
+
run_id, self.machine_id
|
|
91
|
+
)
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
# Run_id must match active run
|
|
95
|
+
if self._active_run_id != run_id:
|
|
96
|
+
logger.warning(
|
|
97
|
+
"Run ID validation failed: expected %s, got %s on machine %s",
|
|
98
|
+
self._active_run_id, run_id, self.machine_id
|
|
99
|
+
)
|
|
100
|
+
return False
|
|
101
|
+
|
|
102
|
+
return True
|
|
103
|
+
|
|
104
|
+
def get_active_run_id(self) -> Optional[str]:
|
|
105
|
+
"""
|
|
106
|
+
Get current active run_id.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
Active run_id if one exists, None otherwise
|
|
110
|
+
"""
|
|
111
|
+
return self._active_run_id
|
|
112
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: puda-comms
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.5
|
|
4
4
|
Summary: Communication library for the PUDA platform.
|
|
5
5
|
Author: zhao
|
|
6
6
|
Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
|
|
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
|
|
|
121
121
|
- `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
|
|
122
122
|
- `version` (str): Message version (default: "1.0")
|
|
123
123
|
- `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
|
|
124
|
+
- `user_id` (str): User ID who initiated the command
|
|
125
|
+
- `username` (str): Username who initiated the command
|
|
124
126
|
- `machine_id` (str): Identifier for the target machine
|
|
125
127
|
- `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
|
|
126
128
|
|
|
@@ -130,6 +132,8 @@ header = MessageHeader(
|
|
|
130
132
|
message_type=MessageType.RESPONSE,
|
|
131
133
|
version="1.0",
|
|
132
134
|
timestamp="2026-01-20T02:00:46Z",
|
|
135
|
+
user_id="user123",
|
|
136
|
+
username="John Doe",
|
|
133
137
|
machine_id="first",
|
|
134
138
|
run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
|
|
135
139
|
)
|
|
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
|
|
|
154
158
|
"message_type": "response",
|
|
155
159
|
"version": "1.0",
|
|
156
160
|
"timestamp": "2026-01-20T02:00:46Z",
|
|
161
|
+
"user_id": "user123",
|
|
162
|
+
"username": "John Doe",
|
|
157
163
|
"machine_id": "first",
|
|
158
164
|
"run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
|
|
159
165
|
},
|
|
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
|
|
|
229
235
|
request=request,
|
|
230
236
|
machine_id="first",
|
|
231
237
|
run_id=run_id,
|
|
238
|
+
user_id="user123",
|
|
239
|
+
username="John Doe",
|
|
232
240
|
timeout=60 # Wait up to 60 seconds
|
|
233
241
|
)
|
|
234
242
|
|
|
@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
|
|
|
237
245
|
requests=commands,
|
|
238
246
|
machine_id="first",
|
|
239
247
|
run_id=run_id,
|
|
248
|
+
user_id="user123",
|
|
249
|
+
username="John Doe",
|
|
240
250
|
timeout=60 # Wait up to 60 seconds per command
|
|
241
251
|
)
|
|
242
252
|
```
|
|
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
|
|
|
274
284
|
reply: NATSMessage = await service.send_queue_command(
|
|
275
285
|
request=request,
|
|
276
286
|
machine_id="first",
|
|
277
|
-
run_id=run_id
|
|
287
|
+
run_id=run_id,
|
|
288
|
+
user_id="user123",
|
|
289
|
+
username="John Doe"
|
|
278
290
|
)
|
|
279
291
|
|
|
280
292
|
if reply is None:
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
|
|
2
|
+
puda_comms/command_service.py,sha256=Lxk-CUan_DwftBZlSYO3VnddxaM9fYKxxhWF8VCqABY,30423
|
|
3
|
+
puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
|
|
4
|
+
puda_comms/machine_client.py,sha256=OnA8we1c62n1aEFr0NfiapklHWXR-WFzq5FXQrvuUM8,39378
|
|
5
|
+
puda_comms/models.py,sha256=CfXq_Wxqk5OQo5VknXR-BdLIT2SM69s8cGxGYr9T8WI,3701
|
|
6
|
+
puda_comms/run_manager.py,sha256=_s4VYVGwtRMcduz95_DPIObso4uWRS24n5NH7AiGgjI,3591
|
|
7
|
+
puda_comms-0.0.5.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
|
|
8
|
+
puda_comms-0.0.5.dist-info/METADATA,sha256=REBvcpJsUCxiFCKihVVReP0lh6IkJcBl4I8XohjhSHE,11512
|
|
9
|
+
puda_comms-0.0.5.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
|
|
2
|
-
puda_comms/command_service.py,sha256=E5kGzl2hjkSTubxv01nxuo9XMXHY5aTEsn-k3IDJVB8,24727
|
|
3
|
-
puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
|
|
4
|
-
puda_comms/machine_client.py,sha256=r8oSnkRoqhKykvyR94kGlA1vRrCKLq-o9uNZQftxqDU,33120
|
|
5
|
-
puda_comms/models.py,sha256=cVH5uKzyLmjzPeBcm3RIJMTkoynmxqe_P26GtZwlIN8,3500
|
|
6
|
-
puda_comms-0.0.3.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
|
|
7
|
-
puda_comms-0.0.3.dist-info/METADATA,sha256=Fnf_YWeOZAcefPUTY976BUT95M0w-8bSqAhjVMkmjxA,11158
|
|
8
|
-
puda_comms-0.0.3.dist-info/RECORD,,
|
|
File without changes
|