puda-comms 0.0.3__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: puda-comms
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: Communication library for the PUDA platform.
5
5
  Author: zhao
6
6
  Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
121
121
  - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
122
122
  - `version` (str): Message version (default: "1.0")
123
123
  - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
124
+ - `user_id` (str): ID of the user who initiated the command
125
+ - `username` (str): Username of the user who initiated the command
124
126
  - `machine_id` (str): Identifier for the target machine
125
127
  - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
126
128
 
@@ -130,6 +132,8 @@ header = MessageHeader(
130
132
  message_type=MessageType.RESPONSE,
131
133
  version="1.0",
132
134
  timestamp="2026-01-20T02:00:46Z",
135
+ user_id="user123",
136
+ username="John Doe",
133
137
  machine_id="first",
134
138
  run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
135
139
  )
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
154
158
  "message_type": "response",
155
159
  "version": "1.0",
156
160
  "timestamp": "2026-01-20T02:00:46Z",
161
+ "user_id": "user123",
162
+ "username": "John Doe",
157
163
  "machine_id": "first",
158
164
  "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
159
165
  },
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
229
235
  request=request,
230
236
  machine_id="first",
231
237
  run_id=run_id,
238
+ user_id="user123",
239
+ username="John Doe",
232
240
  timeout=60 # Wait up to 60 seconds
233
241
  )
234
242
 
@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
237
245
  requests=commands,
238
246
  machine_id="first",
239
247
  run_id=run_id,
248
+ user_id="user123",
249
+ username="John Doe",
240
250
  timeout=60 # Wait up to 60 seconds per command
241
251
  )
242
252
  ```
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
274
284
  reply: NATSMessage = await service.send_queue_command(
275
285
  request=request,
276
286
  machine_id="first",
277
- run_id=run_id
287
+ run_id=run_id,
288
+ user_id="user123",
289
+ username="John Doe"
278
290
  )
279
291
 
280
292
  if reply is None:
@@ -109,6 +109,8 @@ Header metadata for NATS messages.
109
109
  - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
110
110
  - `version` (str): Message version (default: "1.0")
111
111
  - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
112
+ - `user_id` (str): ID of the user who initiated the command
113
+ - `username` (str): Username of the user who initiated the command
112
114
  - `machine_id` (str): Identifier for the target machine
113
115
  - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
114
116
 
@@ -118,6 +120,8 @@ header = MessageHeader(
118
120
  message_type=MessageType.RESPONSE,
119
121
  version="1.0",
120
122
  timestamp="2026-01-20T02:00:46Z",
123
+ user_id="user123",
124
+ username="John Doe",
121
125
  machine_id="first",
122
126
  run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
123
127
  )
@@ -142,6 +146,8 @@ Complete NATS message structure combining header with optional command or respon
142
146
  "message_type": "response",
143
147
  "version": "1.0",
144
148
  "timestamp": "2026-01-20T02:00:46Z",
149
+ "user_id": "user123",
150
+ "username": "John Doe",
145
151
  "machine_id": "first",
146
152
  "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
147
153
  },
@@ -217,6 +223,8 @@ reply = await service.send_queue_command(
217
223
  request=request,
218
224
  machine_id="first",
219
225
  run_id=run_id,
226
+ user_id="user123",
227
+ username="John Doe",
220
228
  timeout=60 # Wait up to 60 seconds
221
229
  )
222
230
 
@@ -225,6 +233,8 @@ reply = await service.send_queue_commands(
225
233
  requests=commands,
226
234
  machine_id="first",
227
235
  run_id=run_id,
236
+ user_id="user123",
237
+ username="John Doe",
228
238
  timeout=60 # Wait up to 60 seconds per command
229
239
  )
230
240
  ```
@@ -262,7 +272,9 @@ Always check the response status and handle errors appropriately:
262
272
  reply: NATSMessage = await service.send_queue_command(
263
273
  request=request,
264
274
  machine_id="first",
265
- run_id=run_id
275
+ run_id=run_id,
276
+ user_id="user123",
277
+ username="John Doe"
266
278
  )
267
279
 
268
280
  if reply is None:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "puda-comms"
3
- version = "0.0.3"
3
+ version = "0.0.4"
4
4
  description = "Communication library for the PUDA platform."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -12,11 +12,11 @@ import json
12
12
  import logging
13
13
  import signal
14
14
  from datetime import datetime, timezone
15
- from typing import Dict, Any, Optional, Tuple
15
+ from typing import Dict, Any, Optional
16
16
  import nats
17
17
  from nats.js.client import JetStreamContext
18
18
  from nats.aio.msg import Msg
19
- from puda_comms.models import CommandRequest, CommandResponse, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
19
+ from puda_comms.models import CommandRequest, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
20
20
 
21
21
  logger = logging.getLogger(__name__)
22
22
 
@@ -37,7 +37,7 @@ class ResponseHandler:
37
37
  def __init__(self, js: JetStreamContext, machine_id: str):
38
38
  self.js = js
39
39
  self.machine_id = machine_id
40
- self._pending_responses: Dict[str, Tuple[asyncio.Event, CommandResponse]] = {}
40
+ self._pending_responses: Dict[str, Dict[str, Any]] = {} # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
41
41
  self._queue_consumer = None
42
42
  self._immediate_consumer = None
43
43
  self._initialized = False
@@ -102,8 +102,8 @@ class ResponseHandler:
102
102
 
103
103
  # Get the pending response
104
104
  pending = self._pending_responses[key]
105
- # Store the full NATSMessage JSON structure
106
- pending['response'] = message.model_dump()
105
+ # Store the NATSMessage directly
106
+ pending['response'] = message
107
107
  # Signal that response was received
108
108
  # Don't delete here - let get_response() delete it after retrieval
109
109
  pending['event'].set()
@@ -152,7 +152,7 @@ class ResponseHandler:
152
152
  }
153
153
  return event
154
154
 
155
- def get_response(self, run_id: str, step_number: int) -> Optional[Dict[str, Any]]:
155
+ def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
156
156
  """
157
157
  Get the response for a pending command.
158
158
 
@@ -161,7 +161,7 @@ class ResponseHandler:
161
161
  step_number: Step number for the command
162
162
 
163
163
  Returns:
164
- The NATSMessage dict structure if available, None otherwise
164
+ The NATSMessage if available, None otherwise
165
165
  """
166
166
  key = f"{run_id}:{str(step_number)}"
167
167
  if key in self._pending_responses:
@@ -343,6 +343,8 @@ class CommandService:
343
343
  request: CommandRequest,
344
344
  machine_id: str,
345
345
  run_id: str,
346
+ user_id: str,
347
+ username: str,
346
348
  timeout: int = 120
347
349
  ) -> Optional[NATSMessage]:
348
350
  """
@@ -352,6 +354,8 @@ class CommandService:
352
354
  request: CommandRequest model containing command details
353
355
  machine_id: Machine ID to send the command to
354
356
  run_id: Run ID for the command
357
+ user_id: User ID who initiated the command
358
+ username: Username who initiated the command
355
359
  timeout: Maximum time to wait for response in seconds
356
360
 
357
361
  Returns:
@@ -364,8 +368,8 @@ class CommandService:
364
368
  subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
365
369
 
366
370
  logger.info(
367
- "Sending queue command: machine_id=%s, command=%s, run_id=%s, step_number=%s",
368
- machine_id, request.name, run_id, request.step_number
371
+ "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
372
+ subject, request.name, run_id, request.step_number
369
373
  )
370
374
 
371
375
  # Get or create response handler for this machine
@@ -374,7 +378,7 @@ class CommandService:
374
378
  response_event = response_handler.register_pending(run_id, request.step_number)
375
379
 
376
380
  # Build payload
377
- payload = self._build_command_payload(request, machine_id, run_id)
381
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
378
382
 
379
383
  try:
380
384
  # Publish to JetStream
@@ -397,11 +401,7 @@ class CommandService:
397
401
  await asyncio.sleep(0.1)
398
402
 
399
403
  # Get the response
400
- response_data = response_handler.get_response(run_id, request.step_number)
401
- if response_data is None:
402
- return None
403
-
404
- return NATSMessage.model_validate(response_data)
404
+ return response_handler.get_response(run_id, request.step_number)
405
405
 
406
406
  except Exception as e:
407
407
  logger.error("Error sending queue command: %s", e)
@@ -414,6 +414,8 @@ class CommandService:
414
414
  requests: list[CommandRequest],
415
415
  machine_id: str,
416
416
  run_id: str,
417
+ user_id: str,
418
+ username: str,
417
419
  timeout: int = 120
418
420
  ) -> Optional[NATSMessage]:
419
421
  """
@@ -427,6 +429,8 @@ class CommandService:
427
429
  requests: List of CommandRequest models to send sequentially
428
430
  machine_id: Machine ID to send the commands to
429
431
  run_id: Run ID for all commands
432
+ user_id: User ID who initiated the commands
433
+ username: Username who initiated the commands
430
434
  timeout: Maximum time to wait for each response in seconds
431
435
 
432
436
  Returns:
@@ -462,6 +466,8 @@ class CommandService:
462
466
  request=request,
463
467
  machine_id=machine_id,
464
468
  run_id=run_id,
469
+ user_id=user_id,
470
+ username=username,
465
471
  timeout=timeout
466
472
  )
467
473
 
@@ -522,6 +528,8 @@ class CommandService:
522
528
  request: CommandRequest,
523
529
  machine_id: str,
524
530
  run_id: str,
531
+ user_id: str,
532
+ username: str,
525
533
  timeout: int = 120
526
534
  ) -> Optional[NATSMessage]:
527
535
  """
@@ -531,6 +539,8 @@ class CommandService:
531
539
  request: CommandRequest model containing command details
532
540
  machine_id: Machine ID to send the command to
533
541
  run_id: Run ID for the command
542
+ user_id: User ID who initiated the command
543
+ username: Username who initiated the command
534
544
  timeout: Maximum time to wait for response in seconds
535
545
 
536
546
  Returns:
@@ -555,7 +565,7 @@ class CommandService:
555
565
  response_received = response_handler.register_pending(run_id, request.step_number)
556
566
 
557
567
  # Build payload
558
- payload = self._build_command_payload(request, machine_id, run_id)
568
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
559
569
 
560
570
  try:
561
571
  # Publish to JetStream
@@ -578,11 +588,7 @@ class CommandService:
578
588
  await asyncio.sleep(0.1)
579
589
 
580
590
  # Get the response
581
- response_data = response_handler.get_response(run_id, request.step_number)
582
- if response_data is None:
583
- return None
584
-
585
- return NATSMessage.model_validate(response_data)
591
+ return response_handler.get_response(run_id, request.step_number)
586
592
 
587
593
  except Exception as e:
588
594
  logger.error("Error sending immediate command: %s", e)
@@ -635,7 +641,9 @@ class CommandService:
635
641
  self,
636
642
  command_request: CommandRequest,
637
643
  machine_id: str,
638
- run_id: str
644
+ run_id: str,
645
+ user_id: str,
646
+ username: str
639
647
  ) -> NATSMessage:
640
648
  """
641
649
  Build a command payload in the expected format.
@@ -644,6 +652,8 @@ class CommandService:
644
652
  command_request: CommandRequest model containing command details
645
653
  machine_id: Machine ID for the command
646
654
  run_id: Run ID for the command
655
+ user_id: User ID who initiated the command
656
+ username: Username who initiated the command
647
657
 
648
658
  Returns:
649
659
  NATSMessage object ready for NATS transmission
@@ -652,6 +662,8 @@ class CommandService:
652
662
  message_type=MessageType.COMMAND,
653
663
  version="1.0",
654
664
  timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
665
+ user_id=user_id,
666
+ username=username,
655
667
  machine_id=machine_id,
656
668
  run_id=run_id
657
669
  )
@@ -20,7 +20,7 @@ from puda_comms.models import (
20
20
  ImmediateCommand,
21
21
  )
22
22
  from nats.js.client import JetStreamContext
23
- from nats.js.api import StreamConfig
23
+ from nats.js.api import StreamConfig, ConsumerConfig
24
24
  from nats.js.errors import NotFoundError
25
25
  from nats.aio.msg import Msg
26
26
 
@@ -69,11 +69,13 @@ class MachineClient:
69
69
 
70
70
  # Default subscriptions
71
71
  self._cmd_queue_sub = None
72
+ self._cmd_queue_task = None # Background task for pull consumer
72
73
  self._cmd_immediate_sub = None
73
74
 
74
75
  # Connection state
75
76
  self._is_connected = False
76
- self._reconnect_handlers = []
77
+ self._queue_handler = None
78
+ self._immediate_handler = None
77
79
 
78
80
  # Queue control state
79
81
  self._pause_lock = asyncio.Lock()
@@ -184,30 +186,22 @@ class MachineClient:
184
186
  logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
185
187
  raise
186
188
 
187
- async def _ensure_command_queue_stream(self):
188
- """Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
189
+ async def _ensure_all_streams(self):
190
+ """Ensure all required streams exist with correct retention policies."""
189
191
  await self._ensure_stream(
190
192
  self.STREAM_COMMAND_QUEUE,
191
- f"{self.NAMESPACE}.*.cmd.queue"
193
+ f"{self.NAMESPACE}.*.cmd.queue",
194
+ retention='workqueue'
192
195
  )
193
-
194
- async def _ensure_command_immediate_stream(self):
195
- """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
196
196
  await self._ensure_stream(
197
197
  self.STREAM_COMMAND_IMMEDIATE,
198
198
  f"{self.NAMESPACE}.*.cmd.immediate"
199
199
  )
200
-
201
- async def _ensure_response_queue_stream(self):
202
- """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
203
200
  await self._ensure_stream(
204
201
  self.STREAM_RESPONSE_QUEUE,
205
202
  f"{self.NAMESPACE}.*.cmd.response.queue",
206
203
  retention='interest'
207
204
  )
208
-
209
- async def _ensure_response_immediate_stream(self):
210
- """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
211
205
  await self._ensure_stream(
212
206
  self.STREAM_RESPONSE_IMMEDIATE,
213
207
  f"{self.NAMESPACE}.*.cmd.response.immediate",
@@ -230,7 +224,17 @@ class MachineClient:
230
224
 
231
225
  async def _cleanup_subscriptions(self):
232
226
  """Unsubscribe from all subscriptions."""
233
- # Clean up subscriptions
227
+ # Clean up queue subscription (pull consumer)
228
+ if self._cmd_queue_task:
229
+ try:
230
+ self._cmd_queue_task.cancel()
231
+ await self._cmd_queue_task
232
+ except asyncio.CancelledError:
233
+ pass
234
+ except Exception:
235
+ pass
236
+ self._cmd_queue_task = None
237
+
234
238
  if self._cmd_queue_sub:
235
239
  try:
236
240
  await self._cmd_queue_sub.unsubscribe()
@@ -252,6 +256,7 @@ class MachineClient:
252
256
  self.kv = None
253
257
  # Subscriptions will be recreated on reconnection
254
258
  self._cmd_queue_sub = None
259
+ self._cmd_queue_task = None
255
260
  self._cmd_immediate_sub = None
256
261
 
257
262
  # ==================== CONNECTION MANAGEMENT ====================
@@ -270,10 +275,7 @@ class MachineClient:
270
275
  closed_cb=self._closed_callback
271
276
  )
272
277
  self.js = self.nc.jetstream()
273
- await self._ensure_command_queue_stream()
274
- await self._ensure_command_immediate_stream()
275
- await self._ensure_response_queue_stream()
276
- await self._ensure_response_immediate_stream()
278
+ await self._ensure_all_streams()
277
279
  self.kv = await self._get_or_create_kv_bucket()
278
280
  self._is_connected = True
279
281
  logger.info("Connected to NATS servers: %s", self.servers)
@@ -299,32 +301,16 @@ class MachineClient:
299
301
 
300
302
  if self.nc:
301
303
  self.js = self.nc.jetstream()
302
- await self._ensure_command_queue_stream()
303
- await self._ensure_command_immediate_stream()
304
- await self._ensure_response_queue_stream()
305
- await self._ensure_response_immediate_stream()
304
+ await self._ensure_all_streams()
306
305
  self.kv = await self._get_or_create_kv_bucket()
307
306
  await self._resubscribe_handlers()
308
307
 
309
308
  async def _resubscribe_handlers(self):
310
309
  """Re-subscribe to all handlers after reconnection."""
311
- subscribe_methods = {
312
- 'queue': self.subscribe_queue,
313
- 'immediate': self.subscribe_immediate,
314
- }
315
-
316
- for handler_info in self._reconnect_handlers:
317
- try:
318
- handler_type = handler_info['type']
319
- handler = handler_info['handler']
320
- subscribe_method = subscribe_methods.get(handler_type)
321
-
322
- if subscribe_method:
323
- await subscribe_method(handler)
324
- else:
325
- logger.warning("Unknown handler type: %s", handler_type)
326
- except Exception as e:
327
- logger.error("Failed to re-subscribe %s: %s", handler_type, e)
310
+ if self._queue_handler:
311
+ await self.subscribe_queue(self._queue_handler)
312
+ if self._immediate_handler:
313
+ await self.subscribe_immediate(self._immediate_handler)
328
314
 
329
315
  async def _closed_callback(self):
330
316
  """Callback when connection is closed."""
@@ -439,7 +425,7 @@ class MachineClient:
439
425
  async def process_queue_cmd(
440
426
  self,
441
427
  msg: Msg,
442
- handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
428
+ handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
443
429
  ) -> None:
444
430
  """
445
431
  Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
@@ -661,9 +647,54 @@ class MachineClient:
661
647
  )
662
648
  await self.publish_state({'state': 'error', 'run_id': None})
663
649
 
650
+ async def _verify_or_recreate_consumer(self, durable_name: str):
651
+ """
652
+ Check if consumer exists and verify/update its configuration.
653
+ Deletes and recreates the consumer if configuration doesn't match.
654
+
655
+ Args:
656
+ durable_name: Name of the durable consumer to verify
657
+ """
658
+ # Check if consumer exists and verify/update its configuration
659
+ try:
660
+ consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
661
+ logger.debug("Durable consumer %s already exists", durable_name)
662
+
663
+ # Check if consumer config matches what we need
664
+ config = consumer_info.config
665
+ needs_recreate = False
666
+ if getattr(config, 'filter_subject', None) != self.cmd_queue:
667
+ logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
668
+ self.cmd_queue, getattr(config, 'filter_subject', None))
669
+ needs_recreate = True
670
+ if getattr(config, 'ack_policy', None) != 'explicit':
671
+ logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
672
+ getattr(config, 'ack_policy', None))
673
+ needs_recreate = True
674
+ if getattr(config, 'deliver_policy', None) != 'all':
675
+ logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
676
+ getattr(config, 'deliver_policy', None))
677
+ needs_recreate = True
678
+
679
+ if needs_recreate:
680
+ # Consumer exists but config doesn't match - delete and recreate
681
+ logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
682
+ try:
683
+ await self.js.delete_consumer(self.STREAM_COMMAND_QUEUE, durable_name)
684
+ except Exception as e:
685
+ logger.warning("Error deleting consumer: %s", e)
686
+ else:
687
+ # Log consumer state for diagnostics
688
+ logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
689
+ consumer_info.num_pending, consumer_info.delivered.consumer_seq,
690
+ consumer_info.num_ack_pending)
691
+ except NotFoundError:
692
+ # Consumer doesn't exist, will be created by pull_subscribe
693
+ logger.debug("Durable consumer %s does not exist, will be created", durable_name)
694
+
664
695
  async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
665
696
  """
666
- Subscribe to queue commands with default consumer.
697
+ Subscribe to queue commands with pull consumer.
667
698
 
668
699
  Args:
669
700
  handler: Async function that processes command payloads and returns CommandResponse
@@ -673,19 +704,65 @@ class MachineClient:
673
704
  return
674
705
 
675
706
  # Ensure stream exists before attempting to subscribe
676
- await self._ensure_command_queue_stream()
707
+ await self._ensure_all_streams()
677
708
 
678
709
  try:
679
- async def message_handler(msg: Msg):
680
- """Wrapper to process queue messages."""
681
- await self.process_queue_cmd(msg, handler)
682
-
683
- self._cmd_queue_sub = await self.js.subscribe(
710
+ durable_name = f"cmd_queue_{self.machine_id}"
711
+
712
+ await self._verify_or_recreate_consumer(durable_name)
713
+
714
+ # Create pull subscription - this will create the consumer if it doesn't exist
715
+ # Pass config directly to ensure correct consumer configuration
716
+ consumer_config = ConsumerConfig(
717
+ durable_name=durable_name,
718
+ filter_subject=self.cmd_queue,
719
+ ack_policy="explicit",
720
+ deliver_policy="all", # Required for WorkQueue: deliver all messages from the beginning
721
+ )
722
+
723
+ self._cmd_queue_sub = await self.js.pull_subscribe(
684
724
  subject=self.cmd_queue,
725
+ durable=durable_name,
685
726
  stream=self.STREAM_COMMAND_QUEUE,
686
- durable=f"cmd_queue_{self.machine_id}",
687
- cb=message_handler
727
+ config=consumer_config
688
728
  )
729
+
730
+ # Log final consumer info for diagnostics
731
+ try:
732
+ consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
733
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
734
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
735
+ consumer_info.num_pending, consumer_info.num_ack_pending)
736
+ except Exception as e:
737
+ logger.warning("Could not get consumer info after subscription: %s", e)
738
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
739
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
740
+
741
+ # Start background task to pull and process messages
742
+ async def pull_messages():
743
+ """Continuously pull messages from the queue."""
744
+ try:
745
+ while True:
746
+ try:
747
+ # Fetch messages (batch of 1, timeout 1 second)
748
+ msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
749
+ if msgs:
750
+ logger.debug("Pulled %d message(s) from queue", len(msgs))
751
+ for msg in msgs:
752
+ await self.process_queue_cmd(msg, handler)
753
+ except asyncio.TimeoutError:
754
+ # Timeout is expected when no messages are available
755
+ continue
756
+ except Exception as e:
757
+ logger.error("Error pulling queue messages: %s", e, exc_info=True)
758
+ await asyncio.sleep(1) # Wait before retrying
759
+ except asyncio.CancelledError:
760
+ logger.debug("Queue pull task cancelled")
761
+ raise
762
+
763
+ self._cmd_queue_task = asyncio.create_task(pull_messages())
764
+ logger.info("Started background task for pulling queue messages")
765
+
689
766
  except NotFoundError:
690
767
  # Stream still not found after ensuring it exists - this shouldn't happen
691
768
  # but handle it gracefully with detailed diagnostics
@@ -703,10 +780,9 @@ class MachineClient:
703
780
  logger.error(" Stream verification failed: %s", stream_check_error)
704
781
  raise
705
782
 
706
- # Register handler for reconnection
707
- if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
708
- self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
709
- logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
783
+ # Store handler for reconnection
784
+ self._queue_handler = handler
785
+ logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
710
786
  self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
711
787
 
712
788
  async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
@@ -720,19 +796,26 @@ class MachineClient:
720
796
  logger.error("JetStream not available for immediate subscription")
721
797
  return
722
798
 
799
+ # Store handler for use in callback and reconnection
800
+ self._immediate_handler = handler
801
+
723
802
  async def message_handler(msg: Msg):
724
- """Wrapper to process immediate messages."""
725
- await self.process_immediate_cmd(msg, handler)
803
+ """Process immediate messages using stored handler."""
804
+ await self.process_immediate_cmd(msg, self._immediate_handler)
726
805
 
727
806
  # Ensure stream exists before attempting to subscribe
728
- await self._ensure_command_immediate_stream()
807
+ await self._ensure_stream(
808
+ self.STREAM_COMMAND_IMMEDIATE,
809
+ f"{self.NAMESPACE}.*.cmd.immediate",
810
+ retention='workqueue'
811
+ )
729
812
 
730
813
  try:
731
814
  self._cmd_immediate_sub = await self.js.subscribe(
732
815
  subject=self.cmd_immediate,
733
816
  stream=self.STREAM_COMMAND_IMMEDIATE,
734
817
  durable=f"cmd_immed_{self.machine_id}",
735
- cb=message_handler
818
+ cb=message_handler # required for push consumer to handle messages
736
819
  )
737
820
  except NotFoundError:
738
821
  # Stream still not found after ensuring it exists - this shouldn't happen
@@ -741,9 +824,6 @@ class MachineClient:
741
824
  self.STREAM_COMMAND_IMMEDIATE)
742
825
  raise
743
826
 
744
- # Register handler for reconnection
745
- if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
746
- self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
747
827
  logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
748
828
  self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
749
829
 
@@ -68,18 +68,19 @@ class CommandResponse(BaseModel):
68
68
 
69
69
  class MessageHeader(BaseModel):
70
70
  """Header for NATS messages."""
71
- message_type: MessageType = Field(description="Type of message")
72
71
  version: str = Field(default="1.0", description="Message version")
73
- timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
72
+ message_type: MessageType = Field(description="Type of message")
73
+ user_id: str = Field(description="User ID")
74
+ username: str = Field(description="User name")
74
75
  machine_id: str = Field(description="Machine ID")
75
76
  run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
76
-
77
+ timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
77
78
  class NATSMessage(BaseModel):
78
79
  """
79
80
  Complete NATS message structure.
80
81
 
81
82
  Structure:
82
- - header: MessageHeader with message_type, version, timestamp, machine_id, run_id
83
+ - header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
83
84
  - command: Optional CommandRequest (for command messages)
84
85
  - response: Optional CommandResponse data (for response messages)
85
86
  """