@agentunion/kite 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -263,10 +263,19 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
263
263
 
264
264
  # Global WS reference for publish_event callback
265
265
  _ws_global = None
266
+ _shutting_down = False
267
+
268
+
269
def _is_auth_failure(e: Exception) -> bool:
    """Tell whether a WebSocket exception indicates authentication failure.

    Returns True only when the exception carries a non-None ``rcvd``
    attribute (presumably the close frame received from the peer) whose
    ``code`` is 4001 or 4003. A missing ``code`` attribute is treated as 0.
    """
    close_frame = getattr(e, "rcvd", None)
    if close_frame is None:
        # No close frame received from the peer: not an auth rejection.
        return False
    return getattr(close_frame, "code", 0) in (4001, 4003)
266
275
 
267
276
 
268
277
  async def main():
269
- global _ws_global
278
+ global _ws_global, _shutting_down
270
279
  # Initialize log file paths
271
280
  global _log_dir, _log_latest_path, _crash_log_path
272
281
  module_data = os.environ.get("KITE_MODULE_DATA")
@@ -318,41 +327,84 @@ async def main():
318
327
 
319
328
  print(f"[model_service] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
320
329
 
321
- # Connect to Kernel WebSocket
330
+ # Start reconnect loop
331
+ await _ws_loop(token, kernel_port, _t0)
332
+
333
+
334
async def _ws_loop(token: str, kernel_port: int, _t0: float):
    """Run Kernel WebSocket sessions until shutdown, reconnecting with backoff.

    Each iteration runs one ``_ws_connect`` session. When a session ends, the
    global socket reference is cleared and, unless shutdown was requested, the
    loop pauses before reconnecting. Failed sessions back off exponentially
    from 0.3s up to 5.0s.

    Args:
        token: Kernel token, passed through to ``_ws_connect``.
        kernel_port: Kernel port, passed through to ``_ws_connect``.
        _t0: Start timestamp, passed through to ``_ws_connect``.

    Exits the process with status 1 on an authentication failure, or after
    ``max_retries`` consecutive failed sessions.
    """
    base_delay = 0.3
    max_delay = 5.0
    max_retries = 10
    # A session that survived this long is considered to have been healthy, so
    # its eventual drop starts a fresh failure streak. Without this, drops of
    # long-lived sessions accumulate and the service exits after ten of them.
    stable_after = 30.0

    retry_delay = base_delay
    attempt = 0
    loop = asyncio.get_running_loop()

    while not _shutting_down:
        started = loop.time()
        try:
            await _ws_connect(token, kernel_port, _t0)
        except asyncio.CancelledError:
            # NOTE(review): cancellation is swallowed here, as before.
            return
        except Exception as e:
            if loop.time() - started >= stable_after:
                attempt = 0
                retry_delay = base_delay
            attempt += 1
            if _is_auth_failure(e):
                # Retrying with the same token cannot succeed.
                print("[model_service] Kernel 认证失败,退出")
                sys.exit(1)
            if attempt >= max_retries:
                print(f"[model_service] 重连失败 {max_retries} 次,退出")
                sys.exit(1)
            _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
            print(f"[model_service] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
            delay = retry_delay
            retry_delay = min(retry_delay * 2, max_delay)
        else:
            # Session ended without an exception: start over from the base delay.
            attempt = 0
            retry_delay = base_delay
            delay = base_delay

        # The finished session's socket must not be reused by publishers.
        _ws_global_clear()
        if _shutting_down:
            return
        # Pause after every session end so a peer that keeps closing the
        # connection cannot drive a zero-delay reconnect loop.
        await asyncio.sleep(delay)
363
+
364
+
365
def _ws_global_clear():
    """Reset the module-level ``_ws_global`` WebSocket reference to None."""
    global _ws_global
    _ws_global = None
368
+
369
+
370
+ async def _ws_connect(token: str, kernel_port: int, _t0: float):
371
+ """Single WebSocket session: connect → subscribe → register → ready → receive loop."""
372
+ global _ws_global
373
+
322
374
  ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=model_service"
323
375
  print(f"[model_service] Connecting to Kernel: {ws_url}")
324
376
 
325
- try:
326
- async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
327
- _ws_global = ws
328
- print(f"[model_service] Connected to Kernel ({_fmt_elapsed(_t0)})")
329
-
330
- # Subscribe to events
331
- await _rpc_call(ws, "event.subscribe", {
332
- "events": [
333
- "module.started",
334
- "module.stopped",
335
- "module.shutdown",
336
- ],
337
- })
338
- print(f"[model_service] Subscribed to events ({_fmt_elapsed(_t0)})")
339
-
340
- # Register to Kernel Registry via RPC
341
- await _rpc_call(ws, "registry.register", {
342
- "module_id": "model_service",
343
- "module_type": "service",
344
- "events_publish": {
345
- "model_service.test": {"description": "Test event from model_service module"},
346
- },
347
- "events_subscribe": [
348
- "module.started",
349
- "module.stopped",
350
- "module.shutdown",
351
- ],
352
- })
353
- print(f"[model_service] Registered to Kernel ({_fmt_elapsed(_t0)})")
377
+ async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
378
+ _ws_global = ws
379
+ print(f"[model_service] Connected to Kernel ({_fmt_elapsed(_t0)})")
380
+
381
+ # Subscribe to events
382
+ await _rpc_call(ws, "event.subscribe", {
383
+ "events": [
384
+ "module.started",
385
+ "module.stopped",
386
+ "module.shutdown",
387
+ ],
388
+ })
389
+ print(f"[model_service] Subscribed to events ({_fmt_elapsed(_t0)})")
390
+
391
+ # Register to Kernel Registry via RPC
392
+ await _rpc_call(ws, "registry.register", {
393
+ "module_id": "model_service",
394
+ "module_type": "service",
395
+ "events_publish": {
396
+ "model_service.test": {"description": "Test event from model_service module"},
397
+ },
398
+ "events_subscribe": [
399
+ "module.started",
400
+ "module.stopped",
401
+ "module.shutdown",
402
+ ],
403
+ })
404
+ print(f"[model_service] Registered to Kernel ({_fmt_elapsed(_t0)})")
354
405
 
355
- # Publish module.ready
406
+ # Publish module.ready (every reconnect)
407
+ if not _shutting_down:
356
408
  await _rpc_call(ws, "event.publish", {
357
409
  "event_id": str(uuid.uuid4()),
358
410
  "event": "module.ready",
@@ -363,34 +415,29 @@ async def main():
363
415
  })
364
416
  print(f"[model_service] module.ready published ({_fmt_elapsed(_t0)})")
365
417
 
366
- # Start test event loop in background
367
- test_task = asyncio.create_task(_test_event_loop(ws))
368
-
369
- # Message loop: handle incoming RPC + events
370
- async for raw in ws:
371
- try:
372
- msg = json.loads(raw)
373
- except (json.JSONDecodeError, TypeError):
374
- continue
375
-
376
- try:
377
- has_method = "method" in msg
378
- has_id = "id" in msg
379
-
380
- if has_method and not has_id:
381
- # Event Notification
382
- await _handle_event_notification(msg)
383
- elif has_method and has_id:
384
- # Incoming RPC request
385
- await _handle_rpc_request(ws, msg)
386
- # Ignore RPC responses (we don't await them in this simple impl)
387
- except Exception as e:
388
- print(f"[model_service] 消息处理异常(已忽略): {e}")
389
-
390
- except Exception as e:
391
- _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
392
- _print_crash_summary(type(e), e.__traceback__)
393
- sys.exit(1)
418
+ # Start test event loop in background
419
+ test_task = asyncio.create_task(_test_event_loop(ws))
420
+
421
+ # Message loop: handle incoming RPC + events
422
+ async for raw in ws:
423
+ try:
424
+ msg = json.loads(raw)
425
+ except (json.JSONDecodeError, TypeError):
426
+ continue
427
+
428
+ try:
429
+ has_method = "method" in msg
430
+ has_id = "id" in msg
431
+
432
+ if has_method and not has_id:
433
+ # Event Notification
434
+ await _handle_event_notification(msg)
435
+ elif has_method and has_id:
436
+ # Incoming RPC request
437
+ await _handle_rpc_request(ws, msg)
438
+ # Ignore RPC responses (we don't await them in this simple impl)
439
+ except Exception as e:
440
+ print(f"[model_service] 消息处理异常(已忽略): {e}")
394
441
 
395
442
 
396
443
  async def _rpc_call(ws, method: str, params: dict = None):
@@ -416,10 +463,14 @@ async def _handle_event_notification(msg: dict):
416
463
  event_type = params.get("event", "")
417
464
  data = params.get("data", {})
418
465
 
419
- # Special handling for module.shutdown targeting model_service
420
- if event_type == "module.shutdown" and data.get("module_id") == "model_service":
421
- await _handle_shutdown()
422
- return
466
+ # Special handling for module.shutdown
467
+ if event_type == "module.shutdown":
468
+ target = data.get("module_id", "")
469
+ reason = data.get("reason", "")
470
+ # Handle both targeted shutdown (module_id == "model_service") and broadcast shutdown (no module_id or launcher_lost)
471
+ if target == "model_service" or not target or reason == "launcher_lost":
472
+ await _handle_shutdown()
473
+ return
423
474
 
424
475
  # Log other events
425
476
  print(f"[model_service] Event received: {event_type}")
@@ -472,8 +523,15 @@ async def _rpc_status() -> dict:
472
523
 
473
524
 
474
525
  async def _handle_shutdown():
475
- """Handle module.shutdown event — ack, cleanup, ready, exit."""
526
+ """Handle module.shutdown event — exiting → ack cleanup ready exit."""
527
+ global _shutting_down
476
528
  print("[model_service] Received shutdown request")
529
+ _shutting_down = True
530
+ # Step 0: Send module.exiting
531
+ await _publish_event(_ws_global, {
532
+ "event": "module.exiting",
533
+ "data": {"module_id": "model_service", "action": "none"},
534
+ })
477
535
  # Step 1: Send ack
478
536
  await _publish_event(_ws_global, {
479
537
  "event": "module.shutdown.ack",
@@ -265,11 +265,25 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
265
265
 
266
266
  # Global WS reference for publish_event callback
267
267
  _ws_global = None
268
+ _shutting_down = False
269
+ _monitor = None
270
+ _monitor_task = None
268
271
 
272
+ # RPC request-response infrastructure
273
+ _rpc_waiters: dict[str, asyncio.Event] = {} # rpc_id -> Event
274
+ _rpc_results: dict[str, dict] = {} # rpc_id -> response dict
275
+
276
+
277
def _is_auth_failure(e: Exception) -> bool:
    """Tell whether a WebSocket exception indicates authentication failure.

    Returns True only when the exception carries a non-None ``rcvd``
    attribute (presumably the close frame received from the peer) whose
    ``code`` is 4001 or 4003. A missing ``code`` attribute is treated as 0.
    """
    close_frame = getattr(e, "rcvd", None)
    if close_frame is None:
        # No close frame received from the peer: not an auth rejection.
        return False
    return getattr(close_frame, "code", 0) in (4001, 4003)
269
283
 
270
284
 
271
285
  async def main():
272
- global _ws_global
286
+ global _ws_global, _shutting_down, _monitor
273
287
  # Initialize log file paths
274
288
  global _log_dir, _log_latest_path, _crash_log_path
275
289
  module_data = os.environ.get("KITE_MODULE_DATA")
@@ -321,57 +335,104 @@ async def main():
321
335
 
322
336
  print(f"[watchdog] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
323
337
 
324
- # Connect to Kernel WebSocket
325
- ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
326
- print(f"[watchdog] Connecting to Kernel: {ws_url}")
338
+ # Create monitor (once, persists across reconnects)
339
+ _monitor = HealthMonitor(
340
+ own_token=token,
341
+ kernel_port=kernel_port,
342
+ )
327
343
 
328
- try:
329
- async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
330
- _ws_global = ws
331
- print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
332
-
333
- # Subscribe to events
334
- await _rpc_call(ws, "event.subscribe", {
335
- "events": [
336
- "system.ready",
337
- "module.started",
338
- "module.stopped",
339
- "module.exiting",
340
- "module.ready",
341
- "module.shutdown",
342
- ],
343
- })
344
- print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")
345
-
346
- # Register to Kernel Registry via RPC
347
- await _rpc_call(ws, "registry.register", {
348
- "module_id": "watchdog",
349
- "module_type": "service",
350
- "events_publish": {
351
- "watchdog.module.unhealthy": {},
352
- "watchdog.module.recovered": {},
353
- "watchdog.alert": {},
354
- },
355
- "events_subscribe": [
356
- "system.ready",
357
- "module.started",
358
- "module.stopped",
359
- "module.exiting",
360
- "module.ready",
361
- "module.shutdown",
362
- ],
363
- })
364
- print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")
344
+ # Start reconnect loop
345
+ await _ws_loop(token, kernel_port, _t0)
365
346
 
366
- # Create monitor with RPC callback
367
- monitor = HealthMonitor(
368
- own_token=token,
369
- kernel_port=kernel_port,
370
- )
371
- monitor.publish_event = lambda event: asyncio.create_task(_publish_event(ws, event))
372
- monitor.rpc_call = lambda method, params: _rpc_call(ws, method, params)
373
347
 
374
- # Publish module.ready
348
async def _ws_loop(token: str, kernel_port: int, _t0: float):
    """Run Kernel WebSocket sessions until shutdown, reconnecting with backoff.

    Each iteration runs one ``_ws_connect`` session. When a session ends, the
    global socket reference is cleared and, unless shutdown was requested, the
    loop pauses before reconnecting. Failed sessions back off exponentially
    from 0.3s up to 5.0s.

    Args:
        token: Kernel token, passed through to ``_ws_connect``.
        kernel_port: Kernel port, passed through to ``_ws_connect``.
        _t0: Start timestamp, passed through to ``_ws_connect``.

    Exits the process with status 1 on an authentication failure, or after
    ``max_retries`` consecutive failed sessions.
    """
    base_delay = 0.3
    max_delay = 5.0
    max_retries = 10
    # A session that survived this long is considered to have been healthy, so
    # its eventual drop starts a fresh failure streak. Without this, drops of
    # long-lived sessions accumulate and the service exits after ten of them.
    stable_after = 30.0

    retry_delay = base_delay
    attempt = 0
    loop = asyncio.get_running_loop()

    while not _shutting_down:
        started = loop.time()
        try:
            await _ws_connect(token, kernel_port, _t0)
        except asyncio.CancelledError:
            # NOTE(review): cancellation is swallowed here, as before.
            return
        except Exception as e:
            if loop.time() - started >= stable_after:
                attempt = 0
                retry_delay = base_delay
            attempt += 1
            if _is_auth_failure(e):
                # Retrying with the same token cannot succeed.
                print("[watchdog] Kernel 认证失败,退出")
                sys.exit(1)
            if attempt >= max_retries:
                print(f"[watchdog] 重连失败 {max_retries} 次,退出")
                sys.exit(1)
            _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
            print(f"[watchdog] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
            delay = retry_delay
            retry_delay = min(retry_delay * 2, max_delay)
        else:
            # Session ended without an exception: start over from the base delay.
            attempt = 0
            retry_delay = base_delay
            delay = base_delay

        # The finished session's socket must not be reused by publishers.
        _ws_global_clear()
        if _shutting_down:
            return
        # Pause after every session end so a peer that keeps closing the
        # connection cannot drive a zero-delay reconnect loop.
        await asyncio.sleep(delay)
377
+
378
+
379
def _ws_global_clear():
    """Reset the module-level ``_ws_global`` WebSocket reference to None."""
    global _ws_global
    _ws_global = None
382
+
383
+
384
+ async def _ws_connect(token: str, kernel_port: int, _t0: float):
385
+ """Single WebSocket session: connect → subscribe → register → ready → receive loop."""
386
+ global _ws_global, _monitor, _monitor_task
387
+
388
+ ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
389
+ print(f"[watchdog] Connecting to Kernel: {ws_url}")
390
+
391
+ async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
392
+ _ws_global = ws
393
+ print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
394
+
395
+ # Subscribe to events
396
+ await _rpc_call(ws, "event.subscribe", {
397
+ "events": [
398
+ "system.ready",
399
+ "module.started",
400
+ "module.stopped",
401
+ "module.exiting",
402
+ "module.ready",
403
+ "module.shutdown",
404
+ "module.offline",
405
+ ],
406
+ })
407
+ print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")
408
+
409
+ # Register to Kernel Registry via RPC
410
+ await _rpc_call(ws, "registry.register", {
411
+ "module_id": "watchdog",
412
+ "module_type": "service",
413
+ "events_publish": {
414
+ "watchdog.module.unhealthy": {},
415
+ "watchdog.module.recovered": {},
416
+ "watchdog.alert": {},
417
+ },
418
+ "events_subscribe": [
419
+ "system.ready",
420
+ "module.started",
421
+ "module.stopped",
422
+ "module.exiting",
423
+ "module.ready",
424
+ "module.shutdown",
425
+ "module.offline",
426
+ ],
427
+ })
428
+ print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")
429
+
430
+ # Set up monitor callbacks (reconnect-safe)
431
+ _monitor.publish_event = lambda event: asyncio.create_task(_publish_event(ws, event))
432
+ _monitor.rpc_call = lambda method, params: _rpc_call_with_response(ws, method, params)
433
+
434
+ # Publish module.ready (every reconnect)
435
+ if not _shutting_down:
375
436
  await _rpc_call(ws, "event.publish", {
376
437
  "event_id": str(uuid.uuid4()),
377
438
  "event": "module.ready",
@@ -382,34 +443,35 @@ async def main():
382
443
  })
383
444
  print(f"[watchdog] module.ready published ({_fmt_elapsed(_t0)})")
384
445
 
385
- # Start monitor loop in background
386
- monitor_task = asyncio.create_task(monitor.run())
387
-
388
- # Message loop: handle incoming RPC + events
389
- async for raw in ws:
390
- try:
391
- msg = json.loads(raw)
392
- except (json.JSONDecodeError, TypeError):
393
- continue
394
-
395
- try:
396
- has_method = "method" in msg
397
- has_id = "id" in msg
398
-
399
- if has_method and not has_id:
400
- # Event Notification
401
- await _handle_event_notification(msg, monitor)
402
- elif has_method and has_id:
403
- # Incoming RPC request
404
- await _handle_rpc_request(ws, msg, monitor)
405
- # Ignore RPC responses (we don't await them in this simple impl)
406
- except Exception as e:
407
- print(f"[watchdog] 消息处理异常(已忽略): {e}")
408
-
409
- except Exception as e:
410
- _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
411
- _print_crash_summary(type(e), e.__traceback__)
412
- sys.exit(1)
446
+ # Start monitor loop if not already running
447
+ if _monitor_task is None or _monitor_task.done():
448
+ _monitor_task = asyncio.create_task(_monitor.run())
449
+
450
+ # Message loop: handle incoming RPC + events
451
+ async for raw in ws:
452
+ try:
453
+ msg = json.loads(raw)
454
+ except (json.JSONDecodeError, TypeError):
455
+ continue
456
+
457
+ try:
458
+ has_method = "method" in msg
459
+ has_id = "id" in msg
460
+
461
+ if has_method and not has_id:
462
+ # Event Notification
463
+ await _handle_event_notification(msg, _monitor)
464
+ elif has_method and has_id:
465
+ # Incoming RPC request
466
+ await _handle_rpc_request(ws, msg, _monitor)
467
+ elif has_id and not has_method:
468
+ # RPC response — route to waiter
469
+ msg_id = msg["id"]
470
+ if msg_id in _rpc_waiters:
471
+ _rpc_results[msg_id] = msg
472
+ _rpc_waiters[msg_id].set()
473
+ except Exception as e:
474
+ print(f"[watchdog] 消息处理异常(已忽略): {e}")
413
475
 
414
476
 
415
477
 
@@ -421,6 +483,28 @@ async def _rpc_call(ws, method: str, params: dict = None):
421
483
  await ws.send(json.dumps(msg))
422
484
 
423
485
 
486
async def _rpc_call_with_response(ws, method: str, params: dict = None, timeout: float = 5) -> dict:
    """Send a JSON-RPC 2.0 request and wait for the matching response.

    A fresh UUID request id is registered in ``_rpc_waiters`` before sending.
    The receive loop is expected to store the response under that id in
    ``_rpc_results`` and set the event.

    Args:
        ws: Connection object exposing an awaitable ``send(str)``.
        method: JSON-RPC method name.
        params: Optional parameters; omitted from the request when falsy.
        timeout: Seconds to wait for the response.

    Returns:
        The response dict, or a JSON-RPC style error dict with code -32000
        when no response arrives within ``timeout``.

    Raises:
        Whatever ``ws.send`` raises. The waiter entry is removed in that case
        too, so a failed send no longer leaks an entry in ``_rpc_waiters``.
    """
    rpc_id = str(uuid.uuid4())
    msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
    if params:
        msg["params"] = params

    evt = asyncio.Event()
    # Register before sending so a fast response cannot arrive unobserved.
    _rpc_waiters[rpc_id] = evt

    try:
        await ws.send(json.dumps(msg))
        try:
            await asyncio.wait_for(evt.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            return {"error": {"code": -32000, "message": f"RPC timeout: {method}"}}
        return _rpc_results.pop(rpc_id, {})
    finally:
        # Always drop bookkeeping for this id, including on send failure.
        _rpc_waiters.pop(rpc_id, None)
        _rpc_results.pop(rpc_id, None)
506
+
507
+
424
508
  async def _publish_event(ws, event: dict):
425
509
  """Publish an event via RPC event.publish."""
426
510
  await _rpc_call(ws, "event.publish", {
@@ -441,8 +525,8 @@ async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
441
525
  await _handle_shutdown(monitor)
442
526
  return
443
527
 
444
- # Forward to monitor
445
- await monitor.handle_event(msg)
528
+ # Forward to monitor (extract params from JSON-RPC notification)
529
+ await monitor.handle_event(params)
446
530
 
447
531
 
448
532
  async def _handle_rpc_request(ws, msg: dict, monitor: HealthMonitor):
@@ -489,8 +573,15 @@ async def _rpc_status(monitor: HealthMonitor) -> dict:
489
573
 
490
574
 
491
575
  async def _handle_shutdown(monitor: HealthMonitor):
492
- """Handle module.shutdown event — ack, cleanup, ready, exit."""
576
+ """Handle module.shutdown event — exiting → ack cleanup ready exit."""
577
+ global _shutting_down
493
578
  print("[watchdog] Received shutdown request")
579
+ _shutting_down = True
580
+ # Step 0: Send module.exiting
581
+ await _publish_event(_ws_global, {
582
+ "event": "module.exiting",
583
+ "data": {"module_id": "watchdog", "action": "none"},
584
+ })
494
585
  # Step 1: Send ack
495
586
  await _publish_event(_ws_global, {
496
587
  "event": "module.shutdown.ack",