agentscope-runtime 1.0.5__py3-none-any.whl → 1.1.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. agentscope_runtime/__init__.py +3 -0
  2. agentscope_runtime/adapters/agentscope/message.py +36 -295
  3. agentscope_runtime/adapters/agentscope/stream.py +89 -2
  4. agentscope_runtime/adapters/agno/message.py +11 -2
  5. agentscope_runtime/adapters/agno/stream.py +1 -0
  6. agentscope_runtime/adapters/langgraph/__init__.py +1 -3
  7. agentscope_runtime/adapters/langgraph/message.py +11 -106
  8. agentscope_runtime/adapters/langgraph/stream.py +1 -0
  9. agentscope_runtime/adapters/ms_agent_framework/message.py +11 -1
  10. agentscope_runtime/adapters/ms_agent_framework/stream.py +1 -0
  11. agentscope_runtime/adapters/text/stream.py +1 -0
  12. agentscope_runtime/common/container_clients/agentrun_client.py +0 -3
  13. agentscope_runtime/common/container_clients/boxlite_client.py +26 -15
  14. agentscope_runtime/common/container_clients/fc_client.py +0 -11
  15. agentscope_runtime/common/utils/deprecation.py +14 -17
  16. agentscope_runtime/common/utils/logging.py +44 -0
  17. agentscope_runtime/engine/app/agent_app.py +5 -5
  18. agentscope_runtime/engine/app/celery_mixin.py +43 -4
  19. agentscope_runtime/engine/deployers/adapter/agui/__init__.py +8 -1
  20. agentscope_runtime/engine/deployers/adapter/agui/agui_adapter_utils.py +6 -1
  21. agentscope_runtime/engine/deployers/adapter/agui/agui_protocol_adapter.py +2 -2
  22. agentscope_runtime/engine/deployers/utils/service_utils/fastapi_factory.py +13 -0
  23. agentscope_runtime/engine/runner.py +31 -6
  24. agentscope_runtime/engine/schemas/agent_schemas.py +28 -0
  25. agentscope_runtime/engine/services/sandbox/sandbox_service.py +41 -9
  26. agentscope_runtime/sandbox/box/base/base_sandbox.py +4 -0
  27. agentscope_runtime/sandbox/box/browser/browser_sandbox.py +4 -0
  28. agentscope_runtime/sandbox/box/dummy/dummy_sandbox.py +9 -2
  29. agentscope_runtime/sandbox/box/filesystem/filesystem_sandbox.py +4 -0
  30. agentscope_runtime/sandbox/box/gui/gui_sandbox.py +5 -1
  31. agentscope_runtime/sandbox/box/mobile/mobile_sandbox.py +4 -0
  32. agentscope_runtime/sandbox/box/sandbox.py +122 -13
  33. agentscope_runtime/sandbox/client/async_http_client.py +1 -0
  34. agentscope_runtime/sandbox/client/base.py +0 -1
  35. agentscope_runtime/sandbox/client/http_client.py +0 -2
  36. agentscope_runtime/sandbox/manager/heartbeat_mixin.py +486 -0
  37. agentscope_runtime/sandbox/manager/sandbox_manager.py +740 -153
  38. agentscope_runtime/sandbox/manager/server/app.py +18 -11
  39. agentscope_runtime/sandbox/manager/server/config.py +10 -2
  40. agentscope_runtime/sandbox/mcp_server.py +0 -1
  41. agentscope_runtime/sandbox/model/__init__.py +2 -1
  42. agentscope_runtime/sandbox/model/container.py +90 -3
  43. agentscope_runtime/sandbox/model/manager_config.py +45 -1
  44. agentscope_runtime/version.py +1 -1
  45. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/METADATA +36 -54
  46. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/RECORD +50 -69
  47. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/WHEEL +1 -1
  48. agentscope_runtime/adapters/agentscope/long_term_memory/__init__.py +0 -6
  49. agentscope_runtime/adapters/agentscope/long_term_memory/_long_term_memory_adapter.py +0 -258
  50. agentscope_runtime/adapters/agentscope/memory/__init__.py +0 -6
  51. agentscope_runtime/adapters/agentscope/memory/_memory_adapter.py +0 -152
  52. agentscope_runtime/engine/services/agent_state/__init__.py +0 -25
  53. agentscope_runtime/engine/services/agent_state/redis_state_service.py +0 -166
  54. agentscope_runtime/engine/services/agent_state/state_service.py +0 -179
  55. agentscope_runtime/engine/services/agent_state/state_service_factory.py +0 -52
  56. agentscope_runtime/engine/services/memory/__init__.py +0 -33
  57. agentscope_runtime/engine/services/memory/mem0_memory_service.py +0 -128
  58. agentscope_runtime/engine/services/memory/memory_service.py +0 -292
  59. agentscope_runtime/engine/services/memory/memory_service_factory.py +0 -126
  60. agentscope_runtime/engine/services/memory/redis_memory_service.py +0 -290
  61. agentscope_runtime/engine/services/memory/reme_personal_memory_service.py +0 -109
  62. agentscope_runtime/engine/services/memory/reme_task_memory_service.py +0 -11
  63. agentscope_runtime/engine/services/memory/tablestore_memory_service.py +0 -301
  64. agentscope_runtime/engine/services/session_history/__init__.py +0 -32
  65. agentscope_runtime/engine/services/session_history/redis_session_history_service.py +0 -283
  66. agentscope_runtime/engine/services/session_history/session_history_service.py +0 -267
  67. agentscope_runtime/engine/services/session_history/session_history_service_factory.py +0 -73
  68. agentscope_runtime/engine/services/session_history/tablestore_session_history_service.py +0 -288
  69. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/entry_points.txt +0 -0
  70. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/licenses/LICENSE +0 -0
  71. {agentscope_runtime-1.0.5.dist-info → agentscope_runtime-1.1.0b2.dist-info}/top_level.txt +0 -0
agentscope_runtime/sandbox/manager/sandbox_manager.py
@@ -2,10 +2,12 @@
 # pylint: disable=redefined-outer-name, protected-access
 # pylint: disable=too-many-branches, too-many-statements
 # pylint: disable=redefined-outer-name, protected-access, too-many-branches
-# pylint: disable=too-many-public-methods
+# pylint: disable=too-many-public-methods, unused-argument
 import asyncio
 import inspect
 import json
+import time
+import threading
 import logging
 import os
 import secrets
@@ -17,6 +19,8 @@ import requests
 import shortuuid
 import httpx
 
+from .heartbeat_mixin import HeartbeatMixin, touch_session
+from ..constant import TIMEOUT
 from ..client import (
     SandboxHttpClient,
     TrainingSandboxClient,
@@ -29,6 +33,7 @@ from ..manager.storage import (
 )
 from ..model import (
     ContainerModel,
+    ContainerState,
     SandboxManagerEnvConfig,
 )
 from ..registry import SandboxRegistry
@@ -39,9 +44,7 @@ from ...common.collections import (
     InMemoryQueue,
 )
 from ...common.container_clients import ContainerClientFactory
-from ..constant import TIMEOUT
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -130,7 +133,7 @@ def remote_wrapper_async(
     return decorator
 
 
-class SandboxManager:
+class SandboxManager(HeartbeatMixin):
     def __init__(
         self,
         config: Optional[SandboxManagerEnvConfig] = None,
@@ -192,9 +195,7 @@ class SandboxManager:
         self.prefix = self.config.container_prefix_key
         self.default_mount_dir = self.config.default_mount_dir
         self.readonly_mounts = self.config.readonly_mounts
-        self.storage_folder = (
-            self.config.storage_folder or self.default_mount_dir
-        )
+        self.storage_folder = self.config.storage_folder
 
         self.pool_queues = {}
         if self.config.redis_enabled:
@@ -208,24 +209,26 @@ class SandboxManager:
                 password=self.config.redis_password,
                 decode_responses=True,
             )
+            self.redis_client = redis_client
             try:
-                redis_client.ping()
+                self.redis_client.ping()
            except ConnectionError as e:
                raise RuntimeError(
                    "Unable to connect to the Redis server.",
                ) from e
 
-            self.container_mapping = RedisMapping(redis_client)
+            self.container_mapping = RedisMapping(self.redis_client)
             self.session_mapping = RedisMapping(
-                redis_client,
+                self.redis_client,
                 prefix="session_mapping",
             )
 
             # Init multi sand box pool
             for t in self.default_type:
                 queue_key = f"{self.config.redis_container_pool_key}:{t.value}"
-                self.pool_queues[t] = RedisQueue(redis_client, queue_key)
+                self.pool_queues[t] = RedisQueue(self.redis_client, queue_key)
         else:
+            self.redis_client = None
             self.container_mapping = InMemoryMapping()
             self.session_mapping = InMemoryMapping()
 
@@ -254,8 +257,9 @@ class SandboxManager:
         else:
             self.storage = LocalStorage()
 
-        if self.pool_size > 0:
-            self._init_container_pool()
+        self._watcher_stop_event = threading.Event()
+        self._watcher_thread = None
+        self._watcher_thread_lock = threading.Lock()
 
         logger.debug(str(config))
 
@@ -264,12 +268,18 @@ class SandboxManager:
             "Entering SandboxManager context (sync). "
             "Cleanup will be performed automatically on exit.",
         )
+        # local mode: watcher starts
+        if self.http_session is None:
+            self.start_watcher()
+
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
         logger.debug(
             "Exiting SandboxManager context (sync). Cleaning up resources.",
         )
+        self.stop_watcher()
+
         self.cleanup()
 
         if self.http_session:
@@ -295,12 +305,18 @@ class SandboxManager:
             "Entering SandboxManager context (async). "
             "Cleanup will be performed automatically on async exit.",
         )
+        # local mode: watcher starts
+        if self.http_session is None:
+            self.start_watcher()
+
         return self
 
     async def __aexit__(self, exc_type, exc_value, tb):
         logger.debug(
             "Exiting SandboxManager context (async). Cleaning up resources.",
         )
+        self.stop_watcher()
+
         await self.cleanup_async()
 
         if self.http_session:
@@ -318,6 +334,7 @@ class SandboxManager:
             logger.warning(f"Error closing httpx_client: {e}")
 
     def _generate_container_key(self, session_id):
+        # TODO: refactor this and mapping, use sandbox_id as identity
         return f"{self.prefix}{session_id}"
 
     def _make_request(self, method: str, endpoint: str, data: dict):
@@ -420,68 +437,140 @@ class SandboxManager:
 
         return response.json()
 
-    def _init_container_pool(self):
+    def start_watcher(self) -> bool:
         """
-        Init runtime pool
+        Start background heartbeat scanning thread.
+        Default: not started automatically. Caller must invoke explicitly.
+        If watcher_scan_interval == 0 => disabled, returns False.
         """
-        for t in self.default_type:
-            queue = self.pool_queues[t]
-            while queue.size() < self.pool_size:
-                try:
-                    container_name = self.create(sandbox_type=t.value)
-                    container_model = self.container_mapping.get(
-                        container_name,
-                    )
-                    if container_model:
-                        # Check the pool size again to avoid race condition
-                        if queue.size() < self.pool_size:
-                            queue.enqueue(container_model)
-                        else:
-                            # The pool size has reached the limit
-                            self.release(container_name)
-                            break
-                    else:
-                        logger.error("Failed to create container for pool")
-                        break
-                except Exception as e:
-                    logger.error(f"Error initializing runtime pool: {e}")
-                    break
+        interval = int(self.config.watcher_scan_interval)
+        if interval <= 0:
+            logger.info(
+                "Watcher disabled (watcher_scan_interval <= 0)",
+            )
+            return False
+
+        with self._watcher_thread_lock:
+            if self._watcher_thread and self._watcher_thread.is_alive():
+                return True  # already running
+
+            self._watcher_stop_event.clear()
+
+            def _loop():
+                logger.info(f"Watcher started, interval={interval}s")
+                while not self._watcher_stop_event.is_set():
+                    try:
+                        hb = self.scan_heartbeat_once()
+                        pool = self.scan_pool_once()
+                        gc = self.scan_released_cleanup_once()
+
+                        logger.debug(
+                            "watcher metrics: "
+                            f"heartbeat={hb}, pool={pool}, released_gc={gc}",
+                        )
+                    except Exception as e:
+                        logger.warning(f"Watcher loop error: {e}")
+                        logger.debug(traceback.format_exc())
+
+                    # wait with stop support
+                    self._watcher_stop_event.wait(interval)
+
+                logger.info("Watcher stopped")
+
+            t = threading.Thread(
+                target=_loop,
+                name="watcher",
+                daemon=True,
+            )
+            self._watcher_thread = t
+            t.start()
+            return True
+
+    def stop_watcher(self, join_timeout: float = 5.0) -> None:
+        """
+        Stop background watcher thread (if running).
+        """
+        with self._watcher_thread_lock:
+            self._watcher_stop_event.set()
+            t = self._watcher_thread
+
+        if t and t.is_alive():
+            t.join(timeout=join_timeout)
+
+        with self._watcher_thread_lock:
+            if self._watcher_thread is t:
+                self._watcher_thread = None
 
     @remote_wrapper()
     def cleanup(self):
-        logger.debug(
-            "Cleaning up resources.",
-        )
+        """
+        Destroy all non-terminal containers managed by this SandboxManager.
+
+        Behavior (local mode):
+        - Dequeues and destroys containers from the warm pool (WARM/RUNNING).
+        - Scans container_mapping and destroys any remaining non-terminal
+          containers.
+        - Does NOT delete ContainerModel records from container_mapping;
+          instead it relies on release() to mark them as terminal (RELEASED).
+        - Skips containers already in terminal states: RELEASED / RECYCLED.
+
+        Notes:
+        - Uses container_name as identity to avoid ambiguity with session_id.
+        - Pool containers (WARM) are also destroyed (per current policy).
+        """
+        logger.debug("Cleaning up resources.")
 
-        # Clean up pool first
+        # Clean up pool first (destroy warm/running containers; skip
+        # terminal states)
         for queue in self.pool_queues.values():
             try:
                 while queue.size() > 0:
                     container_json = queue.dequeue()
-                    if container_json:
-                        container_model = ContainerModel(**container_json)
-                        logger.debug(
-                            f"Destroy container"
-                            f" {container_model.container_id}",
-                        )
-                        self.release(container_model.session_id)
+                    if not container_json:
+                        continue
+
+                    container_model = ContainerModel(**container_json)
+
+                    # Terminal states: already cleaned logically
+                    if container_model.state in (
+                        ContainerState.RELEASED,
+                        ContainerState.RECYCLED,
+                    ):
+                        continue
+
+                    logger.debug(
+                        f"Destroy pool container"
+                        f" {container_model.container_id} "
+                        f"({container_model.container_name})",
+                    )
+                    # Use container_name to avoid ambiguity
+                    self.release(container_model.container_name)
             except Exception as e:
                 logger.error(f"Error cleaning up runtime pool: {e}")
 
-        # Clean up rest container
+        # Clean up remaining containers in mapping
         for key in self.container_mapping.scan(self.prefix):
             try:
                 container_json = self.container_mapping.get(key)
-                if container_json:
-                    container_model = ContainerModel(**container_json)
-                    logger.debug(
-                        f"Destroy container {container_model.container_id}",
-                    )
-                    self.release(container_model.session_id)
-            except Exception as e:
-                logger.error(
-                    f"Error cleaning up container {key}: {e}",
+                if not container_json:
+                    continue
+
+                container_model = ContainerModel(**container_json)
+
+                # Terminal states: already cleaned logically
+                if container_model.state in (
+                    ContainerState.RELEASED,
+                    ContainerState.RECYCLED,
+                ):
+                    continue
+
+                logger.debug(
+                    f"Destroy container {container_model.container_id} "
+                    f"({container_model.container_name})",
                 )
+                self.release(container_model.container_name)
+            except Exception as e:
+                logger.error(f"Error cleaning up container {key}: {e}")
 
     @remote_wrapper_async()
     async def cleanup_async(self, *args, **kwargs):
@@ -499,101 +588,104 @@ class SandboxManager:
 
         queue = self.pool_queues[sandbox_type]
 
-        cnt = 0
-        try:
-            while True:
-                if cnt > self.pool_size:
-                    raise RuntimeError(
-                        "No container available in pool after check the pool.",
-                    )
-                cnt += 1
-
-                # Add a new one to container
-                container_name = self.create(sandbox_type=sandbox_type)
-                new_container_model = self.container_mapping.get(
-                    container_name,
-                )
+        def _bind_meta(container_model: ContainerModel):
+            if not meta:
+                return
 
-                if new_container_model:
-                    queue.enqueue(
-                        new_container_model,
-                    )
+            session_ctx_id = meta.get("session_ctx_id")
 
-                container_json = queue.dequeue()
+            container_model.meta = meta
+            container_model.session_ctx_id = session_ctx_id
+            container_model.state = (
+                ContainerState.RUNNING
+                if session_ctx_id
+                else ContainerState.WARM
+            )
+            container_model.recycled_at = None
+            container_model.recycle_reason = None
+            container_model.updated_at = time.time()
 
-                if not container_json:
-                    raise RuntimeError(
-                        "No container available in pool.",
-                    )
+            # persist first
+            self.container_mapping.set(
+                container_model.container_name,
+                container_model.model_dump(),
+            )
 
-                container_model = ContainerModel(**container_json)
+            # session mapping + first heartbeat only when session_ctx_id exists
+            if session_ctx_id:
+                env_ids = self.session_mapping.get(session_ctx_id) or []
+                if container_model.container_name not in env_ids:
+                    env_ids.append(container_model.container_name)
 
-                # Add meta field to container
-                if meta and not container_model.meta:
-                    container_model.meta = meta
-                    self.container_mapping.set(
-                        container_model.container_name,
-                        container_model.model_dump(),
-                    )
-                    # Update session mapping
-                    if "session_ctx_id" in meta:
-                        env_ids = (
-                            self.session_mapping.get(
-                                meta["session_ctx_id"],
-                            )
-                            or []
-                        )
-                        if container_model.container_name not in env_ids:
-                            env_ids.append(container_model.container_name)
-                            self.session_mapping.set(
-                                meta["session_ctx_id"],
-                                env_ids,
-                            )
+                self.session_mapping.set(session_ctx_id, env_ids)
 
-                logger.debug(
-                    f"Retrieved container from pool:"
-                    f" {container_model.session_id}",
+                self.clear_container_recycle_marker(
+                    container_model.container_name,
+                    set_state=ContainerState.RUNNING,
                 )
+                self.update_heartbeat(session_ctx_id)
+
+        try:
+            # 1) Try dequeue first
+            container_json = queue.dequeue()
+            if container_json:
+                container_model = ContainerModel(**container_json)
 
+                # version check
                 if (
                     container_model.version
-                    != SandboxRegistry.get_image_by_type(
-                        sandbox_type,
-                    )
+                    != SandboxRegistry.get_image_by_type(sandbox_type)
                 ):
                     logger.warning(
                         f"Container {container_model.session_id} outdated, "
-                        f"trying next one in pool",
+                        "dropping it",
                     )
-                    self.release(container_model.session_id)
-                    continue
+                    self.release(container_model.container_name)
+                    container_json = None
+                else:
+                    # inspect + status check
+                    if (
+                        self.client.inspect(
+                            container_model.container_id,
+                        )
+                        is None
+                    ):
+                        logger.warning(
+                            f"Container {container_model.container_id} not "
+                            f"found, dropping it",
+                        )
+                        self.release(container_model.container_name)
+                        container_json = None
+                    else:
+                        status = self.client.get_status(
+                            container_model.container_id,
+                        )
+                        if status != "running":
+                            logger.warning(
+                                f"Container {container_model.container_id} "
+                                f"not running ({status}), dropping it",
+                            )
+                            self.release(container_model.container_name)
+                            container_json = None
 
-                if self.client.inspect(container_model.container_id) is None:
-                    logger.warning(
-                        f"Container {container_model.container_id} not found "
-                        f"or unexpected error happens.",
+            # if still valid, bind meta and return
+            if container_json:
+                _bind_meta(container_model)
+                logger.debug(
+                    f"Retrieved container from pool:"
+                    f" {container_model.session_id}",
                 )
-                    continue
-
-                if (
-                    self.client.get_status(container_model.container_id)
-                    == "running"
-                ):
                 return container_model.container_name
-                else:
-                    logger.error(
-                        f"Container {container_model.container_id} is not "
-                        f"running. Trying next one in pool.",
-                    )
-                    # Destroy the stopped container
-                    self.release(container_model.session_id)
+
+            # 2) Pool empty or invalid -> create a new one and return
+            return self.create(sandbox_type=sandbox_type.value, meta=meta)
 
         except Exception as e:
             logger.warning(
                 "Error getting container from pool, create a new one.",
             )
             logger.debug(f"{e}: {traceback.format_exc()}")
-            return self.create()
+            return self.create(sandbox_type=sandbox_type.value, meta=meta)
 
     @remote_wrapper_async()
     async def create_from_pool_async(self, *args, **kwargs):
@@ -604,11 +696,44 @@ class SandboxManager:
     def create(
         self,
         sandbox_type=None,
-        mount_dir=None,  # TODO: remove to avoid leaking
+        mount_dir=None,
         storage_path=None,
         environment: Optional[Dict] = None,
         meta: Optional[Dict] = None,
-    ):
+    ):  # pylint: disable=too-many-return-statements
+        # Enforce max sandbox instances
+        try:
+            limit = self.config.max_sandbox_instances
+            if limit > 0:
+                # Count only ACTIVE containers; exclude terminal states
+                active_states = {
+                    ContainerState.WARM,
+                    ContainerState.RUNNING,
+                }
+                current = 0
+                for key in self.container_mapping.scan(self.prefix):
+                    try:
+                        container_json = self.container_mapping.get(key)
+                        if not container_json:
+                            continue
+                        cm = ContainerModel(**container_json)
+                        if cm.state in active_states:
+                            current += 1
+                    except Exception:
+                        # ignore broken records
+                        continue
+        except RuntimeError as e:
+            logger.warning(str(e))
+            return None
+        except Exception:
+            # Handle unexpected errors from container_mapping.scan() gracefully
+            logger.exception("Failed to check sandbox instance limit")
+            return None
+
+        session_ctx_id = None
+        if meta and meta.get("session_ctx_id"):
+            session_ctx_id = meta["session_ctx_id"]
+
         if sandbox_type is not None:
             target_sandbox_type = SandboxType(sandbox_type)
         else:
@@ -641,7 +766,13 @@ class SandboxManager:
         short_uuid = shortuuid.ShortUUID(alphabet=alphabet).uuid()
         session_id = str(short_uuid)
 
-        if not mount_dir:
+        if mount_dir and not self.config.allow_mount_dir:
+            logger.warning(
+                "mount_dir is not allowed by config, fallback to "
+                "default_mount_dir",
+            )
+
+        if (not mount_dir) or (not self.config.allow_mount_dir):
             if self.default_mount_dir:
                 mount_dir = os.path.join(self.default_mount_dir, session_id)
                 os.makedirs(mount_dir, exist_ok=True)
@@ -711,6 +842,7 @@ class SandboxManager:
             volumes=volume_bindings,
             environment={
                 "SECRET_TOKEN": runtime_token,
+                "NGINX_TIMEOUT": str(TIMEOUT) if TIMEOUT else "60",
                 **environment,
             },
             runtime_config=config.runtime_config,
@@ -745,6 +877,12 @@ class SandboxManager:
             version=image,
             meta=meta or {},
             timeout=config.timeout,
+            sandbox_type=target_sandbox_type.value,
+            session_ctx_id=session_ctx_id,
+            state=ContainerState.RUNNING
+            if session_ctx_id
+            else ContainerState.WARM,
+            updated_at=time.time(),
         )
 
         # Register in mapping
@@ -754,15 +892,28 @@ class SandboxManager:
         )
 
         # Build mapping session_ctx_id to container_name
-        if meta and "session_ctx_id" in meta:
-            env_ids = (
-                self.session_mapping.get(
-                    meta["session_ctx_id"],
-                )
-                or []
+        # NOTE:
+        # - Only containers bound to a user session_ctx_id participate
+        #   in heartbeat/reap.
+        # - Prewarmed pool containers typically have no session_ctx_id;
+        #   do NOT write heartbeat for them.
+        if meta and "session_ctx_id" in meta and meta["session_ctx_id"]:
+            session_ctx_id = meta["session_ctx_id"]
+
+            env_ids = self.session_mapping.get(session_ctx_id) or []
+            if container_model.container_name not in env_ids:
+                env_ids.append(container_model.container_name)
+                self.session_mapping.set(session_ctx_id, env_ids)
+
+            # First heartbeat on creation (treat "allocate to session"
+            # as first activity)
+            self.update_heartbeat(session_ctx_id)
+
+            # Session is now alive again; clear restore-required marker
+            self.clear_container_recycle_marker(
+                container_model.container_name,
+                set_state=ContainerState.RUNNING,
             )
-            env_ids.append(container_model.container_name)
-            self.session_mapping.set(meta["session_ctx_id"], env_ids)
 
         logger.debug(
             f"Created container {container_name}"
@@ -785,21 +936,25 @@ class SandboxManager:
     @remote_wrapper()
     def release(self, identity):
         try:
-            container_json = self.get_info(identity)
-
-            if not container_json:
-                logger.warning(
-                    f"No container found for {identity}.",
+            container_json = self.container_mapping.get(identity)
+            if container_json is None:
+                container_json = self.container_mapping.get(
+                    self._generate_container_key(identity),
                 )
-                return True
+            if container_json is None:
+                logger.warning(
+                    f"release: container not found for {identity}, "
+                    f"treat as already released",
+                )
+                return True
 
             container_info = ContainerModel(**container_json)
 
-            # remove key in mapping before we remove container
-            self.container_mapping.delete(container_json.get("container_name"))
+            # remove session key in mapping
+            session_ctx_id = container_info.session_ctx_id or (
+                container_info.meta or {}
+            ).get("session_ctx_id")
 
-            # remove key in mapping
-            session_ctx_id = container_info.meta.get("session_ctx_id")
             if session_ctx_id:
                 env_ids = self.session_mapping.get(session_ctx_id) or []
                 env_ids = [
@@ -810,10 +965,44 @@ class SandboxManager:
                 if env_ids:
                     self.session_mapping.set(session_ctx_id, env_ids)
                 else:
+                    # last container of this session is gone;
+                    # keep state consistent
                     self.session_mapping.delete(session_ctx_id)
 
-            self.client.stop(container_info.container_id, timeout=1)
-            self.client.remove(container_info.container_id, force=True)
+            # Mark released (do NOT delete mapping) in model
+            now = time.time()
+            container_info.state = ContainerState.RELEASED
+            container_info.released_at = now
+            container_info.updated_at = now
+            container_info.recycled_at = None
+            container_info.recycle_reason = None
+
+            # Unbind session in model
+            container_info.session_ctx_id = None
+            if container_info.meta is None:
+                container_info.meta = {}
+            container_info.meta.pop("session_ctx_id", None)
+
+            self.container_mapping.set(
+                container_info.container_name,
+                container_info.model_dump(),
+            )
+
+            try:
+                self.client.stop(container_info.container_id, timeout=1)
+            except Exception as e:
+                logger.debug(
+                    f"release stop ignored for"
+                    f" {container_info.container_id}: {e}",
+                )
+
+            try:
+                self.client.remove(container_info.container_id, force=True)
+            except Exception as e:
+                logger.debug(
+                    f"release remove ignored for"
+                    f" {container_info.container_id}: {e}",
+                )
 
             logger.debug(f"Container for {identity} destroyed.")
 
@@ -970,40 +1159,47 @@ class SandboxManager:
         return async_client
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def check_health(self, identity):
         """List tool"""
         client = self._establish_connection(identity)
         return client.check_health()
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def check_health_async(self, identity):
         client = await self._establish_connection_async(identity)
         return await client.check_health()
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def list_tools(self, identity, tool_type=None, **kwargs):
         """List tool"""
         client = self._establish_connection(identity)
         return client.list_tools(tool_type=tool_type, **kwargs)
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def list_tools_async(self, identity, tool_type=None, **kwargs):
         client = await self._establish_connection_async(identity)
         return await client.list_tools(tool_type=tool_type, **kwargs)
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def call_tool(self, identity, tool_name=None, arguments=None):
         """Call tool"""
         client = self._establish_connection(identity)
         return client.call_tool(tool_name, arguments)
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def call_tool_async(self, identity, tool_name=None, arguments=None):
         """Call tool (async)"""
         client = await self._establish_connection_async(identity)
         return await client.call_tool(tool_name, arguments)
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def add_mcp_servers(self, identity, server_configs, overwrite=False):
         """
         Add MCP servers to runtime.
@@ -1015,6 +1211,7 @@ class SandboxManager:
         )
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def add_mcp_servers_async(
         self,
         identity,
@@ -1056,3 +1253,393 @@ class SandboxManager:
     async def list_session_keys_async(self, *args, **kwargs):
         """Async wrapper for list_session_keys()."""
         return await asyncio.to_thread(self.list_session_keys, *args, **kwargs)
+
+    def reap_session(
+        self,
+        session_ctx_id: str,
+        reason: str = "heartbeat_timeout",
+    ) -> bool:
+        """
+        Reap (release) ALL containers bound to session_ctx_id.
+
+        Important:
+        - Prewarm pool containers are NOT part of session_mapping
+          (no session_ctx_id), so they won't be reaped by this flow.
+        """
+        try:
+            env_ids = self.get_session_mapping(session_ctx_id) or []
+
+            for container_name in list(env_ids):
+                now = time.time()
+                try:
+                    info = ContainerModel(**self.get_info(container_name))
+
+                    # stop/remove actual container
+                    try:
+                        self.client.stop(info.container_id, timeout=1)
+                    except Exception as e:
+                        logger.debug(
+                            f"Failed to stop container "
+                            f"{info.container_id}: {e}",
+                        )
+                    try:
+                        self.client.remove(info.container_id, force=True)
+                    except Exception as e:
+                        logger.debug(
+                            f"Failed to remove container "
+                            f"{info.container_id}: {e}",
+                        )
+
+                    # upload storage if needed
+                    if info.mount_dir and info.storage_path:
+                        try:
+                            self.storage.upload_folder(
+                                info.mount_dir,
+                                info.storage_path,
+                            )
+                        except Exception as e:
+                            logger.warning(
+                                f"upload_folder failed for {container_name}:"
+                                f" {e}",
+                            )
+
+                    # mark recycled, keep model
+                    info.state = ContainerState.RECYCLED
+                    info.recycled_at = now
+                    info.recycle_reason = reason
+                    info.updated_at = now
+
+                    # keep session_ctx_id for restore
+                    info.session_ctx_id = session_ctx_id
+                    if info.meta is None:
+                        info.meta = {}
+                    info.meta["session_ctx_id"] = session_ctx_id
+
+                    self.container_mapping.set(
+                        info.container_name,
+                        info.model_dump(),
+                    )
+
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to recycle container {container_name} for "
+                        f"session {session_ctx_id}: {e}",
+                    )
+
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to reap session {session_ctx_id}: {e}")
+            logger.debug(traceback.format_exc())
+            return False
+
+    def restore_session(self, session_ctx_id: str) -> None:
+        """
+        Restore ALL recycled sandboxes (containers) for a session.
+
+        For each container record with state==RECYCLED in session_mapping[
+        session_ctx_id]:
+        - If mount_dir is empty -> allocate from pool
+          (prefer same sandbox_type).
+        - If mount_dir exists -> create a new container with that
+          mount_dir/storage_path.
+        - Bind new container to this session and mark RUNNING.
+        - Archive the old recycled record (mark RELEASED).
+
+        After restore:
+        - session_mapping[session_ctx_id] will be replaced with the list of
+          NEW running containers.
+        """
+        env_ids = self.get_session_mapping(session_ctx_id) or []
+        if not env_ids:
+            return
+
+        new_container_names: list[str] = []
+        recycled_old_names: list[str] = []
+
+        # 1) restore each recycled container
+        for old_name in list(env_ids):
+            try:
+                old = ContainerModel(**self.get_info(old_name))
+            except Exception:
+                continue
+
+            if old.state != ContainerState.RECYCLED:
+                # keep non-recycled entries as-is (optional). In practice
+                # env_ids should be recycled only.
+                continue
+
+            sandbox_type = old.sandbox_type or self.default_type[0].value
+            meta = {
+                "session_ctx_id": session_ctx_id,
+            }
+
+            # allocate new container
+            if not old.mount_dir:
+                new_name = self.create_from_pool(
+                    sandbox_type=sandbox_type,
+                    meta=meta,
+                )
+            else:
+                new_name = self.create(
+                    sandbox_type=sandbox_type,
+                    meta=meta,
+                    mount_dir=old.mount_dir,
+                    storage_path=old.storage_path,
+                )
+
+            if not new_name:
+                logger.warning(
+                    f"restore_session: failed to restore container {old_name} "
+                    f"for session {session_ctx_id}",
+                )
+                continue
+
+            recycled_old_names.append(old_name)
+            new_container_names.append(new_name)
+
+            # ensure new container is marked RUNNING + bound
+            try:
+                new_cm = ContainerModel(**self.get_info(new_name))
+                now = time.time()
+                new_cm.state = ContainerState.RUNNING
+                new_cm.session_ctx_id = session_ctx_id
+                if new_cm.meta is None:
+                    new_cm.meta = {}
+                new_cm.meta["session_ctx_id"] = session_ctx_id
+                new_cm.meta["sandbox_type"] = sandbox_type
+                new_cm.recycled_at = None
+                new_cm.recycle_reason = None
+                new_cm.updated_at = now
+                self.container_mapping.set(
+                    new_cm.container_name,
+                    new_cm.model_dump(),
+                )
+            except Exception as e:
+                logger.warning(
+                    f"restore_session: failed to mark new container running:"
+                    f" {e}",
+                )
+
+        if not new_container_names:
+            # nothing restored
+            return
+
+        # 2) switch session mapping to restored running containers
+        self.session_mapping.set(session_ctx_id, new_container_names)
+
+        # 3) heartbeat after restore (session-level)
+        self.update_heartbeat(session_ctx_id)
+
+        # 4) archive old recycled records so needs_restore becomes False
+        for old_name in recycled_old_names:
+            try:
+                self.container_mapping.delete(old_name)
+            except Exception as e:
+                logger.warning(
+                    f"restore_session: failed to delete old model"
+                    f" {old_name}: {e}",
+                )
+
+    def scan_heartbeat_once(self) -> dict:
+        """
+        Scan all session_ctx_id in session_mapping and reap those idle
+        beyond timeout. Uses redis distributed lock to avoid multi-instance
+        double reap.
+        """
+        timeout = int(self.config.heartbeat_timeout)
+
+        result = {
+            "scanned_sessions": 0,
+            "reaped_sessions": 0,
+            "skipped_no_heartbeat": 0,
+            "skipped_no_running_containers": 0,
+            "skipped_lock_busy": 0,
+            "skipped_not_idle_after_double_check": 0,
+            "errors": 0,
+        }
+
+        for session_ctx_id in list(self.session_mapping.scan()):
+            result["scanned_sessions"] += 1
+
+            has_running = False
+            try:
+                env_ids = self.get_session_mapping(session_ctx_id) or []
+                for cname in list(env_ids):
+                    try:
+                        cm = ContainerModel(**self.get_info(cname))
+                    except Exception:
+                        continue
+                    if cm.state == ContainerState.RUNNING:
+                        has_running = True
+                        break
+            except Exception:
+                has_running = False
+
+            if not has_running:
+                result["skipped_no_running_containers"] += 1
+                continue
+
+            last_active = self.get_heartbeat(session_ctx_id)
+            if last_active is None:
+                result["skipped_no_heartbeat"] += 1
+                continue
+
+            # Use time.time() consistently to avoid subtle timing skew if
+            # the scan loop itself takes a while under load.
+            if time.time() - last_active <= timeout:
+                continue
+
+            token = self.acquire_heartbeat_lock(session_ctx_id)
+            if not token:
+                result["skipped_lock_busy"] += 1
+                continue
+
+            try:
+                # double-check after lock (avoid racing with a fresh heartbeat)
+                last_active2 = self.get_heartbeat(session_ctx_id)
+                if last_active2 is None:
+                    result["skipped_no_heartbeat"] += 1
+                    continue
+
+                if time.time() - last_active2 <= timeout:
+                    result["skipped_not_idle_after_double_check"] += 1
+                    continue
+
+                ok = self.reap_session(
+                    session_ctx_id,
+                    reason="heartbeat_timeout",
+                )
+                if ok:
+                    result["reaped_sessions"] += 1
+
+            except Exception:
+                result["errors"] += 1
+                logger.warning(
+                    f"scan_heartbeat_once error on session {session_ctx_id}",
+                )
+                logger.debug(traceback.format_exc())
+            finally:
+                self.release_heartbeat_lock(session_ctx_id, token)
+
+        return result
+
+    def scan_pool_once(self) -> dict:
+        """
+        Replenish warm pool for each sandbox_type up to pool_size.
+
+        Note:
+        - No distributed lock by design (multi-instance may overfill slightly).
+        - Pool containers are WARM (no session_ctx_id).
+        """
+        result = {
+            "types": 0,
+            "created": 0,
+            "enqueued": 0,
+            "failed_create": 0,
+            "skipped_pool_disabled": 0,
+        }
+
+        if self.pool_size <= 0:
+            result["skipped_pool_disabled"] = 1
+            return result
+
+        for t in self.default_type:
+            result["types"] += 1
+            queue = self.pool_queues.get(t)
+            if queue is None:
+                continue
+
+            try:
+                need = int(self.pool_size - queue.size())
+            except Exception:
+                # if queue.size() fails for any reason, skip this type
+                continue
+
+            if need <= 0:
+                continue
+
+            for _ in range(need):
+                try:
+                    # create a WARM container (no session_ctx_id)
+                    container_name = self.create(
+                        sandbox_type=t.value,
+                        meta=None,
+                    )
+                    if not container_name:
+                        result["failed_create"] += 1
+                        continue
+
+                    cm_json = self.container_mapping.get(container_name)
+                    if not cm_json:
+                        result["failed_create"] += 1
+                        continue
+
+                    queue.enqueue(cm_json)
+                    result["created"] += 1
+                    result["enqueued"] += 1
+                except Exception:
+                    result["failed_create"] += 1
+                    logger.debug(traceback.format_exc())
+
+        return result
+
+    def scan_released_cleanup_once(self, max_delete: int = 200) -> dict:
+        """
+        Delete container_mapping records whose state == RELEASED and expired.
+
+        TTL is config.released_key_ttl seconds. 0 disables cleanup.
+        """
+        ttl = int(getattr(self.config, "released_key_ttl", 0))
+        result = {
+            "ttl": ttl,
+            "scanned": 0,
+            "deleted": 0,
+            "skipped_ttl_disabled": 0,
+            "skipped_not_expired": 0,
+            "skipped_not_released": 0,
+            "errors": 0,
+        }
+
+        if ttl <= 0:
+            result["skipped_ttl_disabled"] = 1
+            return result
+
+        now = time.time()
+
+        for key in self.container_mapping.scan(self.prefix):
+            if result["deleted"] >= max_delete:
+                break
+
+            result["scanned"] += 1
+            try:
+                container_json = self.container_mapping.get(key)
+                if not container_json:
+                    continue
+
+                cm = ContainerModel(**container_json)
+
+                if cm.state != ContainerState.RELEASED:
+                    result["skipped_not_released"] += 1
+                    continue
+
+                released_at = cm.released_at or cm.updated_at or 0
+                if released_at <= 0:
+                    # no timestamp -> treat as not expired
+                    result["skipped_not_expired"] += 1
+                    continue
+
+                if now - released_at <= ttl:
+                    result["skipped_not_expired"] += 1
+                    continue
+
+                self.container_mapping.delete(cm.container_name)
+                result["deleted"] += 1
+
+            except Exception as e:
+                result["errors"] += 1
+                logger.debug(
+                    f"scan_released_cleanup_once: {e},"
+                    f" {traceback.format_exc()}",
+                )
+
+        return result
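
The watcher added in this release is a stop-aware polling thread: start_watcher() spawns a daemon thread that runs the three scans (scan_heartbeat_once, scan_pool_once, scan_released_cleanup_once) and then sleeps on threading.Event.wait(interval), so stop_watcher() can interrupt the sleep immediately instead of waiting out a full interval. A minimal, self-contained sketch of that pattern, distilled from the diff rather than copied from the package:

    import threading
    import time

    stop_event = threading.Event()

    def _loop(interval: float) -> None:
        # Mirrors the shape of start_watcher's _loop: run one scan pass,
        # then wait with stop support so shutdown never blocks a full interval.
        while not stop_event.is_set():
            print(f"scan pass at {time.time():.1f}")  # stand-in for the three scans
            stop_event.wait(interval)  # returns early once stop_event is set
        print("watcher stopped")

    watcher = threading.Thread(target=_loop, args=(0.5,), name="watcher", daemon=True)
    watcher.start()

    time.sleep(1.2)   # let a couple of scan passes run
    stop_event.set()  # what stop_watcher() does before joining
    watcher.join(timeout=5.0)

daemon=True matters here for the same reason as in the diff: if a caller never exits the SandboxManager context, the interpreter can still shut down without the watcher thread keeping the process alive. The same changes thread a small container state machine through create/release/reap/restore; the transitions below are inferred from the code in this diff (the ContainerState model itself lives in agentscope_runtime/sandbox/model/container.py, entry 42 in the file list), not an authoritative specification:

    # WARM      prewarmed pool container, no session_ctx_id, no heartbeat
    # RUNNING   bound to a session_ctx_id; heartbeat refreshed by @touch_session
    # RECYCLED  reaped on heartbeat timeout; record kept so restore_session can rebuild
    # RELEASED  container destroyed; record kept until released-record GC
    #
    # create()/scan_pool_once()            -> WARM (no session) or RUNNING (with session)
    # create_from_pool() via _bind_meta()  -> WARM -> RUNNING
    # scan_heartbeat_once()/reap_session() -> RUNNING -> RECYCLED
    # restore_session()                    -> RECYCLED -> RUNNING (on a new container)
    # release()/cleanup()                  -> non-terminal -> RELEASED
    # scan_released_cleanup_once()         -> RELEASED record deleted after released_key_ttl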