agentscope-runtime 1.0.5.post1__py3-none-any.whl → 1.1.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentscope_runtime/__init__.py +3 -0
- agentscope_runtime/adapters/agentscope/message.py +85 -295
- agentscope_runtime/adapters/agentscope/stream.py +133 -3
- agentscope_runtime/adapters/agno/message.py +11 -2
- agentscope_runtime/adapters/agno/stream.py +1 -0
- agentscope_runtime/adapters/langgraph/__init__.py +1 -3
- agentscope_runtime/adapters/langgraph/message.py +11 -106
- agentscope_runtime/adapters/langgraph/stream.py +1 -0
- agentscope_runtime/adapters/ms_agent_framework/message.py +11 -1
- agentscope_runtime/adapters/ms_agent_framework/stream.py +1 -0
- agentscope_runtime/adapters/text/stream.py +1 -0
- agentscope_runtime/common/container_clients/agentrun_client.py +0 -3
- agentscope_runtime/common/container_clients/boxlite_client.py +26 -15
- agentscope_runtime/common/container_clients/fc_client.py +0 -11
- agentscope_runtime/common/utils/deprecation.py +14 -17
- agentscope_runtime/common/utils/logging.py +44 -0
- agentscope_runtime/engine/app/agent_app.py +5 -5
- agentscope_runtime/engine/app/celery_mixin.py +43 -4
- agentscope_runtime/engine/deployers/adapter/agui/__init__.py +8 -1
- agentscope_runtime/engine/deployers/adapter/agui/agui_adapter_utils.py +6 -1
- agentscope_runtime/engine/deployers/adapter/agui/agui_protocol_adapter.py +2 -2
- agentscope_runtime/engine/deployers/utils/service_utils/fastapi_factory.py +13 -0
- agentscope_runtime/engine/runner.py +31 -6
- agentscope_runtime/engine/schemas/agent_schemas.py +28 -0
- agentscope_runtime/engine/services/sandbox/sandbox_service.py +41 -9
- agentscope_runtime/sandbox/box/base/base_sandbox.py +4 -0
- agentscope_runtime/sandbox/box/browser/browser_sandbox.py +4 -0
- agentscope_runtime/sandbox/box/dummy/dummy_sandbox.py +9 -2
- agentscope_runtime/sandbox/box/filesystem/filesystem_sandbox.py +4 -0
- agentscope_runtime/sandbox/box/gui/gui_sandbox.py +5 -1
- agentscope_runtime/sandbox/box/mobile/mobile_sandbox.py +4 -0
- agentscope_runtime/sandbox/box/sandbox.py +122 -13
- agentscope_runtime/sandbox/client/async_http_client.py +1 -0
- agentscope_runtime/sandbox/client/base.py +0 -1
- agentscope_runtime/sandbox/client/http_client.py +0 -2
- agentscope_runtime/sandbox/manager/heartbeat_mixin.py +486 -0
- agentscope_runtime/sandbox/manager/sandbox_manager.py +740 -153
- agentscope_runtime/sandbox/manager/server/app.py +18 -11
- agentscope_runtime/sandbox/manager/server/config.py +10 -2
- agentscope_runtime/sandbox/mcp_server.py +0 -1
- agentscope_runtime/sandbox/model/__init__.py +2 -1
- agentscope_runtime/sandbox/model/container.py +90 -3
- agentscope_runtime/sandbox/model/manager_config.py +45 -1
- agentscope_runtime/version.py +1 -1
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/METADATA +37 -54
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/RECORD +50 -69
- agentscope_runtime/adapters/agentscope/long_term_memory/__init__.py +0 -6
- agentscope_runtime/adapters/agentscope/long_term_memory/_long_term_memory_adapter.py +0 -258
- agentscope_runtime/adapters/agentscope/memory/__init__.py +0 -6
- agentscope_runtime/adapters/agentscope/memory/_memory_adapter.py +0 -152
- agentscope_runtime/engine/services/agent_state/__init__.py +0 -25
- agentscope_runtime/engine/services/agent_state/redis_state_service.py +0 -166
- agentscope_runtime/engine/services/agent_state/state_service.py +0 -179
- agentscope_runtime/engine/services/agent_state/state_service_factory.py +0 -52
- agentscope_runtime/engine/services/memory/__init__.py +0 -33
- agentscope_runtime/engine/services/memory/mem0_memory_service.py +0 -128
- agentscope_runtime/engine/services/memory/memory_service.py +0 -292
- agentscope_runtime/engine/services/memory/memory_service_factory.py +0 -126
- agentscope_runtime/engine/services/memory/redis_memory_service.py +0 -290
- agentscope_runtime/engine/services/memory/reme_personal_memory_service.py +0 -109
- agentscope_runtime/engine/services/memory/reme_task_memory_service.py +0 -11
- agentscope_runtime/engine/services/memory/tablestore_memory_service.py +0 -301
- agentscope_runtime/engine/services/session_history/__init__.py +0 -32
- agentscope_runtime/engine/services/session_history/redis_session_history_service.py +0 -283
- agentscope_runtime/engine/services/session_history/session_history_service.py +0 -267
- agentscope_runtime/engine/services/session_history/session_history_service_factory.py +0 -73
- agentscope_runtime/engine/services/session_history/tablestore_session_history_service.py +0 -288
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/WHEEL +0 -0
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/entry_points.txt +0 -0
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/licenses/LICENSE +0 -0
- {agentscope_runtime-1.0.5.post1.dist-info → agentscope_runtime-1.1.0b3.dist-info}/top_level.txt +0 -0
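
The bulk of this release is the new heartbeat/watcher machinery in the sandbox manager (`sandbox/manager/heartbeat_mixin.py` is new, and `sandbox/manager/sandbox_manager.py` is heavily reworked; its diff follows below). As a rough orientation, the sketch below shows how that lifecycle appears to fit together in local mode: entering the manager as a context manager starts a background watcher thread that periodically runs `scan_heartbeat_once()`, `scan_pool_once()`, and `scan_released_cleanup_once()`, and exiting stops it and releases containers. Method names and config field names (`watcher_scan_interval`, `heartbeat_timeout`) are taken from the diff; the concrete values, the sandbox type string, and the tool name are illustrative assumptions, not part of the package's documented API.

```python
# Hypothetical usage sketch of the 1.1.0b3 sandbox manager lifecycle.
# Field names come from the diff below; the values, the sandbox type string,
# and the tool name are assumptions for illustration only.
from agentscope_runtime.sandbox.model import SandboxManagerEnvConfig
from agentscope_runtime.sandbox.manager.sandbox_manager import SandboxManager

config = SandboxManagerEnvConfig(
    watcher_scan_interval=30,  # seconds between watcher passes; <= 0 disables it
    heartbeat_timeout=600,     # sessions idle longer than this get reaped
)

with SandboxManager(config=config) as manager:
    # Local mode: __enter__ sees http_session is None and calls start_watcher(),
    # so a daemon thread is already scanning heartbeats and topping up the pool.
    name = manager.create_from_pool(
        sandbox_type="base",                     # assumed SandboxType value
        meta={"session_ctx_id": "session-123"},  # binds the container to a session
    )

    # Methods decorated with @touch_session refresh the session heartbeat
    # as a side effect of normal use.
    manager.call_tool(name, tool_name="example_tool", arguments={})
# __exit__ calls stop_watcher() and cleanup(), releasing non-terminal containers.
```

In remote mode (when the manager proxies requests through `http_session`), the watcher is not started locally; the diff only invokes `start_watcher()` when `http_session is None`.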
```diff
--- agentscope_runtime/sandbox/manager/sandbox_manager.py (1.0.5.post1)
+++ agentscope_runtime/sandbox/manager/sandbox_manager.py (1.1.0b3)
@@ -2,10 +2,12 @@
 # pylint: disable=redefined-outer-name, protected-access
 # pylint: disable=too-many-branches, too-many-statements
 # pylint: disable=redefined-outer-name, protected-access, too-many-branches
-# pylint: disable=too-many-public-methods
+# pylint: disable=too-many-public-methods, unused-argument
 import asyncio
 import inspect
 import json
+import time
+import threading
 import logging
 import os
 import secrets
@@ -17,6 +19,8 @@ import requests
 import shortuuid
 import httpx
 
+from .heartbeat_mixin import HeartbeatMixin, touch_session
+from ..constant import TIMEOUT
 from ..client import (
     SandboxHttpClient,
     TrainingSandboxClient,
@@ -29,6 +33,7 @@ from ..manager.storage import (
 )
 from ..model import (
     ContainerModel,
+    ContainerState,
     SandboxManagerEnvConfig,
 )
 from ..registry import SandboxRegistry
@@ -39,9 +44,7 @@ from ...common.collections import (
     InMemoryQueue,
 )
 from ...common.container_clients import ContainerClientFactory
-from ..constant import TIMEOUT
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -130,7 +133,7 @@ def remote_wrapper_async(
     return decorator
 
 
-class SandboxManager:
+class SandboxManager(HeartbeatMixin):
     def __init__(
         self,
         config: Optional[SandboxManagerEnvConfig] = None,
@@ -192,9 +195,7 @@ class SandboxManager:
         self.prefix = self.config.container_prefix_key
         self.default_mount_dir = self.config.default_mount_dir
         self.readonly_mounts = self.config.readonly_mounts
-        self.storage_folder = (
-            self.config.storage_folder or self.default_mount_dir
-        )
+        self.storage_folder = self.config.storage_folder
 
         self.pool_queues = {}
         if self.config.redis_enabled:
@@ -208,24 +209,26 @@ class SandboxManager:
                 password=self.config.redis_password,
                 decode_responses=True,
             )
+            self.redis_client = redis_client
             try:
-                redis_client.ping()
+                self.redis_client.ping()
             except ConnectionError as e:
                 raise RuntimeError(
                     "Unable to connect to the Redis server.",
                 ) from e
 
-            self.container_mapping = RedisMapping(redis_client)
+            self.container_mapping = RedisMapping(self.redis_client)
             self.session_mapping = RedisMapping(
-                redis_client,
+                self.redis_client,
                 prefix="session_mapping",
             )
 
             # Init multi sand box pool
             for t in self.default_type:
                 queue_key = f"{self.config.redis_container_pool_key}:{t.value}"
-                self.pool_queues[t] = RedisQueue(redis_client, queue_key)
+                self.pool_queues[t] = RedisQueue(self.redis_client, queue_key)
         else:
+            self.redis_client = None
             self.container_mapping = InMemoryMapping()
             self.session_mapping = InMemoryMapping()
 
@@ -254,8 +257,9 @@ class SandboxManager:
         else:
             self.storage = LocalStorage()
 
-
-
+        self._watcher_stop_event = threading.Event()
+        self._watcher_thread = None
+        self._watcher_thread_lock = threading.Lock()
 
         logger.debug(str(config))
 
@@ -264,12 +268,18 @@ class SandboxManager:
             "Entering SandboxManager context (sync). "
             "Cleanup will be performed automatically on exit.",
         )
+        # local mode: watcher starts
+        if self.http_session is None:
+            self.start_watcher()
+
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
         logger.debug(
             "Exiting SandboxManager context (sync). Cleaning up resources.",
         )
+        self.stop_watcher()
+
         self.cleanup()
 
         if self.http_session:
@@ -295,12 +305,18 @@ class SandboxManager:
             "Entering SandboxManager context (async). "
             "Cleanup will be performed automatically on async exit.",
         )
+        # local mode: watcher starts
+        if self.http_session is None:
+            self.start_watcher()
+
         return self
 
     async def __aexit__(self, exc_type, exc_value, tb):
         logger.debug(
             "Exiting SandboxManager context (async). Cleaning up resources.",
         )
+        self.stop_watcher()
+
         await self.cleanup_async()
 
         if self.http_session:
@@ -318,6 +334,7 @@ class SandboxManager:
                 logger.warning(f"Error closing httpx_client: {e}")
 
     def _generate_container_key(self, session_id):
+        # TODO: refactor this and mapping, use sandbox_id as identity
         return f"{self.prefix}{session_id}"
 
     def _make_request(self, method: str, endpoint: str, data: dict):
@@ -420,68 +437,140 @@ class SandboxManager:
 
         return response.json()
 
-    def
+    def start_watcher(self) -> bool:
         """
-
+        Start background heartbeat scanning thread.
+        Default: not started automatically. Caller must invoke explicitly.
+        If watcher_scan_interval == 0 => disabled, returns False.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        interval = int(self.config.watcher_scan_interval)
+        if interval <= 0:
+            logger.info(
+                "Watcher disabled (watcher_scan_interval <= 0)",
+            )
+            return False
+
+        with self._watcher_thread_lock:
+            if self._watcher_thread and self._watcher_thread.is_alive():
+                return True  # already running
+
+            self._watcher_stop_event.clear()
+
+            def _loop():
+                logger.info(f"Watcher started, interval={interval}s")
+                while not self._watcher_stop_event.is_set():
+                    try:
+                        hb = self.scan_heartbeat_once()
+                        pool = self.scan_pool_once()
+                        gc = self.scan_released_cleanup_once()
+
+                        logger.debug(
+                            "watcher metrics: "
+                            f"heartbeat={hb}, pool={pool}, released_gc={gc}",
+                        )
+                    except Exception as e:
+                        logger.warning(f"Watcher loop error: {e}")
+                        logger.debug(traceback.format_exc())
+
+                    # wait with stop support
+                    self._watcher_stop_event.wait(interval)
+
+                logger.info("Watcher stopped")
+
+            t = threading.Thread(
+                target=_loop,
+                name="watcher",
+                daemon=True,
+            )
+            self._watcher_thread = t
+            t.start()
+            return True
+
+    def stop_watcher(self, join_timeout: float = 5.0) -> None:
+        """
+        Stop background watcher thread (if running).
+        """
+        with self._watcher_thread_lock:
+            self._watcher_stop_event.set()
+            t = self._watcher_thread
+
+        if t and t.is_alive():
+            t.join(timeout=join_timeout)
+
+        with self._watcher_thread_lock:
+            if self._watcher_thread is t:
+                self._watcher_thread = None
 
     @remote_wrapper()
     def cleanup(self):
-
-
-
+        """
+        Destroy all non-terminal containers managed by this SandboxManager.
+
+        Behavior (local mode):
+        - Dequeues and destroys containers from the warm pool (WARM/RUNNING).
+        - Scans container_mapping and destroys any remaining non-terminal
+          containers.
+        - Does NOT delete ContainerModel records from container_mapping;
+          instead it relies on release() to mark them as terminal (RELEASED).
+        - Skips containers already in terminal states: RELEASED / RECYCLED.
+
+        Notes:
+        - Uses container_name as identity to avoid ambiguity with session_id.
+        - Pool containers (WARM) are also destroyed (per current policy).
+        """
+        logger.debug("Cleaning up resources.")
 
-        # Clean up pool first
+        # Clean up pool first (destroy warm/running containers; skip
+        # terminal states)
         for queue in self.pool_queues.values():
             try:
                 while queue.size() > 0:
                     container_json = queue.dequeue()
-                    if container_json:
-
-
-
-
-
-
+                    if not container_json:
+                        continue
+
+                    container_model = ContainerModel(**container_json)
+
+                    # Terminal states: already cleaned logically
+                    if container_model.state in (
+                        ContainerState.RELEASED,
+                        ContainerState.RECYCLED,
+                    ):
+                        continue
+
+                    logger.debug(
+                        f"Destroy pool container"
+                        f" {container_model.container_id} "
+                        f"({container_model.container_name})",
+                    )
+                    # Use container_name to avoid ambiguity
+                    self.release(container_model.container_name)
             except Exception as e:
                 logger.error(f"Error cleaning up runtime pool: {e}")
 
-        # Clean up
+        # Clean up remaining containers in mapping
         for key in self.container_mapping.scan(self.prefix):
             try:
                 container_json = self.container_mapping.get(key)
-                if container_json:
-
-
-
-
-
-
-
-
+                if not container_json:
+                    continue
+
+                container_model = ContainerModel(**container_json)
+
+                # Terminal states: already cleaned logically
+                if container_model.state in (
+                    ContainerState.RELEASED,
+                    ContainerState.RECYCLED,
+                ):
+                    continue
+
+                logger.debug(
+                    f"Destroy container {container_model.container_id} "
+                    f"({container_model.container_name})",
                 )
+                self.release(container_model.container_name)
+            except Exception as e:
+                logger.error(f"Error cleaning up container {key}: {e}")
 
     @remote_wrapper_async()
     async def cleanup_async(self, *args, **kwargs):
@@ -499,101 +588,104 @@ class SandboxManager:
 
         queue = self.pool_queues[sandbox_type]
 
-
-
-
-                if cnt > self.pool_size:
-                    raise RuntimeError(
-                        "No container available in pool after check the pool.",
-                    )
-                cnt += 1
-
-                # Add a new one to container
-                container_name = self.create(sandbox_type=sandbox_type)
-                new_container_model = self.container_mapping.get(
-                    container_name,
-                )
+        def _bind_meta(container_model: ContainerModel):
+            if not meta:
+                return
 
-
-                queue.enqueue(
-                    new_container_model,
-                )
+            session_ctx_id = meta.get("session_ctx_id")
 
-
+            container_model.meta = meta
+            container_model.session_ctx_id = session_ctx_id
+            container_model.state = (
+                ContainerState.RUNNING
+                if session_ctx_id
+                else ContainerState.WARM
+            )
+            container_model.recycled_at = None
+            container_model.recycle_reason = None
+            container_model.updated_at = time.time()
 
-
-
-
-
+            # persist first
+            self.container_mapping.set(
+                container_model.container_name,
+                container_model.model_dump(),
+            )
 
-
+            # session mapping + first heartbeat only when session_ctx_id exists
+            if session_ctx_id:
+                env_ids = self.session_mapping.get(session_ctx_id) or []
+                if container_model.container_name not in env_ids:
+                    env_ids.append(container_model.container_name)
 
-
-                if meta and not container_model.meta:
-                    container_model.meta = meta
-                    self.container_mapping.set(
-                        container_model.container_name,
-                        container_model.model_dump(),
-                    )
-                    # Update session mapping
-                    if "session_ctx_id" in meta:
-                        env_ids = (
-                            self.session_mapping.get(
-                                meta["session_ctx_id"],
-                            )
-                            or []
-                        )
-                        if container_model.container_name not in env_ids:
-                            env_ids.append(container_model.container_name)
-                            self.session_mapping.set(
-                                meta["session_ctx_id"],
-                                env_ids,
-                            )
+                    self.session_mapping.set(session_ctx_id, env_ids)
 
-
-
-
+                self.clear_container_recycle_marker(
+                    container_model.container_name,
+                    set_state=ContainerState.RUNNING,
                 )
+                self.update_heartbeat(session_ctx_id)
+
+        try:
+            # 1) Try dequeue first
+            container_json = queue.dequeue()
+            if container_json:
+                container_model = ContainerModel(**container_json)
 
+                # version check
                 if (
                     container_model.version
-                    != SandboxRegistry.get_image_by_type(
-                        sandbox_type,
-                    )
+                    != SandboxRegistry.get_image_by_type(sandbox_type)
                 ):
                     logger.warning(
                         f"Container {container_model.session_id} outdated, "
-
+                        "dropping it",
                     )
-                    self.release(container_model.
-
+                    self.release(container_model.container_name)
+                    container_json = None
+                else:
+                    # inspect + status check
+                    if (
+                        self.client.inspect(
+                            container_model.container_id,
+                        )
+                        is None
+                    ):
+                        logger.warning(
+                            f"Container {container_model.container_id} not "
+                            f"found, dropping it",
+                        )
+                        self.release(container_model.container_name)
+                        container_json = None
+                    else:
+                        status = self.client.get_status(
+                            container_model.container_id,
+                        )
+                        if status != "running":
+                            logger.warning(
+                                f"Container {container_model.container_id} "
+                                f"not running ({status}), dropping it",
+                            )
+                            self.release(container_model.container_name)
+                            container_json = None
 
-                if
-
-
-
+            # if still valid, bind meta and return
+            if container_json:
+                _bind_meta(container_model)
+                logger.debug(
+                    f"Retrieved container from pool:"
+                    f" {container_model.session_id}",
                 )
-                    continue
-
-                if (
-                    self.client.get_status(container_model.container_id)
-                    == "running"
-                ):
                 return container_model.container_name
-
-
-
-                        f"running. Trying next one in pool.",
-                    )
-                    # Destroy the stopped container
-                    self.release(container_model.session_id)
+
+            # 2) Pool empty or invalid -> create a new one and return
+            return self.create(sandbox_type=sandbox_type.value, meta=meta)
 
         except Exception as e:
             logger.warning(
                 "Error getting container from pool, create a new one.",
             )
             logger.debug(f"{e}: {traceback.format_exc()}")
-            return self.create()
+            return self.create(sandbox_type=sandbox_type.value, meta=meta)
 
     @remote_wrapper_async()
     async def create_from_pool_async(self, *args, **kwargs):
@@ -604,11 +696,44 @@ class SandboxManager:
     def create(
         self,
         sandbox_type=None,
-        mount_dir=None,
+        mount_dir=None,
         storage_path=None,
         environment: Optional[Dict] = None,
        meta: Optional[Dict] = None,
-    ):
+    ):  # pylint: disable=too-many-return-statements
+        # Enforce max sandbox instances
+        try:
+            limit = self.config.max_sandbox_instances
+            if limit > 0:
+                # Count only ACTIVE containers; exclude terminal states
+                active_states = {
+                    ContainerState.WARM,
+                    ContainerState.RUNNING,
+                }
+                current = 0
+                for key in self.container_mapping.scan(self.prefix):
+                    try:
+                        container_json = self.container_mapping.get(key)
+                        if not container_json:
+                            continue
+                        cm = ContainerModel(**container_json)
+                        if cm.state in active_states:
+                            current += 1
+                    except Exception:
+                        # ignore broken records
+                        continue
+        except RuntimeError as e:
+            logger.warning(str(e))
+            return None
+        except Exception:
+            # Handle unexpected errors from container_mapping.scan() gracefully
+            logger.exception("Failed to check sandbox instance limit")
+            return None
+
+        session_ctx_id = None
+        if meta and meta.get("session_ctx_id"):
+            session_ctx_id = meta["session_ctx_id"]
+
         if sandbox_type is not None:
             target_sandbox_type = SandboxType(sandbox_type)
         else:
@@ -641,7 +766,13 @@ class SandboxManager:
         short_uuid = shortuuid.ShortUUID(alphabet=alphabet).uuid()
         session_id = str(short_uuid)
 
-        if not mount_dir:
+        if mount_dir and not self.config.allow_mount_dir:
+            logger.warning(
+                "mount_dir is not allowed by config, fallback to "
+                "default_mount_dir",
+            )
+
+        if (not mount_dir) or (not self.config.allow_mount_dir):
             if self.default_mount_dir:
                 mount_dir = os.path.join(self.default_mount_dir, session_id)
                 os.makedirs(mount_dir, exist_ok=True)
@@ -711,6 +842,7 @@ class SandboxManager:
             volumes=volume_bindings,
             environment={
                 "SECRET_TOKEN": runtime_token,
+                "NGINX_TIMEOUT": str(TIMEOUT) if TIMEOUT else "60",
                 **environment,
             },
             runtime_config=config.runtime_config,
@@ -745,6 +877,12 @@ class SandboxManager:
             version=image,
             meta=meta or {},
             timeout=config.timeout,
+            sandbox_type=target_sandbox_type.value,
+            session_ctx_id=session_ctx_id,
+            state=ContainerState.RUNNING
+            if session_ctx_id
+            else ContainerState.WARM,
+            updated_at=time.time(),
         )
 
         # Register in mapping
@@ -754,15 +892,28 @@ class SandboxManager:
         )
 
         # Build mapping session_ctx_id to container_name
-
-
-
-
-
-
+        # NOTE:
+        # - Only containers bound to a user session_ctx_id participate
+        #   in heartbeat/reap.
+        # - Prewarmed pool containers typically have no session_ctx_id;
+        #   do NOT write heartbeat for them.
+        if meta and "session_ctx_id" in meta and meta["session_ctx_id"]:
+            session_ctx_id = meta["session_ctx_id"]
+
+            env_ids = self.session_mapping.get(session_ctx_id) or []
+            if container_model.container_name not in env_ids:
+                env_ids.append(container_model.container_name)
+                self.session_mapping.set(session_ctx_id, env_ids)
+
+            # First heartbeat on creation (treat "allocate to session"
+            # as first activity)
+            self.update_heartbeat(session_ctx_id)
+
+            # Session is now alive again; clear restore-required marker
+            self.clear_container_recycle_marker(
+                container_model.container_name,
+                set_state=ContainerState.RUNNING,
             )
-                env_ids.append(container_model.container_name)
-                self.session_mapping.set(meta["session_ctx_id"], env_ids)
 
         logger.debug(
             f"Created container {container_name}"
@@ -785,21 +936,25 @@ class SandboxManager:
     @remote_wrapper()
     def release(self, identity):
         try:
-            container_json = self.
-
-
-
-                    f"No container found for {identity}.",
+            container_json = self.container_mapping.get(identity)
+            if container_json is None:
+                container_json = self.container_mapping.get(
+                    self._generate_container_key(identity),
                 )
-
+            if container_json is None:
+                logger.warning(
+                    f"release: container not found for {identity}, "
+                    f"treat as already released",
+                )
+                return True
 
             container_info = ContainerModel(**container_json)
 
-            # remove key in mapping
-
+            # remove session key in mapping
+            session_ctx_id = container_info.session_ctx_id or (
+                container_info.meta or {}
+            ).get("session_ctx_id")
 
-            # remove key in mapping
-            session_ctx_id = container_info.meta.get("session_ctx_id")
             if session_ctx_id:
                 env_ids = self.session_mapping.get(session_ctx_id) or []
                 env_ids = [
@@ -810,10 +965,44 @@ class SandboxManager:
                 if env_ids:
                     self.session_mapping.set(session_ctx_id, env_ids)
                 else:
+                    # last container of this session is gone;
+                    # keep state consistent
                     self.session_mapping.delete(session_ctx_id)
 
-
-
+            # Mark released (do NOT delete mapping) in model
+            now = time.time()
+            container_info.state = ContainerState.RELEASED
+            container_info.released_at = now
+            container_info.updated_at = now
+            container_info.recycled_at = None
+            container_info.recycle_reason = None
+
+            # Unbind session in model
+            container_info.session_ctx_id = None
+            if container_info.meta is None:
+                container_info.meta = {}
+            container_info.meta.pop("session_ctx_id", None)
+
+            self.container_mapping.set(
+                container_info.container_name,
+                container_info.model_dump(),
+            )
+
+            try:
+                self.client.stop(container_info.container_id, timeout=1)
+            except Exception as e:
+                logger.debug(
+                    f"release stop ignored for"
+                    f" {container_info.container_id}: {e}",
+                )
+
+            try:
+                self.client.remove(container_info.container_id, force=True)
+            except Exception as e:
+                logger.debug(
+                    f"release remove ignored for"
+                    f" {container_info.container_id}: {e}",
+                )
 
             logger.debug(f"Container for {identity} destroyed.")
 
@@ -970,40 +1159,47 @@ class SandboxManager:
         return async_client
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def check_health(self, identity):
         """List tool"""
         client = self._establish_connection(identity)
         return client.check_health()
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def check_health_async(self, identity):
         client = await self._establish_connection_async(identity)
         return await client.check_health()
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def list_tools(self, identity, tool_type=None, **kwargs):
         """List tool"""
         client = self._establish_connection(identity)
         return client.list_tools(tool_type=tool_type, **kwargs)
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def list_tools_async(self, identity, tool_type=None, **kwargs):
         client = await self._establish_connection_async(identity)
         return await client.list_tools(tool_type=tool_type, **kwargs)
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def call_tool(self, identity, tool_name=None, arguments=None):
         """Call tool"""
         client = self._establish_connection(identity)
         return client.call_tool(tool_name, arguments)
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def call_tool_async(self, identity, tool_name=None, arguments=None):
         """Call tool (async)"""
         client = await self._establish_connection_async(identity)
         return await client.call_tool(tool_name, arguments)
 
     @remote_wrapper()
+    @touch_session(identity_arg="identity")
     def add_mcp_servers(self, identity, server_configs, overwrite=False):
         """
         Add MCP servers to runtime.
@@ -1015,6 +1211,7 @@ class SandboxManager:
         )
 
     @remote_wrapper_async()
+    @touch_session(identity_arg="identity")
     async def add_mcp_servers_async(
         self,
         identity,
@@ -1056,3 +1253,393 @@ class SandboxManager:
     async def list_session_keys_async(self, *args, **kwargs):
         """Async wrapper for list_session_keys()."""
         return await asyncio.to_thread(self.list_session_keys, *args, **kwargs)
+
+    def reap_session(
+        self,
+        session_ctx_id: str,
+        reason: str = "heartbeat_timeout",
+    ) -> bool:
+        """
+        Reap (release) ALL containers bound to session_ctx_id.
+
+        Important:
+        - Prewarm pool containers are NOT part of session_mapping
+          (no session_ctx_id), so they won't be reaped by this flow.
+        """
+        try:
+            env_ids = self.get_session_mapping(session_ctx_id) or []
+
+            for container_name in list(env_ids):
+                now = time.time()
+                try:
+                    info = ContainerModel(**self.get_info(container_name))
+
+                    # stop/remove actual container
+                    try:
+                        self.client.stop(info.container_id, timeout=1)
+                    except Exception as e:
+                        logger.debug(
+                            f"Failed to stop container "
+                            f"{info.container_id}: {e}",
+                        )
+                    try:
+                        self.client.remove(info.container_id, force=True)
+                    except Exception as e:
+                        logger.debug(
+                            f"Failed to remove container "
+                            f"{info.container_id}: {e}",
+                        )
+
+                    # upload storage if needed
+                    if info.mount_dir and info.storage_path:
+                        try:
+                            self.storage.upload_folder(
+                                info.mount_dir,
+                                info.storage_path,
+                            )
+                        except Exception as e:
+                            logger.warning(
+                                f"upload_folder failed for {container_name}:"
+                                f" {e}",
+                            )
+
+                    # mark recycled, keep model
+                    info.state = ContainerState.RECYCLED
+                    info.recycled_at = now
+                    info.recycle_reason = reason
+                    info.updated_at = now
+
+                    # keep session_ctx_id for restore
+                    info.session_ctx_id = session_ctx_id
+                    if info.meta is None:
+                        info.meta = {}
+                    info.meta["session_ctx_id"] = session_ctx_id
+
+                    self.container_mapping.set(
+                        info.container_name,
+                        info.model_dump(),
+                    )
+
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to recycle container {container_name} for "
+                        f"session {session_ctx_id}: {e}",
+                    )
+
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to reap session {session_ctx_id}: {e}")
+            logger.debug(traceback.format_exc())
+            return False
+
+    def restore_session(self, session_ctx_id: str) -> None:
+        """
+        Restore ALL recycled sandboxes (containers) for a session.
+
+        For each container record with state==RECYCLED in session_mapping[
+        session_ctx_id]:
+        - If mount_dir is empty -> allocate from pool
+          (prefer same sandbox_type).
+        - If mount_dir exists -> create a new container with that
+          mount_dir/storage_path.
+        - Bind new container to this session and mark RUNNING.
+        - Archive the old recycled record (mark RELEASED).
+
+        After restore:
+        - session_mapping[session_ctx_id] will be replaced with the list of
+          NEW running containers.
+        """
+        env_ids = self.get_session_mapping(session_ctx_id) or []
+        if not env_ids:
+            return
+
+        new_container_names: list[str] = []
+        recycled_old_names: list[str] = []
+
+        # 1) restore each recycled container
+        for old_name in list(env_ids):
+            try:
+                old = ContainerModel(**self.get_info(old_name))
+            except Exception:
+                continue
+
+            if old.state != ContainerState.RECYCLED:
+                # keep non-recycled entries as-is (optional). In practice
+                # env_ids should be recycled only.
+                continue
+
+            sandbox_type = old.sandbox_type or self.default_type[0].value
+            meta = {
+                "session_ctx_id": session_ctx_id,
+            }
+
+            # allocate new container
+            if not old.mount_dir:
+                new_name = self.create_from_pool(
+                    sandbox_type=sandbox_type,
+                    meta=meta,
+                )
+            else:
+                new_name = self.create(
+                    sandbox_type=sandbox_type,
+                    meta=meta,
+                    mount_dir=old.mount_dir,
+                    storage_path=old.storage_path,
+                )
+
+            if not new_name:
+                logger.warning(
+                    f"restore_session: failed to restore container {old_name} "
+                    f"for session {session_ctx_id}",
+                )
+                continue
+
+            recycled_old_names.append(old_name)
+            new_container_names.append(new_name)
+
+            # ensure new container is marked RUNNING + bound
+            try:
+                new_cm = ContainerModel(**self.get_info(new_name))
+                now = time.time()
+                new_cm.state = ContainerState.RUNNING
+                new_cm.session_ctx_id = session_ctx_id
+                if new_cm.meta is None:
+                    new_cm.meta = {}
+                new_cm.meta["session_ctx_id"] = session_ctx_id
+                new_cm.meta["sandbox_type"] = sandbox_type
+                new_cm.recycled_at = None
+                new_cm.recycle_reason = None
+                new_cm.updated_at = now
+                self.container_mapping.set(
+                    new_cm.container_name,
+                    new_cm.model_dump(),
+                )
+            except Exception as e:
+                logger.warning(
+                    f"restore_session: failed to mark new container running:"
+                    f" {e}",
+                )
+
+        if not new_container_names:
+            # nothing restored
+            return
+
+        # 2) switch session mapping to restored running containers
+        self.session_mapping.set(session_ctx_id, new_container_names)
+
+        # 3) heartbeat after restore (session-level)
+        self.update_heartbeat(session_ctx_id)
+
+        # 4) archive old recycled records so needs_restore becomes False
+        for old_name in recycled_old_names:
+            try:
+                self.container_mapping.delete(old_name)
+            except Exception as e:
+                logger.warning(
+                    f"restore_session: failed to delete old model"
+                    f" {old_name}: {e}",
+                )
+
+    def scan_heartbeat_once(self) -> dict:
+        """
+        Scan all session_ctx_id in session_mapping and reap those idle
+        beyond timeout. Uses redis distributed lock to avoid multi-instance
+        double reap.
+        """
+        timeout = int(self.config.heartbeat_timeout)
+
+        result = {
+            "scanned_sessions": 0,
+            "reaped_sessions": 0,
+            "skipped_no_heartbeat": 0,
+            "skipped_no_running_containers": 0,
+            "skipped_lock_busy": 0,
+            "skipped_not_idle_after_double_check": 0,
+            "errors": 0,
+        }
+
+        for session_ctx_id in list(self.session_mapping.scan()):
+            result["scanned_sessions"] += 1
+
+            has_running = False
+            try:
+                env_ids = self.get_session_mapping(session_ctx_id) or []
+                for cname in list(env_ids):
+                    try:
+                        cm = ContainerModel(**self.get_info(cname))
+                    except Exception:
+                        continue
+                    if cm.state == ContainerState.RUNNING:
+                        has_running = True
+                        break
+            except Exception:
+                has_running = False
+
+            if not has_running:
+                result["skipped_no_running_containers"] += 1
+                continue
+
+            last_active = self.get_heartbeat(session_ctx_id)
+            if last_active is None:
+                result["skipped_no_heartbeat"] += 1
+                continue
+
+            # Use time.time() consistently to avoid subtle timing skew if
+            # the scan loop itself takes a while under load.
+            if time.time() - last_active <= timeout:
+                continue
+
+            token = self.acquire_heartbeat_lock(session_ctx_id)
+            if not token:
+                result["skipped_lock_busy"] += 1
+                continue
+
+            try:
+                # double-check after lock (avoid racing with a fresh heartbeat)
+                last_active2 = self.get_heartbeat(session_ctx_id)
+                if last_active2 is None:
+                    result["skipped_no_heartbeat"] += 1
+                    continue
+
+                if time.time() - last_active2 <= timeout:
+                    result["skipped_not_idle_after_double_check"] += 1
+                    continue
+
+                ok = self.reap_session(
+                    session_ctx_id,
+                    reason="heartbeat_timeout",
+                )
+                if ok:
+                    result["reaped_sessions"] += 1
+
+            except Exception:
+                result["errors"] += 1
+                logger.warning(
+                    f"scan_heartbeat_once error on session {session_ctx_id}",
+                )
+                logger.debug(traceback.format_exc())
+            finally:
+                self.release_heartbeat_lock(session_ctx_id, token)
+
+        return result
+
+    def scan_pool_once(self) -> dict:
+        """
+        Replenish warm pool for each sandbox_type up to pool_size.
+
+        Note:
+        - No distributed lock by design (multi-instance may overfill slightly).
+        - Pool containers are WARM (no session_ctx_id).
+        """
+        result = {
+            "types": 0,
+            "created": 0,
+            "enqueued": 0,
+            "failed_create": 0,
+            "skipped_pool_disabled": 0,
+        }
+
+        if self.pool_size <= 0:
+            result["skipped_pool_disabled"] = 1
+            return result
+
+        for t in self.default_type:
+            result["types"] += 1
+            queue = self.pool_queues.get(t)
+            if queue is None:
+                continue
+
+            try:
+                need = int(self.pool_size - queue.size())
+            except Exception:
+                # if queue.size() fails for any reason, skip this type
+                continue
+
+            if need <= 0:
+                continue
+
+            for _ in range(need):
+                try:
+                    # create a WARM container (no session_ctx_id)
+                    container_name = self.create(
+                        sandbox_type=t.value,
+                        meta=None,
+                    )
+                    if not container_name:
+                        result["failed_create"] += 1
+                        continue
+
+                    cm_json = self.container_mapping.get(container_name)
+                    if not cm_json:
+                        result["failed_create"] += 1
+                        continue
+
+                    queue.enqueue(cm_json)
+                    result["created"] += 1
+                    result["enqueued"] += 1
+                except Exception:
+                    result["failed_create"] += 1
+                    logger.debug(traceback.format_exc())
+
+        return result
+
+    def scan_released_cleanup_once(self, max_delete: int = 200) -> dict:
+        """
+        Delete container_mapping records whose state == RELEASED and expired.
+
+        TTL is config.released_key_ttl seconds. 0 disables cleanup.
+        """
+        ttl = int(getattr(self.config, "released_key_ttl", 0))
+        result = {
+            "ttl": ttl,
+            "scanned": 0,
+            "deleted": 0,
+            "skipped_ttl_disabled": 0,
+            "skipped_not_expired": 0,
+            "skipped_not_released": 0,
+            "errors": 0,
+        }
+
+        if ttl <= 0:
+            result["skipped_ttl_disabled"] = 1
+            return result
+
+        now = time.time()
+
+        for key in self.container_mapping.scan(self.prefix):
+            if result["deleted"] >= max_delete:
+                break
+
+            result["scanned"] += 1
+            try:
+                container_json = self.container_mapping.get(key)
+                if not container_json:
+                    continue
+
+                cm = ContainerModel(**container_json)
+
+                if cm.state != ContainerState.RELEASED:
+                    result["skipped_not_released"] += 1
+                    continue
+
+                released_at = cm.released_at or cm.updated_at or 0
+                if released_at <= 0:
+                    # no timestamp -> treat as not expired
+                    result["skipped_not_expired"] += 1
+                    continue
+
+                if now - released_at <= ttl:
+                    result["skipped_not_expired"] += 1
+                    continue
+
+                self.container_mapping.delete(cm.container_name)
+                result["deleted"] += 1
+
+            except Exception as e:
+                result["errors"] += 1
+                logger.debug(
+                    f"scan_released_cleanup_once: {e},"
+                    f" {traceback.format_exc()}",
+                )
+
+        return result
```
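
The reap/restore pair added at the end of `sandbox_manager.py` keeps recycled sessions recoverable: `reap_session()` stops and removes the containers but marks their records RECYCLED and retains `session_ctx_id`, while `restore_session()` re-creates containers and rebinds them. A rough sketch of that flow follows, reusing the `manager` instance from the earlier example; the session id and the decision to run the scans manually (rather than via the background watcher) are assumptions for illustration.

```python
# Hypothetical sketch of a single watcher pass and a later restore.
# Method names come from the diff above; the scenario itself is assumed.

metrics = manager.scan_heartbeat_once()  # reap sessions idle past heartbeat_timeout
print(metrics)  # e.g. {"scanned_sessions": ..., "reaped_sessions": ..., ...}

manager.scan_pool_once()              # refill the WARM pool up to pool_size
manager.scan_released_cleanup_once()  # drop RELEASED records older than released_key_ttl

# When the user returns, rebuild their sandboxes from the RECYCLED records:
# containers come from the pool (no mount_dir) or are recreated with the old
# mount_dir/storage_path, get rebound to the session, marked RUNNING, and a
# fresh heartbeat is written.
manager.restore_session("session-123")
```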