@icyfenix-dmla/cli 2026.5.3-2128 → 2026.5.3-2346
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/server/dmla_progress.py +60 -11
- package/src/server/sandbox.js +20 -3
- package/version.json +2 -2
package/package.json
CHANGED
|
@@ -18,12 +18,61 @@ DMLA 进度报告模块
|
|
|
18
18
|
|
|
19
19
|
import json
|
|
20
20
|
import time
|
|
21
|
+
import sys
|
|
22
|
+
import threading
|
|
23
|
+
import queue
|
|
21
24
|
from pathlib import Path
|
|
22
25
|
from typing import Optional
|
|
23
26
|
|
|
24
27
|
# 进度文件路径
|
|
25
28
|
PROGRESS_FILE = Path('/workspace/progress.json')
|
|
26
29
|
|
|
30
|
+
# stderr 异步写入队列(避免管道阻塞)
|
|
31
|
+
_stderr_queue: queue.Queue = queue.Queue()
|
|
32
|
+
_stderr_thread: Optional[threading.Thread] = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _start_stderr_writer():
    """Ensure the background stderr writer thread is running.

    A new thread targeting ``_stderr_writer_loop`` is created and started
    when no writer exists yet or the previous one is no longer alive.
    The thread is created with ``daemon=True``, so it does not keep the
    interpreter alive once the main thread finishes.
    """
    global _stderr_thread

    current = _stderr_thread
    if current is not None and current.is_alive():
        # A live writer already exists; nothing to do.
        return

    # Publish the new thread object before starting it, mirroring the
    # order in which the module-level reference is updated.
    _stderr_thread = threading.Thread(target=_stderr_writer_loop, daemon=True)
    _stderr_thread.start()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _stderr_writer_loop():
    """Background thread body: drain the queue and write items to stderr.

    Polls ``_stderr_queue`` with a one-second timeout. Each dequeued string
    is written to ``sys.stderr`` and flushed immediately. A ``None`` item
    acts as the stop signal and ends the loop. Any failure while dequeuing
    or writing is swallowed on purpose so that a broken stderr never
    affects the main thread.
    """
    while True:
        try:
            chunk = _stderr_queue.get(timeout=1.0)
        except queue.Empty:
            # Nothing queued within the timeout; keep waiting.
            continue
        except Exception:
            # Unexpected dequeue failure: ignore and retry.
            continue

        if chunk is None:
            # Stop signal.
            return

        try:
            sys.stderr.write(chunk)
            sys.stderr.flush()
        except Exception:
            # Best effort only: a failed write is dropped silently.
            pass
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _write_stderr_async(data: str):
    """Queue ``data`` for asynchronous output on stderr.

    The writer thread is started on demand, then ``data`` is appended to
    ``_stderr_queue`` for the background thread to write. Before enqueuing,
    the oldest pending items are discarded while the backlog holds more
    than 100 entries, which keeps memory use bounded when the consumer
    falls behind.

    Args:
        data: Text to write to stderr.
    """
    _start_stderr_writer()

    backlog_limit = 100
    try:
        # Drop the oldest entries until the backlog is within the limit.
        while _stderr_queue.qsize() > backlog_limit:
            _stderr_queue.get_nowait()
    except queue.Empty:
        # The writer thread drained the queue concurrently; stop trimming.
        pass

    _stderr_queue.put(data)
|
|
75
|
+
|
|
27
76
|
|
|
28
77
|
class ProgressReporter:
|
|
29
78
|
"""
|
|
@@ -214,23 +263,23 @@ class ProgressReporter:
|
|
|
214
263
|
if extra_data:
|
|
215
264
|
data["extra_data"] = extra_data
|
|
216
265
|
|
|
217
|
-
# 1.
|
|
218
|
-
#
|
|
219
|
-
try:
|
|
220
|
-
import sys
|
|
221
|
-
sys.stderr.write(json.dumps(data, ensure_ascii=False) + '\n')
|
|
222
|
-
sys.stderr.flush()
|
|
223
|
-
except Exception as e:
|
|
224
|
-
# stderr 输出失败不影响训练,仅打印警告
|
|
225
|
-
print(f"Warning: Failed to output progress to stderr: {e}")
|
|
226
|
-
|
|
227
|
-
# 2. 文件写入(作为降级/备用方案)
|
|
266
|
+
# 1. 文件写入(优先,确保进度数据持久化)
|
|
267
|
+
# 文件写入是可靠的,不受管道阻塞影响
|
|
228
268
|
try:
|
|
229
269
|
PROGRESS_FILE.write_text(json.dumps(data, ensure_ascii=False))
|
|
230
270
|
except Exception as e:
|
|
231
271
|
# 写入失败不影响训练,仅打印警告
|
|
232
272
|
print(f"Warning: Failed to write progress file: {e}")
|
|
233
273
|
|
|
274
|
+
# 2. stderr 异步输出(用于流式 HTTP 响应)
|
|
275
|
+
# 使用后台线程异步写入,避免管道阻塞主线程
|
|
276
|
+
# Windows Docker 环境下管道缓冲区满时会导致阻塞
|
|
277
|
+
try:
|
|
278
|
+
_write_stderr_async(json.dumps(data, ensure_ascii=False) + '\n')
|
|
279
|
+
except Exception as e:
|
|
280
|
+
# stderr 输出失败不影响训练
|
|
281
|
+
print(f"Warning: Failed to output progress to stderr: {e}")
|
|
282
|
+
|
|
234
283
|
|
|
235
284
|
def get_progress() -> Optional[dict]:
|
|
236
285
|
"""
|
package/src/server/sandbox.js
CHANGED
|
@@ -146,7 +146,8 @@ const SANDBOX_CONFIG = {
|
|
|
146
146
|
imageCpu: 'dmla-sandbox:cpu',
|
|
147
147
|
imageGpu: 'dmla-sandbox:gpu',
|
|
148
148
|
timeout: 60000, // 60 秒超时
|
|
149
|
-
|
|
149
|
+
memoryCpu: 4 * 1024 * 1024 * 1024, // CPU 容器 4GB 内存限制
|
|
150
|
+
memoryGpu: 0 // GPU 容器不限制内存(让 GPU 显存独立管理)
|
|
150
151
|
}
|
|
151
152
|
|
|
152
153
|
// DMLA 配置文件路径
|
|
@@ -480,11 +481,14 @@ export async function runPythonCode(code, useGpu = false, imageOverride = null,
|
|
|
480
481
|
|
|
481
482
|
// 创建容器配置 - 使用 kernel_runner.py 执行代码
|
|
482
483
|
const timeoutSeconds = actualTimeout === null ? 86400 : actualTimeout // unlimited 使用 24 小时
|
|
484
|
+
|
|
485
|
+
// GPU 容器不限制内存,CPU 容器限制 4GB
|
|
486
|
+
const memoryLimit = useGpu ? SANDBOX_CONFIG.memoryGpu : SANDBOX_CONFIG.memoryCpu
|
|
487
|
+
|
|
483
488
|
const containerConfig = {
|
|
484
489
|
Image: image,
|
|
485
490
|
Cmd: ['python3', '/workspace/kernel_runner.py', '--code', code, '--timeout', String(timeoutSeconds)],
|
|
486
491
|
HostConfig: {
|
|
487
|
-
Memory: SANDBOX_CONFIG.memory,
|
|
488
492
|
AutoRemove: false // 手动移除以获取日志
|
|
489
493
|
},
|
|
490
494
|
// matplotlib 使用 IPython Kernel 的 inline 后端,自动发送 display_data
|
|
@@ -496,6 +500,11 @@ export async function runPythonCode(code, useGpu = false, imageOverride = null,
|
|
|
496
500
|
].filter(e => e) // 过滤空字符串
|
|
497
501
|
}
|
|
498
502
|
|
|
503
|
+
// 仅对 CPU 容器设置内存限制(GPU 容器不限制)
|
|
504
|
+
if (memoryLimit > 0) {
|
|
505
|
+
containerConfig.HostConfig.Memory = memoryLimit
|
|
506
|
+
}
|
|
507
|
+
|
|
499
508
|
log('Container config created')
|
|
500
509
|
|
|
501
510
|
// Volume Mount 配置
|
|
@@ -772,11 +781,14 @@ export async function runPythonCodeStreaming(code, useGpu = false, res, imageOve
|
|
|
772
781
|
|
|
773
782
|
// 创建容器配置
|
|
774
783
|
const timeoutSeconds = actualTimeout === null ? 86400 : actualTimeout
|
|
784
|
+
|
|
785
|
+
// GPU 容器不限制内存,CPU 容器限制 4GB
|
|
786
|
+
const memoryLimit = useGpu ? SANDBOX_CONFIG.memoryGpu : SANDBOX_CONFIG.memoryCpu
|
|
787
|
+
|
|
775
788
|
const containerConfig = {
|
|
776
789
|
Image: image,
|
|
777
790
|
Cmd: ['python3', '/workspace/kernel_runner.py', '--code', code, '--timeout', String(timeoutSeconds), '--stream'],
|
|
778
791
|
HostConfig: {
|
|
779
|
-
Memory: SANDBOX_CONFIG.memory,
|
|
780
792
|
AutoRemove: false
|
|
781
793
|
},
|
|
782
794
|
Env: [
|
|
@@ -786,6 +798,11 @@ export async function runPythonCodeStreaming(code, useGpu = false, res, imageOve
|
|
|
786
798
|
].filter(e => e)
|
|
787
799
|
}
|
|
788
800
|
|
|
801
|
+
// 仅对 CPU 容器设置内存限制(GPU 容器不限制)
|
|
802
|
+
if (memoryLimit > 0) {
|
|
803
|
+
containerConfig.HostConfig.Memory = memoryLimit
|
|
804
|
+
}
|
|
805
|
+
|
|
789
806
|
log('Container config created for streaming')
|
|
790
807
|
|
|
791
808
|
// Volume Mount 配置(与 runPythonCode 相同)
|
package/version.json
CHANGED