@icyfenix-dmla/cli 2026.5.3-1305 → 2026.5.3-2128
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/server/dmla_progress.py +29 -7
- package/src/server/kernel_runner.py +84 -19
- package/src/server/routes/sandbox.js +87 -1
- package/src/server/sandbox.js +313 -0
- package/version.json +2 -2
package/package.json
CHANGED
package/src/server/dmla_progress.py
CHANGED
|
@@ -36,7 +36,8 @@ class ProgressReporter:
|
|
|
36
36
|
self,
|
|
37
37
|
total_steps: int,
|
|
38
38
|
description: str = "",
|
|
39
|
-
start_step: int = 0
|
|
39
|
+
start_step: int = 0,
|
|
40
|
+
clear_existing: bool = True
|
|
40
41
|
):
|
|
41
42
|
"""
|
|
42
43
|
初始化进度报告器
|
|
@@ -45,17 +46,26 @@ class ProgressReporter:
|
|
|
45
46
|
total_steps: 总步数(如 epoch 数)
|
|
46
47
|
description: 任务描述
|
|
47
48
|
start_step: 起始步数(默认 0)
|
|
49
|
+
clear_existing: 是否清除已存在的进度文件(默认 True)
|
|
48
50
|
"""
|
|
49
51
|
self.total_steps = total_steps
|
|
50
52
|
self.description = description
|
|
51
53
|
self.current_step = start_step
|
|
52
54
|
self.start_time = time.time()
|
|
53
55
|
|
|
54
|
-
#
|
|
56
|
+
# 清除旧的进度文件,避免显示上一个任务的进度
|
|
57
|
+
if clear_existing:
|
|
58
|
+
clear_progress()
|
|
59
|
+
|
|
60
|
+
# 计算初始百分比
|
|
61
|
+
initial_percent = (start_step / total_steps) * 100 if total_steps > 0 else 0
|
|
62
|
+
|
|
63
|
+
# 写入初始状态(包含 percent)
|
|
55
64
|
self._write_progress(
|
|
56
65
|
step=start_step,
|
|
57
66
|
status="starting",
|
|
58
|
-
message=description
|
|
67
|
+
message=description,
|
|
68
|
+
percent=initial_percent
|
|
59
69
|
)
|
|
60
70
|
|
|
61
71
|
def update(
|
|
@@ -117,11 +127,12 @@ class ProgressReporter:
|
|
|
117
127
|
self.start_time = time.time()
|
|
118
128
|
self.current_step = 0
|
|
119
129
|
|
|
120
|
-
#
|
|
130
|
+
# 写入新阶段的初始状态(包含 percent=0)
|
|
121
131
|
self._write_progress(
|
|
122
132
|
step=0,
|
|
123
133
|
status="starting",
|
|
124
|
-
message=self.description
|
|
134
|
+
message=self.description,
|
|
135
|
+
percent=0.0
|
|
125
136
|
)
|
|
126
137
|
|
|
127
138
|
def complete(self, message: str = "", extra_data: Optional[dict] = None):
|
|
@@ -175,7 +186,7 @@ class ProgressReporter:
|
|
|
175
186
|
extra_data: Optional[dict] = None
|
|
176
187
|
):
|
|
177
188
|
"""
|
|
178
|
-
|
|
189
|
+
写入进度信息(stdout + 文件双模式)
|
|
179
190
|
|
|
180
191
|
Args:
|
|
181
192
|
step: 当前步数
|
|
@@ -187,6 +198,7 @@ class ProgressReporter:
|
|
|
187
198
|
extra_data: 额外数据
|
|
188
199
|
"""
|
|
189
200
|
data = {
|
|
201
|
+
"type": "progress", # 流式消息类型标识
|
|
190
202
|
"description": self.description,
|
|
191
203
|
"total_steps": self.total_steps,
|
|
192
204
|
"current_step": step,
|
|
@@ -202,7 +214,17 @@ class ProgressReporter:
|
|
|
202
214
|
if extra_data:
|
|
203
215
|
data["extra_data"] = extra_data
|
|
204
216
|
|
|
205
|
-
#
|
|
217
|
+
# 1. stderr 输出(用于流式 HTTP 响应,与 stdout 分开避免合并)
|
|
218
|
+
# Jupyter kernel 会将 stdout 和 stderr 分到不同的 stream 消息
|
|
219
|
+
try:
|
|
220
|
+
import sys
|
|
221
|
+
sys.stderr.write(json.dumps(data, ensure_ascii=False) + '\n')
|
|
222
|
+
sys.stderr.flush()
|
|
223
|
+
except Exception as e:
|
|
224
|
+
# stderr 输出失败不影响训练,仅打印警告
|
|
225
|
+
print(f"Warning: Failed to output progress to stderr: {e}")
|
|
226
|
+
|
|
227
|
+
# 2. 文件写入(作为降级/备用方案)
|
|
206
228
|
try:
|
|
207
229
|
PROGRESS_FILE.write_text(json.dumps(data, ensure_ascii=False))
|
|
208
230
|
except Exception as e:
|
|
package/src/server/kernel_runner.py
CHANGED
|
@@ -156,18 +156,19 @@ def enrich_cuda_error(error_dict: dict) -> dict:
|
|
|
156
156
|
return error_dict
|
|
157
157
|
|
|
158
158
|
|
|
159
|
-
def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
159
|
+
def run_code(code: str, timeout: int = DEFAULT_TIMEOUT, stream: bool = False) -> dict:
|
|
160
160
|
"""
|
|
161
161
|
使用 IPython Kernel 执行代码
|
|
162
162
|
|
|
163
163
|
Args:
|
|
164
164
|
code: 要执行的 Python 代码
|
|
165
165
|
timeout: 执行超时时间(秒)
|
|
166
|
+
stream: 是否启用流式输出模式(实时输出每个消息)
|
|
166
167
|
|
|
167
168
|
Returns:
|
|
168
|
-
包含 success, outputs, executionTime
|
|
169
|
+
包含 success, outputs, executionTime 的字典(stream 模式下返回空字典)
|
|
169
170
|
"""
|
|
170
|
-
log_debug(f'run_code called, code length={len(code)}, timeout={timeout}')
|
|
171
|
+
log_debug(f'run_code called, code length={len(code)}, timeout={timeout}, stream={stream}')
|
|
171
172
|
|
|
172
173
|
# 注意:不再在执行期间抑制 stdout
|
|
173
174
|
# stdout 只在导入阶段抑制(避免 matplotlib 等导入输出污染结果)
|
|
@@ -179,9 +180,11 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
179
180
|
deadline = start_time + timeout
|
|
180
181
|
km = None
|
|
181
182
|
kc = None
|
|
182
|
-
outputs = []
|
|
183
|
+
outputs = [] # stream 模式下不使用,保留用于非流式模式
|
|
183
184
|
timed_out = False
|
|
184
185
|
msg_count = 0
|
|
186
|
+
has_error = False
|
|
187
|
+
final_outputs = [] # 用于最终 result 消息的 outputs 汇总
|
|
185
188
|
|
|
186
189
|
try:
|
|
187
190
|
# 1. 启动 Kernel(抑制 stdout 避免启动输出污染)
|
|
@@ -248,28 +251,48 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
248
251
|
|
|
249
252
|
# 处理不同类型的输出
|
|
250
253
|
if msg_type == 'stream':
|
|
251
|
-
|
|
254
|
+
stream_output = {
|
|
252
255
|
'type': 'stream',
|
|
253
256
|
'name': content.get('name', 'stdout'),
|
|
254
257
|
'text': content.get('text', '')
|
|
255
|
-
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
if stream:
|
|
261
|
+
# 流式模式:立即输出
|
|
262
|
+
print(json.dumps(stream_output, ensure_ascii=False), flush=True)
|
|
263
|
+
else:
|
|
264
|
+
outputs.append(stream_output)
|
|
256
265
|
log_debug(f'Stream output: {content.get("name")} len={len(content.get("text", ""))}')
|
|
257
266
|
|
|
258
267
|
elif msg_type == 'display_data':
|
|
259
|
-
|
|
268
|
+
display_output = {
|
|
260
269
|
'type': 'display_data',
|
|
261
270
|
'data': content.get('data', {}),
|
|
262
271
|
'metadata': content.get('metadata', {})
|
|
263
|
-
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
if stream:
|
|
275
|
+
# 流式模式:立即输出
|
|
276
|
+
print(json.dumps(display_output, ensure_ascii=False), flush=True)
|
|
277
|
+
else:
|
|
278
|
+
outputs.append(display_output)
|
|
279
|
+
final_outputs.append(display_output) # 汇总到最终结果
|
|
264
280
|
log_debug(f'Display data: keys={list(content.get("data", {}).keys())}')
|
|
265
281
|
|
|
266
282
|
elif msg_type == 'execute_result':
|
|
267
|
-
|
|
283
|
+
result_output = {
|
|
268
284
|
'type': 'execute_result',
|
|
269
285
|
'data': content.get('data', {}),
|
|
270
286
|
'metadata': content.get('metadata', {}),
|
|
271
287
|
'execution_count': content.get('execution_count')
|
|
272
|
-
}
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
if stream:
|
|
291
|
+
# 流式模式:立即输出
|
|
292
|
+
print(json.dumps(result_output, ensure_ascii=False), flush=True)
|
|
293
|
+
else:
|
|
294
|
+
outputs.append(result_output)
|
|
295
|
+
final_outputs.append(result_output) # 汇总到最终结果
|
|
273
296
|
log_debug(f'Execute result: keys={list(content.get("data", {}).keys())}')
|
|
274
297
|
|
|
275
298
|
elif msg_type == 'error':
|
|
@@ -286,7 +309,14 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
286
309
|
error_output = enrich_cuda_error(error_output)
|
|
287
310
|
log_debug(f'Detected CUDA compatibility error, enriched output')
|
|
288
311
|
|
|
289
|
-
|
|
312
|
+
has_error = True
|
|
313
|
+
|
|
314
|
+
if stream:
|
|
315
|
+
# 流式模式:立即输出
|
|
316
|
+
print(json.dumps(error_output, ensure_ascii=False), flush=True)
|
|
317
|
+
else:
|
|
318
|
+
outputs.append(error_output)
|
|
319
|
+
final_outputs.append(error_output) # 汇总到最终结果
|
|
290
320
|
log_debug(f'Error: {content.get("ename")}: {content.get("evalue")}')
|
|
291
321
|
|
|
292
322
|
elif msg_type == 'clear_output':
|
|
@@ -306,7 +336,7 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
306
336
|
log_debug(f'Suppressed output preview: {suppressed_content[:500]}')
|
|
307
337
|
|
|
308
338
|
if timed_out:
|
|
309
|
-
|
|
339
|
+
timeout_result = {
|
|
310
340
|
'success': False,
|
|
311
341
|
'outputs': [{
|
|
312
342
|
'type': 'error',
|
|
@@ -316,9 +346,33 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
316
346
|
}],
|
|
317
347
|
'executionTime': round(execution_time, 3)
|
|
318
348
|
}
|
|
349
|
+
if stream:
|
|
350
|
+
# 流式模式:输出超时消息
|
|
351
|
+
print(json.dumps({'type': 'error', 'ename': 'TimeoutError',
|
|
352
|
+
'evalue': f'Execution timed out after {timeout} seconds',
|
|
353
|
+
'traceback': [f'Execution timed out after {timeout} seconds']},
|
|
354
|
+
ensure_ascii=False), flush=True)
|
|
355
|
+
print(json.dumps({'type': 'result', 'success': False,
|
|
356
|
+
'executionTime': round(execution_time, 3)},
|
|
357
|
+
ensure_ascii=False), flush=True)
|
|
358
|
+
return {}
|
|
359
|
+
return timeout_result
|
|
360
|
+
|
|
361
|
+
success = not has_error
|
|
362
|
+
|
|
363
|
+
if stream:
|
|
364
|
+
# 流式模式:输出最终结果消息
|
|
365
|
+
result_msg = {
|
|
366
|
+
'type': 'result',
|
|
367
|
+
'success': success,
|
|
368
|
+
'outputs': final_outputs,
|
|
369
|
+
'executionTime': round(execution_time, 3)
|
|
370
|
+
}
|
|
371
|
+
print(json.dumps(result_msg, ensure_ascii=False), flush=True)
|
|
372
|
+
return {}
|
|
319
373
|
|
|
320
374
|
return {
|
|
321
|
-
'success':
|
|
375
|
+
'success': success,
|
|
322
376
|
'outputs': outputs,
|
|
323
377
|
'executionTime': round(execution_time, 3)
|
|
324
378
|
}
|
|
@@ -341,6 +395,15 @@ def run_code(code: str, timeout: int = DEFAULT_TIMEOUT) -> dict:
|
|
|
341
395
|
error_output = enrich_cuda_error(error_output)
|
|
342
396
|
log_debug(f'Detected CUDA compatibility error in exception, enriched output')
|
|
343
397
|
|
|
398
|
+
if stream:
|
|
399
|
+
# 流式模式:输出错误消息和结果消息
|
|
400
|
+
print(json.dumps(error_output, ensure_ascii=False), flush=True)
|
|
401
|
+
print(json.dumps({'type': 'result', 'success': False,
|
|
402
|
+
'outputs': [error_output],
|
|
403
|
+
'executionTime': round(execution_time, 3)},
|
|
404
|
+
ensure_ascii=False), flush=True)
|
|
405
|
+
return {}
|
|
406
|
+
|
|
344
407
|
return {
|
|
345
408
|
'success': False,
|
|
346
409
|
'outputs': [error_output],
|
|
@@ -428,6 +491,7 @@ def main():
|
|
|
428
491
|
parser.add_argument('--code', type=str, required=True, help='要执行的 Python 代码')
|
|
429
492
|
parser.add_argument('--timeout', type=int, default=DEFAULT_TIMEOUT, help='执行超时时间(秒)')
|
|
430
493
|
parser.add_argument('--check-cuda', action='store_true', help='仅检查 CUDA 兼容性')
|
|
494
|
+
parser.add_argument('--stream', action='store_true', help='启用流式输出模式(实时输出每个消息)')
|
|
431
495
|
|
|
432
496
|
args = parser.parse_args()
|
|
433
497
|
|
|
@@ -437,13 +501,14 @@ def main():
|
|
|
437
501
|
print(json.dumps(result, ensure_ascii=False))
|
|
438
502
|
return
|
|
439
503
|
|
|
440
|
-
result = run_code(args.code, args.timeout)
|
|
504
|
+
result = run_code(args.code, args.timeout, stream=args.stream)
|
|
441
505
|
|
|
442
|
-
#
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
506
|
+
# 非流式模式:输出 JSON 结果到 stdout
|
|
507
|
+
if not args.stream:
|
|
508
|
+
output_json = json.dumps(result, ensure_ascii=False)
|
|
509
|
+
log_debug(f'Final JSON output length: {len(output_json)}')
|
|
510
|
+
log_debug(f'Final JSON preview: {output_json[:500]}')
|
|
511
|
+
print(output_json)
|
|
447
512
|
|
|
448
513
|
|
|
449
514
|
if __name__ == '__main__':
|
|
package/src/server/routes/sandbox.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import { Router } from 'express'
|
|
5
5
|
import Docker from 'dockerode'
|
|
6
|
-
import sandbox, { runPythonCode, checkImageExists, checkGPUAvailable, checkCUDACompatibility, abortExecution } from '../sandbox.js'
|
|
6
|
+
import sandbox, { runPythonCode, runPythonCodeStreaming, checkImageExists, checkGPUAvailable, checkCUDACompatibility, abortExecution } from '../sandbox.js'
|
|
7
7
|
|
|
8
8
|
const { SANDBOX_CONFIG } = sandbox
|
|
9
9
|
const docker = new Docker()
|
|
@@ -121,6 +121,92 @@ router.post('/run', async (req, res) => {
|
|
|
121
121
|
}
|
|
122
122
|
})
|
|
123
123
|
|
|
124
|
+
/**
 * Stream the execution of a code snippet.
 *
 * POST /api/sandbox/stream
 * Body: { code: string, useGpu?: boolean, timeout?: number|null }
 * Response: JSON Lines (one JSON message per line), written while the code runs.
 *
 * Request validation and image/GPU availability problems are answered with an
 * ordinary JSON error (400 / 503) before any streaming starts.
 */
router.post('/stream', async (req, res) => {
  // Fall back to an empty object so that a missing/unparsed body produces the
  // 400 below instead of a TypeError thrown outside the try block (which an
  // async handler would surface as an unhandled rejection).
  const { code, useGpu = false, timeout = null } = req.body || {}

  // Validate the request
  if (!code || typeof code !== 'string') {
    return res.status(400).json({
      success: false,
      error: 'Missing or invalid code parameter'
    })
  }

  // Code length limit
  if (code.length > 100000) {
    return res.status(400).json({
      success: false,
      error: 'Code too long (max 100KB)'
    })
  }

  // Validate the timeout parameter (null is passed through unchanged)
  if (timeout !== null && timeout !== undefined) {
    const validTimeout = typeof timeout === 'number' &&
      Number.isFinite(timeout) &&
      timeout >= 0 &&
      timeout <= 86400
    if (!validTimeout) {
      return res.status(400).json({
        success: false,
        error: 'Invalid timeout parameter (must be null or number 0-86400)'
      })
    }
  }

  try {
    // Check that the image exists, with a fallback: a CPU request may run on
    // the GPU image (without GPU devices) when only that image is installed.
    let actualUseGpu = useGpu
    let actualImage = null
    let imageExists = await checkImageExists(useGpu)

    if (!imageExists && !useGpu) {
      const gpuImageExists = await checkImageExists(true)
      if (gpuImageExists) {
        imageExists = true
        actualUseGpu = false
        actualImage = SANDBOX_CONFIG.imageGpu
        console.log('[Sandbox Stream] CPU镜像不存在,使用GPU镜像执行')
      }
    }

    if (!imageExists) {
      return res.status(503).json({
        success: false,
        error: useGpu
          ? 'GPU 镜像未安装。请运行 npm run build:sandbox:gpu 或 dmla install --gpu'
          : '沙箱镜像未安装。请运行 npm run build:sandbox:cpu 或 dmla install --cpu'
      })
    }

    // When GPU execution was requested, make sure GPU hardware is usable
    if (actualUseGpu) {
      const gpuAvailable = await checkGPUAvailable()
      if (!gpuAvailable) {
        return res.status(503).json({
          success: false,
          error: 'GPU 硬件不可用。请确保系统安装了 NVIDIA GPU 驱动和 nvidia-container-toolkit。'
        })
      }
    }

    // Run the code; the callee writes the streamed response and ends it
    await runPythonCodeStreaming(code, actualUseGpu, res, actualImage, timeout)
  } catch (error) {
    console.error('Sandbox stream error:', error)
    // Once the headers are out the status code can no longer be changed
    if (!res.headersSent) {
      res.status(500).json({
        success: false,
        error: error.message || 'Internal sandbox error'
      })
    }
  }
})
|
|
209
|
+
|
|
124
210
|
/**
|
|
125
211
|
* GPU 状态检查
|
|
126
212
|
*/
|
package/src/server/sandbox.js
CHANGED
|
@@ -708,6 +708,318 @@ export async function runPythonCode(code, useGpu = false, imageOverride = null,
|
|
|
708
708
|
}
|
|
709
709
|
}
|
|
710
710
|
|
|
711
|
+
/**
 * Interpret one line of container output as a kernel_runner protocol message.
 *
 * @param {string} text - A single line of container output.
 * @returns {object|null} The parsed object when the line is valid JSON with a
 *   string `type` field, otherwise null.
 */
function parseStreamingMessage(text) {
  const trimmed = text.trim()
  if (!trimmed.startsWith('{')) return null
  try {
    const parsed = JSON.parse(trimmed)
    const isMessage = parsed !== null &&
      typeof parsed === 'object' &&
      typeof parsed.type === 'string'
    return isMessage ? parsed : null
  } catch (err) {
    return null
  }
}

/**
 * Create an incremental decoder for Docker's multiplexed log stream.
 *
 * The stream is a sequence of frames: an 8-byte header (byte 0 = stream id,
 * bytes 4-7 = big-endian payload length) followed by the payload. A network
 * chunk can end in the middle of a frame and a long line can span several
 * frames, so bytes are buffered until a whole frame, and then a whole line,
 * is available. Lines are decoded only once complete, which also keeps
 * multi-byte UTF-8 sequences intact.
 *
 * @param {(streamType: 'stdout'|'stderr', text: string) => void} onLine -
 *   Called once per complete line, without the trailing newline.
 * @returns {{push: (chunk: Buffer) => void, flush: () => void}} `push` feeds
 *   raw bytes; `flush` emits any unterminated final line per stream.
 */
function createDockerLogLineDecoder(onLine) {
  const HEADER_SIZE = 8
  const NEWLINE = 0x0a
  let pendingFrames = Buffer.alloc(0)
  const partialLines = { stdout: Buffer.alloc(0), stderr: Buffer.alloc(0) }

  const emitCompleteLines = (streamType, payload) => {
    let data = Buffer.concat([partialLines[streamType], payload])
    let newlineAt = data.indexOf(NEWLINE)
    while (newlineAt !== -1) {
      onLine(streamType, data.subarray(0, newlineAt).toString('utf8'))
      data = data.subarray(newlineAt + 1)
      newlineAt = data.indexOf(NEWLINE)
    }
    partialLines[streamType] = data
  }

  return {
    push(chunk) {
      pendingFrames = Buffer.concat([pendingFrames, chunk])
      let offset = 0
      while (offset + HEADER_SIZE <= pendingFrames.length) {
        const payloadStart = offset + HEADER_SIZE
        const payloadEnd = payloadStart + pendingFrames.readUInt32BE(offset + 4)
        // Incomplete frame: keep it for the next chunk
        if (payloadEnd > pendingFrames.length) break
        const streamType = pendingFrames[offset] === 1 ? 'stdout' : 'stderr'
        emitCompleteLines(streamType, pendingFrames.subarray(payloadStart, payloadEnd))
        offset = payloadEnd
      }
      pendingFrames = pendingFrames.subarray(offset)
    },
    flush() {
      for (const streamType of ['stdout', 'stderr']) {
        const rest = partialLines[streamType]
        partialLines[streamType] = Buffer.alloc(0)
        if (rest.length > 0) onLine(streamType, rest.toString('utf8'))
      }
    }
  }
}

/**
 * Run Python code in a sandbox container and stream its output.
 *
 * Follows the container's Docker logs and forwards them to the HTTP response
 * as JSON Lines. Lines that are already protocol messages (JSON objects with a
 * `type` field, as printed by kernel_runner.py in --stream mode) are forwarded
 * unchanged; any other line is wrapped in a `stream` message. The response is
 * always ended and the container always removed before this function returns.
 *
 * NOTE(review): a client disconnect does not stop the container here; it keeps
 * running until it exits by itself or is aborted elsewhere — confirm that this
 * is intended.
 *
 * @param {string} code - Python source to execute
 * @param {boolean} useGpu - Whether to expose GPU devices to the container
 * @param {object} res - Express response object
 * @param {string|null} imageOverride - Optional image name to use instead of the configured one
 * @param {number|null} timeoutOverride - Optional timeout in seconds; null means unlimited
 * @returns {Promise<void>}
 */
export async function runPythonCodeStreaming(code, useGpu = false, res, imageOverride = null, timeoutOverride = null) {
  const startTime = Date.now()
  const elapsedSeconds = () => (Date.now() - startTime) / 1000
  const send = (message) => res.write(JSON.stringify(message) + '\n')

  // Unique id used to register the container in the active list
  const executionId = generateExecutionId()

  // Effective timeout: null = unlimited, falsy (0) = configured default
  const actualTimeout = timeoutOverride === null
    ? null // unlimited
    : (timeoutOverride || Math.floor(SANDBOX_CONFIG.timeout / 1000))

  log(`runPythonCodeStreaming called, executionId=${executionId}, useGpu=${useGpu}, code length=${code.length}`)

  // Streaming response headers
  res.setHeader('Content-Type', 'application/x-ndjson')
  res.setHeader('Transfer-Encoding', 'chunked')
  res.setHeader('X-Accel-Buffering', 'no') // disable nginx buffering
  res.setHeader('Cache-Control', 'no-cache')
  res.setHeader('Connection', 'keep-alive')

  // Pick the image
  const image = imageOverride || (useGpu ? SANDBOX_CONFIG.imageGpu : SANDBOX_CONFIG.imageCpu)
  log(`Using image: ${image}`)

  // GPU compatibility pre-check
  if (useGpu) {
    log('GPU mode: running CUDA compatibility pre-check...')
    const compatResult = await checkCUDACompatibility()
    log(`CUDA compatibility check result: ${JSON.stringify(compatResult)}`)

    if (!compatResult.compatible) {
      log('CUDA compatibility check failed')
      send({
        type: 'error',
        ename: 'CUDACompatError',
        evalue: 'CUDA 兼容性错误',
        traceback: ['PyTorch CUDA 版本与您的 GPU 不兼容,请使用 CPU 模式运行']
      })
      send({
        type: 'result',
        success: false,
        executionTime: elapsedSeconds()
      })
      res.end()
      return
    }
    log('CUDA compatibility check passed')
  }

  // Container configuration. kernel_runner.py parses --timeout as an integer,
  // so a fractional value is rounded up instead of making the runner exit
  // with an argument error.
  const timeoutSeconds = actualTimeout === null ? 86400 : Math.ceil(actualTimeout)
  const containerConfig = {
    Image: image,
    Cmd: ['python3', '/workspace/kernel_runner.py', '--code', code, '--timeout', String(timeoutSeconds), '--stream'],
    HostConfig: {
      Memory: SANDBOX_CONFIG.memory,
      AutoRemove: false
    },
    Env: [
      'PYTHONUNBUFFERED=1',
      'PYTHONPATH=/workspace',
      actualTimeout === null ? 'DMLA_NO_TIMEOUT=1' : ''
    ].filter(e => e)
  }

  log('Container config created for streaming')

  // Volume mounts (same layout as runPythonCode)
  const useMount = shouldMountSharedModules()
  const sharedModulesPath = getSharedModulesPath()
  const mountKernelRunner = shouldMountKernelRunner()
  const kernelRunnerPath = getKernelRunnerPath()
  const binds = []

  // Data directory
  const dataVolumePath = getDataVolumePath()
  if (dataVolumePath && fs.existsSync(dataVolumePath)) {
    binds.push(`${dataVolumePath}:/data`)
  }

  // Shared modules
  if (useMount && sharedModulesPath && fs.existsSync(sharedModulesPath)) {
    binds.push(`${sharedModulesPath}:/workspace/shared:ro`)
  }

  // kernel_runner.py
  if (mountKernelRunner && kernelRunnerPath && fs.existsSync(kernelRunnerPath)) {
    binds.push(`${kernelRunnerPath}:/workspace/kernel_runner.py:ro`)
  }

  // dmla_progress.py
  const progressReporterPath = getProgressReporterPath()
  if (mountKernelRunner && progressReporterPath && fs.existsSync(progressReporterPath)) {
    binds.push(`${progressReporterPath}:/workspace/dmla_progress.py:ro`)
  }

  if (binds.length > 0) {
    containerConfig.HostConfig.Binds = binds
  }

  // GPU devices
  if (useGpu) {
    containerConfig.HostConfig.DeviceRequests = [{
      Driver: 'nvidia',
      Count: -1,
      Capabilities: [['gpu']]
    }]
  }

  let container = null

  try {
    // Create the container
    log('Creating container for streaming...')
    container = await docker.createContainer(containerConfig)
    log(`Container created: ${container.id}`)

    // Register it in the active container list
    registerContainer(executionId, container)

    // Tell the client the container is starting
    send({
      type: 'status',
      status: 'starting',
      message: '正在启动容器...',
      executionId
    })

    // Start the container
    log('Starting container...')
    await container.start()
    log('Container started')

    // Tell the client the code is running
    send({
      type: 'status',
      status: 'running',
      message: '代码执行中...'
    })

    // Follow the logs from the beginning. No `tail` limit: the container is
    // already running, so skipping existing lines would lose whatever it
    // printed before this request was made.
    log('Starting log stream...')
    const logStream = await container.logs({
      stdout: true,
      stderr: true,
      follow: true, // live tracking
      timestamps: false
    })

    // Whether the runner reported its own final result
    let sawResult = false

    const forwardLine = (streamType, text) => {
      if (!text || !text.trim()) return
      log(`Stream output (${streamType}): ${text.substring(0, 100)}...`)
      const message = parseStreamingMessage(text)
      if (message) {
        // Already a protocol message (stream, result, progress, ...): forward as is
        if (message.type === 'result') sawResult = true
        res.write(text + '\n')
      } else {
        // Anything else (e.g. container start-up logs) is wrapped as a stream message
        send({
          type: 'stream',
          name: streamType,
          text: text
        })
      }
    }

    const decoder = createDockerLogLineDecoder(forwardLine)

    logStream.on('data', (chunk) => {
      if (Buffer.isBuffer(chunk)) {
        decoder.push(chunk)
      } else {
        // String chunks (fallback): no frame headers to strip
        for (const line of chunk.toString().split('\n')) {
          forwardLine('stdout', line)
        }
      }
    })

    logStream.on('error', (err) => {
      log(`Log stream error: ${err.message}`)
      send({
        type: 'error',
        ename: 'StreamError',
        evalue: err.message,
        traceback: [err.message]
      })
    })

    // Settle on any terminal event so a failed or closed stream cannot leave
    // the request hanging; registered before waiting so no event is missed.
    const logStreamDone = new Promise((resolve) => {
      logStream.once('end', resolve)
      logStream.once('close', resolve)
      logStream.once('error', resolve)
      if (logStream.destroyed || logStream.readableEnded) {
        resolve()
      }
    })

    // Wait for the container to finish
    log('Waiting for container to finish...')
    const waitResult = await container.wait()
    log('Container finished')

    // Wait for the log stream to drain, then emit any unterminated last line
    await logStreamDone
    decoder.flush()
    log('Log stream ended')

    // The runner normally prints the final result itself. If the container
    // ended without one (killed, crashed before the kernel started, ...),
    // close the protocol explicitly so the client is not left without an outcome.
    if (!sawResult) {
      const exitCode = waitResult && typeof waitResult.StatusCode === 'number'
        ? waitResult.StatusCode
        : 'unknown'
      const reason = `Container exited with code ${exitCode} before reporting a result`
      log(reason)
      send({
        type: 'error',
        ename: 'ContainerExitError',
        evalue: reason,
        traceback: [reason]
      })
      send({
        type: 'result',
        success: false,
        executionTime: elapsedSeconds()
      })
    }
  } catch (error) {
    log(`Streaming execution error: ${error.message}`)
    log(`Error stack: ${error.stack}`)

    send({
      type: 'error',
      ename: error.name || 'ExecutionError',
      evalue: error.message || 'Unknown error',
      traceback: [error.message || 'Unknown error']
    })

    send({
      type: 'result',
      success: false,
      executionTime: elapsedSeconds()
    })
  } finally {
    // Remove from the active list
    unregisterContainer(executionId)

    // Clean up the container
    log('Cleaning up container...')
    if (container) {
      try {
        await container.remove({ force: true })
        log('Container removed')
      } catch (e) {
        log(`Container cleanup error: ${e.message}`)
      }
    }

    // End the response
    res.end()
    log('Streaming response ended')
  }
}
|
|
981
|
+
|
|
982
|
+
/**
 * Split a buffer of Docker multiplexed log frames into text lines.
 *
 * Frame layout: an 8-byte header followed by the payload. Byte 0 of the header
 * is the stream id (1 = stdout; every other value is reported as stderr) and
 * bytes 4-7 hold the payload length as a big-endian uint32.
 *
 * Only frames fully contained in `buffer` are decoded: parsing stops at the
 * first truncated header or payload and the remaining bytes are ignored.
 * Each payload is split on '\n' and whitespace-only lines are dropped.
 *
 * @param {Buffer} buffer - Raw bytes from the Docker log stream
 * @returns {Array<{streamType: string, text: string}>} Decoded lines in stream
 *   order; `streamType` is 'stdout' or 'stderr'. Empty for a non-Buffer or
 *   empty input.
 */
function parseDockerLogLines(buffer) {
  if (!Buffer.isBuffer(buffer) || buffer.length === 0) return []

  const HEADER_SIZE = 8
  const lines = []
  let offset = 0

  // Stop as soon as a complete header no longer fits
  while (offset + HEADER_SIZE <= buffer.length) {
    const streamType = buffer[offset] === 1 ? 'stdout' : 'stderr'
    const payloadStart = offset + HEADER_SIZE
    const payloadEnd = payloadStart + buffer.readUInt32BE(offset + 4)

    // Stop when the payload is cut off
    if (payloadEnd > buffer.length) break

    // subarray is the non-deprecated equivalent of Buffer#slice (a view, no copy)
    const payload = buffer.subarray(payloadStart, payloadEnd).toString('utf8')
    offset = payloadEnd

    // One Docker frame may carry several lines
    for (const text of payload.split('\n')) {
      if (text.trim()) {
        lines.push({ streamType, text })
      }
    }
  }

  return lines
}
|
|
1022
|
+
|
|
711
1023
|
/**
|
|
712
1024
|
* 解析 Docker 日志输出
|
|
713
1025
|
* Docker 日志格式: [8字节头][数据]
|
|
@@ -826,6 +1138,7 @@ export async function pullImage(useGpu = false) {
|
|
|
826
1138
|
|
|
827
1139
|
export default {
|
|
828
1140
|
runPythonCode,
|
|
1141
|
+
runPythonCodeStreaming,
|
|
829
1142
|
checkGPUAvailable,
|
|
830
1143
|
checkCUDACompatibility,
|
|
831
1144
|
checkImageExists,
|
package/version.json
CHANGED