PyPI - zyworkflow - Versions diffs - 0.0.1__py3-none-any.whl - Mend

zyworkflow 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

zyworkflow/__init__.py +0 -0
zyworkflow/api_server.py +630 -0
zyworkflow/data/__init__.py +0 -0
zyworkflow/data/collection.py +1241 -0
zyworkflow/data/process.py +72 -0
zyworkflow/doc/api.md +461 -0
zyworkflow/example/__init__.py +0 -0
zyworkflow/example/train_client.py +301 -0
zyworkflow/example/train_client_example.py +43 -0
zyworkflow/policy/__init__.py +0 -0
zyworkflow/policy/train_pick_policy.py +834 -0
zyworkflow/utils/__init__.py +0 -0
zyworkflow/utils/logger_config.py +50 -0
zyworkflow/utils/pose.py +131 -0
zyworkflow/utils/utils.py +264 -0
zyworkflow-0.0.1.dist-info/METADATA +11 -0
zyworkflow-0.0.1.dist-info/RECORD +19 -0
zyworkflow-0.0.1.dist-info/WHEEL +5 -0
zyworkflow-0.0.1.dist-info/top_level.txt +1 -0

zyworkflow/__init__.py ADDED Viewed

File without changes

zyworkflow/api_server.py ADDED Viewed

@@ -0,0 +1,630 @@
+import os
+import sys
+import cv2
+import time
+import torch
+import signal
+import threading
+import traceback
+import subprocess
+import numpy as np
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any, Literal
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from zyworkflow.utils.utils import *
+from zyworkflow.utils.logger_config import setup_api_server_logger
+from zyworkflow.policy.train_pick_policy import (
+    PersistentBNNPool,
+    SingleViewBNNActionPolicy,
+)
+# Logger for this module, obtained from the project's logger_config helper.
+logger = setup_api_server_logger()
+app = FastAPI(title="BNN 训练和测试服务", description="用于机器人抓取任务的训练和真机测试")
+# PersistentBNNPool instance created on the first test run and reused afterwards.
+bnn_pool_cache = None
+# task_name -> mutable status dict for test runs; accessed under test_tasks_lock.
+test_tasks = {}
+test_tasks_lock = threading.Lock()
+# task_name -> status dict ("status", "message") for training runs.
+training_status = {}
+# task_name -> {"popen", "pid", "pgid"} for training subprocesses that are still tracked.
+training_processes = {}
+# Re-entrant: train_model calls _update_proc_status_no_throw while already holding this lock.
+training_lock = threading.RLock()
+# task_name -> TaskResponse for data-collection tasks.
+collection_task_store = {}
+# Thread pool used for blocking collection runs; created lazily under collection_executor_lock.
+collection_executor = None
+collection_executor_lock = threading.Lock()
+# The training script is resolved relative to this file's directory.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+TRAIN_SCRIPT = os.path.join(SCRIPT_DIR, "policy", "train_pick_policy.py")
+class TrainRequest(BaseModel):
+    task_id: str = Field(..., description="任务ID，必填")
+    dataset_id: str = Field(..., description="数据集ID，必填")
+    model_id: str = Field(..., description="模型版本ID，必填")
+    ability_id: str = Field(..., description="原子动作ID，必填")
+    algo_type: str = Field(..., description="算法类型，目前仅支持'bnn'，必填")
+    action_type: str = Field(..., description="动作类型，目前仅支持'pick'和'place'，必填")
+    batch_size: int = 48
+    seq_len: int = 4
+    action_chunk: int = 8
+    lr: float = 1e-4
+    num_epochs: int = 500
+    start_epoch: int = 0
+    lambda_joints: float = 10.0
+    lambda_grip: float = 5.0
+    lambda_success: float = 2.0
+    log_path: Optional[str] = None
+    ckpt_dir: Optional[str] = None
+    success_mode: str = "within_horizon"
+    report_url: Optional[str] = None
+    @property
+    def task_name(self) -> str:
+        return f"{self.task_id}-{self.ability_id}-{self.model_id}"
+    def get_log_path(self) -> str:
+        if self.log_path:
+            return self.log_path
+        return f"/workspace/logs/{self.task_id}/{self.ability_id}/{self.model_id}/training_log.txt"
+    def get_ckpt_dir(self) -> str:
+        if self.ckpt_dir:
+            return self.ckpt_dir
+        return f"/workspace/checkpoints/{self.task_id}/{self.ability_id}/{self.model_id}"
+class TestRequest(BaseModel):
+    """Request body accepted by ``POST /test``.
+    task_id, ability_id, model_id and model_name are joined under /workspace/checkpoints
+    to locate the model file, and together form the key used to track the test run.
+    """
+    task_id: str = Field(..., description="任务ID，必填")
+    ability_id: str = Field(..., description="原子动作ID，必填")
+    model_id: str = Field(..., description="模型版本ID，必填")
+    model_name: str = Field(..., description="模型文件名/别名，必填")
+    algo_type: str = Field(..., description="算法类型，目前仅支持'bnn'，必填")
+    action_type: str = Field(..., description="动作类型，目前仅支持'pick'和'place'，必填")
+    # Number of frames kept in the sliding image window fed to the model.
+    seq_len: int = 4
+    action_chunk: int = 8
+    # Number of inference iterations the background test loop runs.
+    step: int = 200
+    # When set, the final outcome (completed / stopped / failed) is POSTed to this URL.
+    callback_url: Optional[str] = None
+class TaskRequest(BaseModel):
+    """Request body accepted by ``POST /data/collection``."""
+    sku: str
+    ability_id: str = Field(..., description="原子动作ID，必填")
+    dataset_id: str = Field(..., description="数据集ID，必填")
+    algo_type: str = Field(..., description="算法类型，目前仅支持'bnn'，必填")
+    action_type: str = Field(..., description="动作类型，目前仅支持'pick'和'place'，必填")
+    # Declared Optional but without a default, so the key must still be supplied
+    # (presumably null is accepted as a value; confirm against the pydantic version in use).
+    init_pose: Optional[List[float]] = Field(..., description="机械臂初始姿态，必填")
+    # The handler substitutes 40 / 20 when these arrive as null.
+    speed: Optional[int] = 40
+    sampling_rate: Optional[int] = 20
+    # When set, the collection result is POSTed to this URL once the task ends.
+    callback_url: Optional[str] = None
+class TrainResponse(BaseModel):
+    """Response model of ``POST /train``: launch status, a message and the tracking key."""
+    status: str
+    message: str
+    task_name: str
+class TaskResponse(BaseModel):
+    """State of a data-collection task; returned by ``POST /data/collection`` and kept in ``collection_task_store``."""
+    task_name: str
+    # Values assigned in this module: "pending", "running", "completed", "failed".
+    status: str
+    message: Optional[str] = None
+    # Result dict returned by the collection run, once available.
+    result: Optional[Dict] = None
+async def execute_collection_task(algo_type: str, action_type: str, ability_id: str, dataset_id: str, sku: str, speed: int, init_pose: Optional[List[float]], sampling_rate: int = 20) -> Dict[str, Any]:
+    global collection_executor
+    try:
+        handle = create(f"{algo_type}-{action_type}", args=(ability_id, dataset_id, init_pose), kws={"speed": speed, "sampling_rate": sampling_rate})
+    except Exception as e:
+        logger.error(f"[data collection] 创建采集任务失败: {e}")
+        return {"code": -1, "msg": f"创建采集任务失败: {e}"}
+    import asyncio
+    loop = asyncio.get_event_loop()
+    with collection_executor_lock:
+        if collection_executor is None:
+            import concurrent.futures
+            collection_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
+    try:
+        result = await loop.run_in_executor(collection_executor, lambda: handle.run_from_http_camera(sku))
+        if isinstance(result, dict):
+            return result
+        logger.error(f"[data collection] 任务返回结果类型不支持: {type(result)}")
+        return {"code": -1, "msg": f"任务返回结果类型不支持: {type(result)}"}
+    except Exception as e:
+        logger.error(f"[data collection] 采集任务执行失败: {e}\n{traceback.format_exc()}")
+        return {"code": -1, "msg": f"任务执行失败: {e}"}
+async def send_collection_callback(callback_url: str, task_name: str, sku: str, result: Dict[str, Any]) -> None:
+    import aiohttp
+    try:
+        msg = result.get("msg")
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                callback_url,
+                json={
+                    "code": result.get("code"),
+                    "status": collection_task_store[task_name].status,
+                    "message": f"{msg}",
+                    "sku": sku,
+                    "task_name": task_name,
+                    "dataset_id": result.get("dataset_id", ""),
+                    "ability_id": result.get("ability_id", ""),
+                    "traj_path": result.get("traj_path", ""),
+                },
+                headers={"Content-Type": "application/json"},
+                timeout=aiohttp.ClientTimeout(total=30),
+            ) as response:
+                if response.status != 200:
+                    text = await response.text()
+                    logger.warning(f"[data collection] 回调发送失败: {response.status} - {text}")
+    except Exception as e:
+        logger.error(f"[data collection] 回调发送异常: {e}")
+async def process_collection_task_background(algo_type: str, action_type: str, task_name: str, ability_id: str, dataset_id: str, sku: str, init_pose: Optional[List[float]], callback_url: Optional[str], speed: int = 40, sampling_rate: int = 20):
+    try:
+        if task_name in collection_task_store:
+            collection_task_store[task_name].status = "running"
+            collection_task_store[task_name].message = "任务执行中..."
+        result = await execute_collection_task(algo_type, action_type, ability_id, dataset_id, sku, speed, init_pose, sampling_rate)
+        if task_name not in collection_task_store:
+            collection_task_store[task_name] = TaskResponse(task_name=task_name, status="running")
+        if result.get("code") == 0:
+            collection_task_store[task_name].status = "completed"
+            collection_task_store[task_name].message = result.get("msg", "任务执行成功")
+        else:
+            collection_task_store[task_name].status = "failed"
+            collection_task_store[task_name].message = result.get("msg", "任务执行失败")
+        collection_task_store[task_name].result = result
+        if callback_url:
+            await send_collection_callback(callback_url, task_name, sku, result)
+    except Exception as e:
+        if task_name in collection_task_store:
+            collection_task_store[task_name].status = "failed"
+            collection_task_store[task_name].message = f"任务执行异常: {str(e)}"
+            collection_task_store[task_name].result = {"code": -1, "msg": str(e)}
+        logger.error(f"[data collection] 任务 {task_name} 执行异常: {e}\n{traceback.format_exc()}")
+def _update_proc_status_no_throw(task_name: str) -> None:
+    with training_lock:
+        proc_info = training_processes.get(task_name)
+        if not proc_info:
+            return
+        proc: subprocess.Popen = proc_info["popen"]
+        rc = proc.poll()
+        if rc is None:
+            return
+        st = training_status.get(task_name, {})
+        if st.get("status") == "stopping":
+            st["status"] = "stopped"
+            st["message"] = "训练已被用户停止。"
+        elif rc == 0:
+            st["status"] = "completed"
+            st["message"] = "训练成功完成。"
+        else:
+            st["status"] = "failed"
+            st["message"] = f"训练失败，进程返回码: {rc}。请检查训练日志。"
+        training_processes.pop(task_name, None)
+def _kill_process_group(pgid: int, term_timeout_sec: float = 5.0) -> None:
+    try:
+        os.killpg(pgid, signal.SIGTERM)
+    except ProcessLookupError:
+        return
+    deadline = time.time() + term_timeout_sec
+    while time.time() < deadline:
+        try:
+            os.killpg(pgid, 0)
+        except ProcessLookupError:
+            return
+        time.sleep(0.2)
+    try:
+        os.killpg(pgid, signal.SIGKILL)
+    except ProcessLookupError:
+        return
+def load_model_for_inference(model_path: str, seq_len: int = 4, action_chunk: int = 8, device=None):
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = SingleViewBNNActionPolicy(seq_len=seq_len, action_chunk=action_chunk).to(device)
+    checkpoint = torch.load(model_path, map_location=device)
+    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+        model.load_state_dict(checkpoint['model_state_dict'])
+        joint_mean = checkpoint.get('joint_mean', torch.zeros(6))
+        joint_std = checkpoint.get('joint_std', torch.ones(6))
+    else:
+        model.load_state_dict(checkpoint)
+        joint_mean = torch.zeros(6)
+        joint_std = torch.ones(6)
+    model.eval()
+    return model, joint_mean.to(device), joint_std.to(device)
+def inference_single_step(model, bnn_pool, image_seq: np.ndarray, seq_len: int, device: torch.device):
+    img_tensor = torch.from_numpy(image_seq).unsqueeze(0).to(device)
+    with torch.no_grad():
+        bnn_in = model.encode_visual(img_tensor)
+        bnn_outputs = []
+        for i in range(seq_len):
+            curr_feat = bnn_in[:, i, :].detach().cpu().numpy() / 10.0
+            bnn_out = bnn_pool.step_batch([curr_feat.reshape(1, -1)])[0]
+            if bnn_out is not None and hasattr(bnn_out, "shape") and bnn_out.shape == (1, 80):
+                bnn_out = bnn_out.T
+            if bnn_out is None:
+                bnn_out = np.zeros(80)
+            bnn_out = np.array(bnn_out).squeeze()
+            bnn_outputs.append(bnn_out)
+        bnn_seq_tensor = torch.tensor(np.stack(bnn_outputs), device=device, dtype=torch.float32).unsqueeze(0)
+        p_j, p_g, p_s = model.decode_action(bnn_in, bnn_seq_tensor)
+        return p_j, p_g, p_s
+@app.get("/")
+async def root():
+    return {"message": "BNN 训练和测试服务", "version": "1.0"}
+@app.post("/train", response_model=TrainResponse)
+async def train_model(request: TrainRequest):
+    task_name = request.task_name
+    root_dir = os.path.join("/workspace/dataset", request.dataset_id, request.ability_id)
+    logger.info(f"[train] 接收到新的训练任务: algo_type={request.algo_type}, action_type={request.action_type}, task_id={request.task_id}, ability_id={request.ability_id}, model_id={request.model_id}, dataset_id={request.dataset_id}")
+    try:
+        if not os.path.exists(root_dir):
+            logger.error(f"[train] 没有找到训练数据: {root_dir}")
+            raise HTTPException(status_code=500, detail=f"Dataset not found: {root_dir}")
+        if not os.path.exists(TRAIN_SCRIPT):
+            logger.error(f"[train] 没有找到训练脚本: {TRAIN_SCRIPT}")
+            raise HTTPException(status_code=500, detail=f"Train script not found: {TRAIN_SCRIPT}")
+        with training_lock:
+            if task_name in training_processes:
+                _update_proc_status_no_throw(task_name)
+                if task_name in training_processes:
+                    logger.error(f"[train] 任务{task_name}已经在训练")
+                    raise HTTPException(status_code=409, detail=f"Task '{task_name}' is already running.")
+        log_path = request.get_log_path()
+        ckpt_dir = request.get_ckpt_dir()
+        os.makedirs(os.path.dirname(log_path), exist_ok=True)
+        os.makedirs(ckpt_dir, exist_ok=True)
+        cmd = [
+            sys.executable, TRAIN_SCRIPT,
+            "--task_name", task_name,
+            "--root_dir", root_dir,
+            "--ckpt_dir", ckpt_dir,
+            "--batch_size", str(request.batch_size),
+            "--seq_len", str(request.seq_len),
+            "--action_chunk", str(request.action_chunk),
+            "--lr", str(request.lr),
+            "--num_epochs", str(request.num_epochs),
+            "--start_epoch", str(request.start_epoch),
+            "--lambda_joints", str(request.lambda_joints),
+            "--lambda_grip", str(request.lambda_grip),
+            "--lambda_success", str(request.lambda_success),
+            "--success_mode", request.success_mode,
+        ]
+        if log_path:
+            cmd.extend(["--log_path", log_path])
+        if request.report_url:
+            cmd.extend(["--report_url", request.report_url])
+        stdout_f = open(log_path, "a", encoding="utf-8")
+        proc = subprocess.Popen(cmd, cwd=SCRIPT_DIR, stdout=stdout_f, stderr=stdout_f, preexec_fn=os.setsid)
+        with training_lock:
+            st = training_status.get(task_name, {})
+            st.update({"status": "running", "message": f"Process started (PID={proc.pid})"})
+            training_status[task_name] = st
+            training_processes[task_name] = {"popen": proc, "pid": proc.pid, "pgid": os.getpgid(proc.pid)}
+        logger.info(f"[train] [{task_name}]: 已下发训练任务")
+        return TrainResponse(status="started", message="已下发训练任务", task_name=task_name)
+    except Exception as e:
+        logger.error(f"[train] [{task_name}] Failed to start training: {e}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/train/status/{task_id}/{ability_id}/{model_id}")
+async def get_training_status(task_id: str, ability_id: str, model_id: str):
+    task_name = f"{task_id}-{ability_id}-{model_id}"
+    logger.info(f"[train status] 接收获取训练状态请求: task_id={task_id}, ability_id={ability_id}, model_id={model_id}")
+    with training_lock:
+        if task_name not in training_status:
+            logger.error(f"[train status] 没有找到{task_name}训练任务")
+            raise HTTPException(status_code=404, detail=f"Task '{task_name}' not found.")
+    _update_proc_status_no_throw(task_name)
+    with training_lock:
+        logger.info(f"[train status] [{task_name}]: {training_status[task_name]}")
+        return training_status[task_name]
+@app.post("/train/stop/{task_id}/{ability_id}/{model_id}")
+async def stop_training(task_id: str, ability_id: str, model_id: str):
+    task_name = f"{task_id}-{ability_id}-{model_id}"
+    logger.info(f"[train stop] 接收到停止训练请求: task_id={task_id}, ability_id={ability_id}, model_id={model_id}")
+    with training_lock:
+        proc_info = training_processes.get(task_name)
+        if not proc_info:  # 如果为None, 则更新状态
+            _update_proc_status_no_throw(task_name)
+            st = training_status.get(task_name, {})
+            if st.get("status") in ["completed", "failed", "stopped"]:
+                return {"status": "already_finished", "message": st.get("message")}
+            logger.error(f"[train stop] 没有找到待停止任务: {task_name}")
+            raise HTTPException(status_code=404, detail="Running task not found.")
+        pgid = proc_info["pgid"]
+        training_status[task_name]["status"] = "stopping"
+    _kill_process_group(pgid)
+    _update_proc_status_no_throw(task_name)
+    with training_lock:
+        logger.info(f"[train stop] [{task_name}]: Task stopped.")
+        return training_status.get(task_name, {"status": "stopped", "message": "Task stopped."})
+@app.post("/data/collection", response_model=TaskResponse)
+async def submit_collection_task(request: TaskRequest, background_tasks: BackgroundTasks):
+    task_name = f"{request.dataset_id}-{request.ability_id}"
+    logger.info(f"[data collection] 接收到数据采集任务提交请求: {task_name}, sku={request.sku}, algo_type={request.algo_type}, action_type={request.action_type}")
+    valid_tasks = global_config.keys()
+    func = f"{request.algo_type}-{request.action_type}"
+    if func not in valid_tasks:
+        logger.error(f"[data collection] 任务类型错误: {func}")
+        return TaskResponse(
+            task_name=task_name,
+            status="failed",
+            message=f"任务类型错误，可选值: {list(valid_tasks)}",
+        )
+    collection_task_store[task_name] = TaskResponse(
+        task_name=task_name,
+        status="pending",
+        message="数据采集任务已提交",
+    )
+    background_tasks.add_task(
+        process_collection_task_background,
+        task_name,
+        request.algo_type,
+        request.action_type,
+        request.ability_id,
+        request.dataset_id,
+        request.sku,
+        request.init_pose,
+        request.callback_url,
+        request.speed if request.speed is not None else 40,
+        request.sampling_rate if request.sampling_rate is not None else 20,
+    )
+    logger.info(f"[data collection] 已提交新数据采集任务: {task_name}, sku={request.sku}, algo_type={request.algo_type}, action_type={request.action_type}")
+    return collection_task_store[task_name]
+async def send_test_callback(callback_url: str, payload: Dict[str, Any]) -> None:
+    import aiohttp
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                callback_url,
+                json=payload,
+                headers={"Content-Type": "application/json"},
+                timeout=aiohttp.ClientTimeout(total=10),
+            ) as response:
+                if response.status != 200:
+                    text = await response.text()
+                    logger.warning(f"[test] 测试回调发送失败: {response.status} - {text}")
+    except Exception as e:
+        logger.error(f"[test] 测试回调发送异常: {e}")
+async def process_test_task_background(task_name: str, request: TestRequest):
+    def _should_stop() -> bool:
+        with test_tasks_lock:
+            return bool(test_tasks.get(task_name, {}).get("stop_requested", False))
+    callback_url = request.callback_url
+    try:
+        with test_tasks_lock:
+            st = test_tasks.get(task_name, {})
+            st.update({"status": "running", "message": "测试任务执行中...", "start_time": time.time()})
+            test_tasks[task_name] = st
+        model_path = os.path.join("/workspace/checkpoints", request.task_id, request.ability_id, request.model_id, request.model_name)
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if not os.path.exists(model_path):
+            logger.error(f"[test] 测试模型{model_path}没有找到")
+            raise RuntimeError("Model file not found.")
+        model, joint_mean, joint_std = load_model_for_inference(model_path, request.seq_len, request.action_chunk, device)
+        global bnn_pool_cache
+        if bnn_pool_cache is None:
+            bnn_pool_cache = PersistentBNNPool(num_workers=1)
+        bnn_pool_cache.reset_all(1)
+        images = []
+        for _ in range(request.seq_len):
+            img, err = get_image(rgb_image_url)
+            if img is None:
+                logger.error(f"[test] 请求相机图像失败: {err}")
+                raise RuntimeError(f"请求相机图像失败: {err}")
+            images.append(preprocess_image(img))
+        joints_out = []
+        gripper_out = []
+        success_out = []
+        for i in range(int(request.step)):
+            if _should_stop():
+                with test_tasks_lock:
+                    test_tasks[task_name].update({"status": "stopped", "message": "测试任务被急停", "end_time": time.time()})
+                if callback_url:
+                    await send_test_callback(
+                        callback_url,
+                        {
+                            "code": -2,
+                            "message": "测试任务被急停",
+                            "task_id": request.task_id,
+                            "ability_id": request.ability_id,
+                            "model_id": request.model_id,
+                            "model_name": request.model_name,
+                            "step": i,
+                        },
+                    )
+                return
+            p_j, p_g, p_s = inference_single_step(model, bnn_pool_cache, np.stack(images), request.seq_len, device)
+            joints_step = (p_j[0] * joint_std + joint_mean).cpu().tolist()
+            gripper_step = torch.sigmoid(p_g[0]).squeeze(-1).cpu().tolist()
+            success_step = torch.sigmoid(p_s[0]).squeeze(-1).cpu().tolist()
+            joints_out.append(joints_step)
+            gripper_out.extend(gripper_step)
+            success_out.extend(success_step)
+            with test_tasks_lock:
+                test_tasks[task_name].update({
+                    "current_step": i + 1,
+                    "joints": joints_out,
+                    "gripper": gripper_out,
+                    "success": success_out,
+                })
+            img, err = get_image(rgb_image_url)
+            if img is None:
+                logger.error(f"[test] 请求相机图像失败: {err}")
+                raise RuntimeError(f"请求相机图像失败: {err}")
+            images = images[1:] + [preprocess_image(img)]
+        with test_tasks_lock:
+            test_tasks[task_name].update({"status": "completed", "message": "测试完成", "end_time": time.time()})
+        if callback_url:
+            await send_test_callback(
+                callback_url,
+                {
+                    "code": 0,
+                    "message": "测试完成",
+                    "task_id": request.task_id,
+                    "ability_id": request.ability_id,
+                    "model_id": request.model_id,
+                    "model_name": request.model_name,
+                    "step": request.step,
+                },
+            )
+    except Exception as e:
+        logger.error(f"[test] 测试任务执行失败: {e}\n{traceback.format_exc()}")
+        with test_tasks_lock:
+            test_tasks[task_name].update({"status": "failed", "message": str(e), "end_time": time.time()})
+        if callback_url:
+            await send_test_callback(
+                callback_url,
+                {
+                    "code": -1,
+                    "message": f"测试时发生未知错误: {e}",
+                    "task_id": request.task_id,
+                    "ability_id": request.ability_id,
+                    "model_id": request.model_id,
+                    "model_name": request.model_name,
+                },
+            )
+@app.post("/test/stop/{task_id}/{ability_id}/{model_id}/{model_name}")
+async def stop_test(task_id: str, ability_id: str, model_id: str, model_name: str):
+    task_name = f"{task_id}-{ability_id}-{model_id}-{model_name}"
+    logger.info(f"[test stop] 接收到停止测试请求: task_id={task_id}, ability_id={ability_id}, model_id={model_id}, model_name={model_name}")
+    try:
+        code, msg = post_arm_stop()
+        if code is None:
+            raise RuntimeError(msg or "post_arm_stop failed")
+    except Exception as e:
+        logger.error(f"[test stop] 急停失败: {e}")
+        raise HTTPException(status_code=500, detail=f"急停失败: {e}")
+    with test_tasks_lock:
+        st = test_tasks.get(task_name)
+        if st is None:
+            logger.error(f"[test stop] 没有找到需要急停的{task_name}任务")
+            raise HTTPException(status_code=404, detail="Running test task not found.")
+        st["stop_requested"] = True
+        test_tasks[task_name] = st
+    return {"code": 0, "message": "急停指令已发送"}
+@app.post("/test")
+async def test_model(request: TestRequest, background_tasks: BackgroundTasks):
+    task_name = f"{request.task_id}-{request.ability_id}-{request.model_id}-{request.model_name}"
+    logger.info(f"[test] 接收到测试任务: task_name={task_name}, model_name={request.model_name}, algo_type={request.algo_type}, action_type={request.action_type}, step={request.step}")
+    with test_tasks_lock:
+        st = test_tasks.get(task_name)
+        if st and st.get("status") in ["starting", "running"]:
+            logger.error(f"[test] 任务{task_name}已经在测试中")
+            raise HTTPException(status_code=409, detail=f"Test task '{task_name}' is already running.")
+        test_tasks[task_name] = {
+            "status": "starting",
+            "message": "测试任务已提交",
+            "stop_requested": False,
+            "current_step": 0,
+            "joints": [],
+            "gripper": [],
+            "success": [],
+        }
+    background_tasks.add_task(process_test_task_background, task_name, request)
+    return {"code": 0, "status": "started", "message": "测试任务已提交", "task_name": task_name}
+@app.get("/test/status/{task_id}/{ability_id}/{model_id}/{model_name}")
+async def get_test_status(task_id: str, ability_id: str, model_id: str, model_name: str):
+    task_name = f"{task_id}-{ability_id}-{model_id}-{model_name}"
+    logger.info(f"[test status] 接收获取测试状态请求: task_id={task_id}, ability_id={ability_id}, model_id={model_id}, model_name={model_name}")
+    with test_tasks_lock:
+        st = test_tasks.get(task_name)
+        if st is None:
+            logger.error(f"[test status] 没有找到{task_name}测试任务")
+            raise HTTPException(status_code=404, detail=f"Test task '{task_name}' not found.")
+        return st
+@app.on_event("shutdown")
+async def shutdown_event():
+    with training_lock:
+        running = list(training_processes.values())
+    for info in running:
+        if info.get("pgid"):
+            _kill_process_group(info["pgid"], 2.0)
+# Script entry point: serve the app with uvicorn on all interfaces, port 8003, access log disabled.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8003, access_log=False)

zyworkflow/data/__init__.py ADDED Viewed

File without changes