npm - @optima-chat/gen-cli - Versions diffs - 2.3.0 → 2.5.0 - Mend

@optima-chat/gen-cli 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/.claude/skills/video-compose/scripts/video_compose.py ADDED Viewed

@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""video-compose — 素材片段 + 口播文案 → 成片（情感配音/选片/字幕/BGM 全自动）
+容器版：TTS 走 `gen tts --provider minimax`（key + 计费在后端 optima-generation），
+不直连 MiniMax、不在容器放密钥。依赖：python3 + ffmpeg + `gen` CLI（容器自带）。
+两个命令，中间由 Claude 看帧写 proposal.json：
+  python3 $CLAUDE_SKILL_DIR/scripts/video_compose.py frames <proj>
+  # -> Claude Read 每帧，按 script.txt 写 <proj>/work/proposal.json
+  python3 $CLAUDE_SKILL_DIR/scripts/video_compose.py build  <proj>
+工程目录：
+  <proj>/inputs/clips/*.mp4   素材（任意命名，按文件名排序得 clip id）
+  <proj>/inputs/script.txt    口播稿，每行一句 = 一个 segment
+  <proj>/inputs/bgm/          (可选) 用户上传的 BGM
+  <proj>/work/                中间产物
+  <proj>/final.mp4            成片
+"""
+import json, subprocess, sys, os, random, shutil, hashlib
+from pathlib import Path
+# The log output is mostly Chinese: force stdout/stderr to UTF-8 so that a single
+# print() under a non-UTF-8 locale cannot raise UnicodeEncodeError and take the
+# whole build down with it. Containers default to UTF-8; this is only a safety
+# net, so a failure to reconfigure is deliberately ignored.
+try:
+    sys.stdout.reconfigure(encoding="utf-8")
+    sys.stderr.reconfigure(encoding="utf-8")
+except Exception:
+    pass
+# Output frame size, frame rate and x264 CRF shared by every encode in this script.
+W = 1080
+H = 1920
+FPS = 30
+CRF = 20
+# Subtitle font: the container must ship a CJK font; override via the environment.
+# See the "pitfalls" section of SKILL.md for the fc-match verification.
+SUB_FONT = os.environ.get("VIDEO_COMPOSE_FONT", "Noto Sans CJK SC")
+# Mood BGM library + voice samples ship inside the @optima-chat/gen-cli npm package
+# (the plugin's ensure-cli.sh installs it at SessionStart) instead of living in the
+# skill directory -- otherwise the 5.3MB of BGM pushes the plugin's thin tar over
+# its size limit (see optima-gen #<this PR>). Resolution order:
+#   1. explicit VIDEO_COMPOSE_ASSETS override
+#   2. assets/video-compose/ inside the installed gen-cli package
+#   3. the skill directory (old optima-agent baked layout / local-dev fallback)
+def _asset_root() -> Path:
+    """Locate the directory that holds bgm-library/ and the voice samples.
+
+    Returns the first existing candidate, in this order: $VIDEO_COMPOSE_ASSETS,
+    the gen-cli package under $CLAUDE_PLUGIN_DATA, then a walk up from the real
+    path of the `gen` executable. If none exists, falls back to
+    $CLAUDE_SKILL_DIR or the grandparent directory of this script (which may
+    not contain any assets).
+    """
+    # In production gen-cli may be (a) baked into optima-agent, where the `gen` on PATH
+    # is a thin wrapper (.../optima-agent/dist/bin/gen.js) and the real package sits in
+    # .../optima-agent/node_modules/@optima-chat/gen-cli, (b) installed by the plugin's
+    # ensure-cli into $CLAUDE_PLUGIN_DATA, or (c) gen-cli's own bin. The wrapper depth
+    # is not fixed, so we climb from the real path of `gen` one level at a time and try
+    # both layouts at every ancestor.
+    rel = Path("assets") / "video-compose"
+    nested = Path("node_modules") / "@optima-chat" / "gen-cli" / rel
+    override = os.environ.get("VIDEO_COMPOSE_ASSETS")
+    if override:
+        override_path = Path(override)
+        if override_path.exists():
+            return override_path
+    plugin_data = os.environ.get("CLAUDE_PLUGIN_DATA")
+    if plugin_data:
+        candidate = Path(plugin_data) / nested
+        if candidate.exists():
+            return candidate
+    gen_bin = shutil.which("gen")
+    if gen_bin:
+        real = Path(os.path.realpath(gen_bin))
+        for ancestor in (real, *real.parents):
+            for layout in (rel, nested):
+                candidate = ancestor / layout
+                if candidate.exists():
+                    return candidate
+    # Fallback: the skill directory (old baked layout / local development). It may hold
+    # no assets at all; _resolve_bgm then skips BGM gracefully.
+    skill_default = Path(__file__).resolve().parent.parent
+    return Path(os.environ.get("CLAUDE_SKILL_DIR", skill_default))
+# Asset root resolved once at import time; the BGM library lives directly below it.
+ASSET_ROOT = _asset_root()
+BGM_LIBRARY = ASSET_ROOT / "bgm-library"
+# File suffixes (compared lower-cased) that count as audio when scanning a directory.
+_AUDIO_EXT = (".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg")
+# Voice used when proposal.json does not name one.
+DEFAULT_VOICE = "Chinese (Mandarin)_Warm_Girl"
+# Brisk default speaking rate: this tool produces short videos for Douyin / TikTok /
+# Xiaohongshu, where narration has to be fast to sound native to the platform.
+# 1.35 is roughly the TikTok voice-over pace (measured, then signed off by the user).
+# To slow a single sentence down (soothing / lyrical), set a lower `speed` on that
+# sentence in the proposal.
+DEFAULT_SPEED = 1.35
+def run(cmd):
+    """Run an external command and return its CompletedProcess.
+
+    Every element of *cmd* is stringified first, so Path and int arguments are
+    accepted. On a non-zero exit status the command line and the last 2000
+    characters of stderr are printed and the process exits with status 1.
+    """
+    argv = [str(part) for part in cmd]
+    result = subprocess.run(argv, capture_output=True, text=True)
+    if result.returncode == 0:
+        return result
+    print("CMD FAIL:", " ".join(argv))
+    print(result.stderr[-2000:])
+    sys.exit(1)
+def probe_dur(path):
+    """Return the duration of the media file *path* in seconds, as reported by ffprobe."""
+    probe = run(["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "csv=p=0", path])
+    return float(probe.stdout.strip())
+def list_clips(proj):
+    """Return the *.mp4 files in <proj>/inputs/clips, ordered by file name."""
+    clips = list((proj / "inputs" / "clips").glob("*.mp4"))
+    clips.sort(key=lambda clip: clip.name)
+    return clips
+def read_segments(proj):
+    """Read <proj>/inputs/script.txt and return its non-blank lines, stripped, in order."""
+    raw = (proj / "inputs" / "script.txt").read_text(encoding="utf-8")
+    segments = []
+    for line in raw.splitlines():
+        sentence = line.strip()
+        if sentence:
+            segments.append(sentence)
+    return segments
+# ---------- frames: extract stills for Claude to look at ----------
+def cmd_frames(proj):
+    """Extract 3 to 6 stills per clip and write work/clips_manifest.json.
+
+    The frame count adapts to clip length (about one frame per 5 s). The manifest
+    records, for every frame, its timestamp `t` in seconds. After viewing the
+    frames Claude writes `src_start` = the `t` of the chosen sub-shot for each
+    sentence in the proposal; when one clip serves several sentences it should
+    pick a different `t` each time, which build uses to cut distinct sub-shots
+    (see _resolve_windows).
+    """
+    frame_dir = proj / "work" / "frames"
+    frame_dir.mkdir(parents=True, exist_ok=True)
+    clips = list_clips(proj)
+    manifest = {"clips": [], "segments": read_segments(proj)}
+    for clip_path in clips:
+        clip_id = clip_path.stem
+        dur = probe_dur(clip_path)
+        # Adaptive frame count: 3 for short clips, at most 6 for long ones.
+        count = max(3, min(6, int(dur // 5) + 1))
+        frames = []
+        for slot in range(count):
+            # Each frame sits at the centre of one of `count` equal lanes.
+            t = dur * (slot + 0.5) / count
+            tag = chr(ord('a') + slot)
+            out = frame_dir / f"{clip_id}_{tag}.jpg"
+            run(["ffmpeg", "-v", "error", "-ss", f"{t:.2f}", "-i", clip_path,
+                 "-frames:v", "1", "-q:v", "3", out, "-y"])
+            frames.append({"tag": tag, "t": round(t, 2), "path": str(out)})
+        manifest["clips"].append({"id": clip_id, "duration_s": round(dur, 2), "frames": frames})
+    manifest_path = proj / "work" / "clips_manifest.json"
+    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
+    total_frames = 0
+    for entry in manifest["clips"]:
+        total_frames += len(entry["frames"])
+    print(f"[frames] {len(clips)} clips / {total_frames} 帧（含时间戳 t）-> {frame_dir}")
+    print(f"[frames] segments: {len(manifest['segments'])} 句；下一步 Claude 看帧写 proposal.json")
+    print(f"[frames] 提示：每句 assignment 写 src_start=选中帧的 t；同素材复用请选不同 t（防重复镜头）")
+# ---------- TTS: `gen tts --provider minimax` (key and billing live in the backend) ----------
+def gen_tts(text, voice, emotion, speed, out):
+    """Synthesize *text* to the file *out* via `gen tts --provider minimax`.
+
+    `--emotion` is passed only when *emotion* is truthy and `--speed` only when
+    *speed* is not None. If the command fails or *out* does not exist
+    afterwards, the command line and the tail of its output are printed and
+    the process exits with status 1.
+    """
+    argv = ["gen", "tts", text, "--provider", "minimax", "--voice", voice, "-o", str(out)]
+    if emotion:
+        argv.extend(["--emotion", emotion])
+    if speed is not None:
+        argv.extend(["--speed", str(speed)])
+    result = subprocess.run(argv, capture_output=True, text=True)
+    if result.returncode == 0 and Path(out).exists():
+        return
+    print("TTS FAIL:", " ".join(argv))
+    print((result.stderr or result.stdout)[-1500:])
+    sys.exit(1)
+def _ass_time(t):
+    """Format *t* (seconds) as an ASS timestamp `H:MM:SS.cc`.
+
+    The value is rounded to whole centiseconds before it is split into fields,
+    so a carry propagates into minutes and hours. Formatting the seconds field
+    on its own could emit `60.00` (e.g. 59.996 became `0:00:60.00`). Negative
+    input is clamped to zero.
+    """
+    centis = max(0, int(round(t * 100)))
+    h, rest = divmod(centis, 360000)
+    m, rest = divmod(rest, 6000)
+    s, cs = divmod(rest, 100)
+    return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}"
+def _preflight_font():
+    """Fail loudly before any paid TTS call if the subtitle font is not installed.
+
+    Without a matching CJK font the subtitles filter renders Chinese as tofu
+    boxes and ffmpeg reports no error. fc-match is used to catch that early.
+    The check is best-effort: when fc-match is missing (e.g. local Windows),
+    cannot be run, or returns nothing, the function returns silently. Only a
+    definite mismatch prints an error and exits with status 1.
+    """
+    if not shutil.which("fc-match"):
+        # No fontconfig here (e.g. local Windows): skip. The container has fc-match.
+        return
+    try:
+        # Explicit utf-8 + errors=replace so that decoding fc-match output cannot
+        # crash under a non-UTF-8 locale (e.g. Windows gbk).
+        proc = subprocess.run(
+            ["fc-match", "-f", "%{family}", SUB_FONT],
+            capture_output=True, encoding="utf-8", errors="replace", timeout=10,
+        )
+    except Exception:
+        # The checker itself could not run: do not block the build.
+        return
+    matched = (proc.stdout or "").strip()
+    if proc.returncode != 0 or not matched:
+        # No usable answer: do not block the build.
+        return
+    def squash(name):
+        return name.lower().replace(" ", "")
+    # fontconfig echoes the family when the font is installed; otherwise it falls back
+    # (e.g. to DejaVu), the name no longer matches the request, and we fail hard.
+    if squash(SUB_FONT) in squash(matched):
+        return
+    print(f"ERR 字幕字体 '{SUB_FONT}' 未命中（fc-match 回退到 '{matched}'）——中文字幕会渲染成豆腐块。")
+    print(f"    解决其一：容器装该 CJK 字体 / 设 VIDEO_COMPOSE_FONT 指向已装 CJK 字体（fc-match 报告的）/ 在 skill bundle 字体。")
+    sys.exit(1)
+def _audio_in(d):
+    """Return the sorted audio files directly inside directory *d*; [] if *d* is not a directory."""
+    folder = Path(d)
+    if not folder.is_dir():
+        return []
+    matches = [entry for entry in folder.iterdir() if entry.suffix.lower() in _AUDIO_EXT]
+    matches.sort()
+    return matches
+def _resolve_bgm(proj, prop):
+    """Pick the BGM track for this build, or None.
+
+    Priority: the proposal's `bgm` path, then the first audio file uploaded to
+    inputs/bgm/, then a track from the mood library named by `bgm_mood`. The
+    library pick is seeded from the mood plus the assignment texts, so re-running
+    the same proposal selects the same track while different proposals vary.
+    """
+    explicit = prop.get("bgm")
+    if explicit:
+        return explicit
+    uploaded = _audio_in(proj / "inputs" / "bgm")
+    if uploaded:
+        return str(uploaded[0])
+    mood = prop.get("bgm_mood")
+    if not mood:
+        return None
+    candidates = _audio_in(BGM_LIBRARY / mood)
+    if not candidates:
+        print(f"[bgm] 情绪 '{mood}' 库内无曲（{BGM_LIBRARY/mood}），跳过 BGM")
+        return None
+    texts = [a.get("text", "") for a in prop.get("assignments", [])]
+    signature = mood + "|" + "|".join(texts)
+    seed = int(hashlib.md5(signature.encode("utf-8")).hexdigest(), 16)
+    return str(random.Random(seed).choice(candidates))
+# ---------- shot time windows: reuse a clip without repeating footage ----------
+def _has_overlap(intervals, eps=0.05):
+    """Return True if any two neighbouring (start, end) intervals overlap by more than *eps*.
+
+    The intervals are sorted first; only adjacent pairs in that order are compared.
+    """
+    ordered = sorted(intervals)
+    for idx in range(1, len(ordered)):
+        _, prev_end = ordered[idx - 1]
+        next_start, _ = ordered[idx]
+        if next_start < prev_end - eps:
+            return True
+    return False
+def _resolve_windows(segs, clips_dir):
+    """Compute the cut start (seconds) for every sentence; returns starts[i] for segs[i].
+
+    When one clip serves several sentences their time windows must not overlap,
+    so no shot is shown twice. An explicit `src_start` on the assignment is taken
+    as the centre of the wanted sub-shot; sentences without one are spread evenly
+    over the clip in lanes. If the resulting windows of a clip still overlap, the
+    whole group falls back to the even lanes and a notice is printed.
+    """
+    users_of = {}
+    for seg_no, seg in enumerate(segs):
+        users_of.setdefault(seg["clip"], []).append(seg_no)
+    starts = [0.0] * len(segs)
+    for clip, members in users_of.items():
+        clip_len = probe_dur(clips_dir / f"{clip}.mp4")
+        count = len(members)
+        spans = [min(segs[n]["dur"], clip_len) for n in members]
+        needed = sum(spans)
+        # The distinct footage required from this clip exceeds the clip itself.
+        if count > 1 and needed > clip_len + 0.1:
+            print(f"[mix] 警告: 素材 {clip} 仅 {clip_len:.1f}s，被 {count} 句复用共需 {needed:.1f}s 不同画面——"
+                  f"时长不够，可能仍有重复。建议多给素材，或减少该片复用。")
+        def clamp(centre, span):
+            # Centre a window of length `span` on `centre`, kept inside [0, clip_len - span].
+            return max(0.0, min(centre - span / 2, max(0.0, clip_len - span)))
+        def lane(order):
+            # The order-th window sits at the centre of the order-th of `count` equal lanes.
+            return clamp((order + 0.5) * clip_len / count, spans[order])
+        chosen = []
+        for order, n in enumerate(members):
+            wanted = segs[n].get("src_start")
+            if wanted is None:
+                chosen.append(lane(order))
+            else:
+                chosen.append(clamp(float(wanted), spans[order]))
+        windows = [(begin, begin + span) for begin, span in zip(chosen, spans)]
+        if count > 1 and _has_overlap(windows):
+            chosen = [lane(order) for order in range(count)]
+            print(f"[mix] 素材 {clip} 被 {count} 句复用且窗口重叠/未指定 → 自动均匀错开，避免重复镜头")
+        for order, n in enumerate(members):
+            starts[n] = round(chosen[order], 3)
+    return starts
+# ---------- build: proposal.json -> final.mp4 ----------
+def cmd_build(proj):
+    """Render <proj>/final.mp4 from <proj>/work/proposal.json.
+
+    Steps: validate that every assignment names an existing clip, pre-check the
+    subtitle font, synthesize (or reuse cached) voice-over per sentence, join the
+    voice-over track, cut one video segment per sentence and join them, write an
+    ASS subtitle file, then mux video + voice-over (+ ducked BGM when one is
+    resolved) with burned-in subtitles. Exits with status 1 when an assignment
+    references an unknown clip; helper failures exit the same way.
+    """
+    work=proj/"work"; clips_dir=proj/"inputs"/"clips"
+    prop=json.loads((work/"proposal.json").read_text(encoding="utf-8"))
+    voice=prop.get("voice", DEFAULT_VOICE)
+    asg=prop["assignments"]
+    avail={p.stem for p in list_clips(proj)}
+    for a in asg:
+        if a["clip"] not in avail:
+            print(f"ERR seg{a['segment_idx']} clip '{a['clip']}' 不存在。可用: {sorted(avail)}"); sys.exit(1)
+    _preflight_font()  # bail out early on a bad font instead of paying for TTS first
+    # 1) Per-sentence emotional voice-over, cached: when engine/voice/emotion/speed/text
+    #    are unchanged the existing mp3 is reused, so nothing is billed twice.
+    segs=[]; t0=0.0
+    for a in asg:
+        i=a["segment_idx"]; mp3=work/f"vo_{i:02d}.mp3"; keyf=work/f"vo_{i:02d}.key"
+        spd=a.get("speed") if a.get("speed") is not None else DEFAULT_SPEED  # brisk default, suited to short-video platforms
+        ck=f"minimax|{a.get('voice',voice)}|{a.get('emotion')}|{spd}|{a['text']}"
+        if not (mp3.exists() and keyf.exists() and keyf.read_text(encoding="utf-8")==ck):
+            gen_tts(a["text"], a.get("voice",voice), a.get("emotion"), spd, mp3)
+            keyf.write_text(ck,encoding="utf-8")
+        d=probe_dur(mp3)
+        segs.append({**a,"audio":mp3,"start":t0,"end":t0+d,"dur":d}); t0+=d
+    total=t0; print(f"[voiceover] {len(segs)} 句 / {total:.2f}s")
+    # 2) Voice-over track
+    (work/"vo_list.txt").write_text("".join(f"file '{s['audio'].as_posix()}'\n" for s in segs),encoding="utf-8")
+    voiceover=work/"voiceover.m4a"
+    run(["ffmpeg","-y","-f","concat","-safe","0","-i",work/"vo_list.txt","-c:a","aac","-b:a","192k",voiceover])
+    # 3) Cut and align segments (portrait crop; -ss from _resolve_windows cuts a distinct
+    #    sub-shot per sentence to avoid repeats). When the clip is shorter than the
+    #    sentence it is slowed down to fill (setpts), **not looped** -- looping would
+    #    repeat footage inside one sentence.
+    starts=_resolve_windows(segs, clips_dir)
+    seg_mp4s=[]
+    base_vf=f"scale={W}:{H}:force_original_aspect_ratio=increase,crop={W}:{H}"
+    for k,s in enumerate(segs):
+        src=clips_dir/f"{s['clip']}.mp4"; dur=s["dur"]; st=starts[k]; cdur=probe_dur(src)
+        out=work/f"seg_{s['segment_idx']:02d}.mp4"
+        if cdur < dur-0.05:                      # whole clip shorter than the sentence -> slow it to the sentence length, no repeat
+            vf=f"{base_vf},setpts=PTS*{dur/cdur:.4f},fps={FPS},setsar=1"; ss=[]
+        else:
+            vf=f"{base_vf},fps={FPS},setsar=1"; ss=["-ss",f"{st:.3f}"]
+        run(["ffmpeg","-y",*ss,"-i",src,"-t",f"{dur:.3f}","-an","-vf",vf,
+             "-c:v","libx264","-crf",CRF,"-pix_fmt","yuv420p",out])
+        seg_mp4s.append(out)
+    (work/"v_list.txt").write_text("".join(f"file '{p.as_posix()}'\n" for p in seg_mp4s),encoding="utf-8")
+    silent=work/"video_silent.mp4"
+    run(["ffmpeg","-y","-f","concat","-safe","0","-i",work/"v_list.txt","-c","copy",silent])
+    # 4) ASS subtitles (the Format line and each Dialogue line must have the same field count)
+    head=("[Script Info]\nScriptType: v4.00+\nPlayResX: %d\nPlayResY: %d\n\n"
+          "[V4+ Styles]\nFormat: Name,Fontname,Fontsize,PrimaryColour,OutlineColour,BackColour,Bold,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,Encoding\n"
+          "Style: D,%s,58,&H00FFFFFF,&H00000000,&H64000000,1,3,1,2,40,40,180,1\n\n"
+          "[Events]\nFormat: Layer,Start,End,Style,Text\n") % (W,H,SUB_FONT)
+    body="\n".join(f"Dialogue: 0,{_ass_time(s['start'])},{_ass_time(s['end'])},D,{s['text']}" for s in segs)
+    ass=work/"subs.ass"; ass.write_text(head+body+"\n",encoding="utf-8")
+    # Escape ':' so the path survives ffmpeg filter-argument parsing.
+    ass_esc=ass.as_posix().replace(":","\\:")
+    # 5) BGM ducking + burned-in subtitles -> final
+    bgm=_resolve_bgm(proj, prop); final=proj/"final.mp4"
+    if bgm and Path(bgm).exists():
+        print(f"[bgm] {bgm}")
+        # Loop the BGM, lower it, and side-chain compress it against the voice-over
+        # before mixing the two; the mix length follows the voice-over (duration=first).
+        fc=("[2:a]aloop=loop=-1:size=2e9,volume=0.25[bg];[1:a]asplit=2[vo][sc];"
+            "[bg][sc]sidechaincompress=threshold=0.02:ratio=8:attack=5:release=300[bgd];"
+            "[vo][bgd]amix=inputs=2:duration=first:normalize=0[aout]")
+        run(["ffmpeg","-y","-i",silent,"-i",voiceover,"-i",bgm,
+             "-filter_complex",fc+f";[0:v]subtitles='{ass_esc}'[v]",
+             "-map","[v]","-map","[aout]","-t",f"{total:.3f}",
+             "-c:v","libx264","-crf",CRF,"-pix_fmt","yuv420p","-c:a","aac","-b:a","192k","-shortest",final])
+    else:
+        print("[bgm] none（用户未提供且 proposal 未设 bgm_mood，仅人声）")
+        run(["ffmpeg","-y","-i",silent,"-i",voiceover,"-vf",f"subtitles='{ass_esc}'",
+             "-map","0:v","-map","1:a","-c:v","libx264","-crf",CRF,"-pix_fmt","yuv420p","-c:a","aac",final])
+    print(f"[done] {final}  ({probe_dur(final):.2f}s)")
+if __name__=="__main__":
+    # `assets-dir` prints the resolved BGM/voice asset root (so SKILL.md resolves it
+    # the same way this script does). No project arg needed.
+    if len(sys.argv)>=2 and sys.argv[1]=="assets-dir":
+        print(ASSET_ROOT); sys.exit(0)
+    commands={"frames":cmd_frames,"build":cmd_build}
+    # Reject a missing project dir or an unknown sub-command with the usage text,
+    # rather than letting the dispatch lookup die with a KeyError traceback.
+    if len(sys.argv)<3 or sys.argv[1] not in commands:
+        print("usage: video_compose.py [frames|build|assets-dir] <proj-dir>"); sys.exit(1)
+    cmd, proj = sys.argv[1], Path(sys.argv[2]).resolve()
+    commands[cmd](proj)

package/.claude/skills/video-edit/SKILL.md CHANGED Viewed

@@ -14,6 +14,64 @@ owner_repo: Optima-Chat/optima-gen
 用户给原始视频，你交付剪好的成片。
+## Step 0：指令清单读回（≥ 2 个动作时必跑）
+**为什么必跑**：用户反馈"发给 AI 的多个指令总有漏掉，要重新生成"。LLM 直接读多指令时容易抓 1-2 个执行，剩下的被吞。**没有 checklist 也没有回读 → 用户只能等成片出来才发现漏了 → 全流程重做**。
+### 触发判定
+| 用户消息 | 是否必跑 Step 0 |
+|---|---|
+| "剪一下" / "加字幕" / "去卡顿"（单一动作） | 跳过 |
+| "剪一下，剪到 30 秒"（1 动作 + 1 修饰） | 跳过 |
+| "剪一下 + 加 banner 'XYZ'"（≥ 2 个动作） | **必跑** |
+| "字幕用白色 + 加 BGM + 压到 30 秒"（≥ 2 个动作） | **必跑** |
+| 任何**迭代消息**改 ≥ 2 项（"banner 换成 X，字幕换成 Y"） | **必跑** |
+模糊判定：宁可读回，不要漏。
+### 操作（在执行任何 `video-edit ...` 命令之前）
+1. **拆指令**：把用户消息切成原子动作清单（每条 ≤ 15 字）。常见类别：
+   - 剪辑：剪一下 / 去卡顿 / 剪到 30 秒 / 删开头 5 秒
+   - 字幕：加字幕 / 不要字幕 / 字幕用白色 / 字幕加大
+   - Banner：加 banner "XYZ" / 不要 banner / banner 换成 "ABC"
+   - 节奏：去 NG / 保留所有重复
+   - 输出：竖版 / 横版 / 不变
+2. **一条消息内输出读回 + 等确认**（**不要边读边动手**）：
+   ```
+   我理解你要做的：
+   1. 剪一下（默认去卡顿 + NG）
+   2. 加 banner "日发四万单"
+   3. 字幕用粗体白色
+   4. 压到 30 秒以内
+   全对回"开始"/"对"；要改/补告诉我哪条。
+   ```
+3. **用户确认后**：按清单顺序执行；每条对应到下文 skill 流程的具体步骤；执行完一条在内部 TaskList 标记 done；**全部 done 才报"成片完成"**。
+4. **交付汇报里逐条回扣**：
+   ```
+   成片 xxx_subbed.mp4 完成：
+   1. ✅ 剪一下 → 时长 28s（原 65s）
+   2. ✅ banner "日发四万单" 顶部
+   3. ✅ 字幕粗体白色已烧
+   4. ✅ 28s < 30s
+   ```
+### 严禁
+- ❌ 跳过 Step 0 直接执行 ≥ 2 个动作 — ≥ 2 动作时**附带的细粒度参数（"字幕 30pt"）**最容易被漏；不读回必发生
+- ❌ 读回时省掉某条 / 合并两条为"剩下默认" — 这两种做法都等同于漏掉指令
+- ❌ 全部执行完才在交付汇报里说"我做了 1/2/3" — 用户已经等了 5 分钟，这时发现漏掉要重做
+- ❌ 用户给迭代指令（"banner 换成 X，字幕加大"）时直接动手 — 迭代同样要读回
+**细粒度参数本身是否触发 Step 0**：单一动作 + 单一参数（"剪到 30 秒" / "字幕 30pt" 单独说）**不触发**，按上方触发表跳过。Step 0 是给 ≥ 2 个独立动作准备的；单参数请求按单动作走。
 ## 用户怎么说，你交付什么
 **默认所有剪辑都带字幕**——中文短视频 99% 需要字幕，"剪一下"的潜台词就是"给我能直接发的成片"。
@@ -188,7 +246,9 @@ video-edit subtitle <video>
 ### 用户要求调整时
-用户看完成片说某段不对：
+**如果是 ≥ 2 项调整**（"banner 换成 X，字幕加大"）→ **先回到 Step 0 读回 + 等确认**，再动手。迭代是漏指令的高发场景（上一轮的已生效项 + 这一轮的新要求容易混）。
+单项调整直接做：
 1. 编辑 `<video>.work/final_script.txt`（加/删内容）
 2. 删 `<video>.work/subs.ass`
 3. 重跑 `smart-cut` + `subtitle`
@@ -258,6 +318,7 @@ video-edit smart-cut <video>
 - ❌ 用户说"剪掉前 X 秒"/"剪掉中间一段"——定点裁剪不是去卡顿，告诉用户"我擅长去卡顿停顿，定点裁剪请用别的方式"
 - ❌ **跳过通读自查直接 smart-cut**——重复内容是用户最反感的问题，省这一步省不出去
 - ❌ **跳过 review 直接 subtitle**——subtitle 烧完字幕的成片重复不可逆，必须先通过 review
+- ❌ **多指令场景跳过 Step 0 读回**——用户给 ≥ 2 个动作时不读回必有漏指令
 ## 命令参考（你内部用）