@researai/deepscientist 1.5.11 → 1.5.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/README.md +8 -8
  2. package/bin/ds.js +375 -61
  3. package/docs/en/00_QUICK_START.md +55 -4
  4. package/docs/en/01_SETTINGS_REFERENCE.md +15 -0
  5. package/docs/en/02_START_RESEARCH_GUIDE.md +68 -4
  6. package/docs/en/09_DOCTOR.md +48 -4
  7. package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +21 -2
  8. package/docs/en/15_CODEX_PROVIDER_SETUP.md +382 -0
  9. package/docs/en/README.md +4 -0
  10. package/docs/zh/00_QUICK_START.md +54 -3
  11. package/docs/zh/01_SETTINGS_REFERENCE.md +15 -0
  12. package/docs/zh/02_START_RESEARCH_GUIDE.md +69 -3
  13. package/docs/zh/09_DOCTOR.md +48 -2
  14. package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +21 -2
  15. package/docs/zh/15_CODEX_PROVIDER_SETUP.md +383 -0
  16. package/docs/zh/README.md +4 -1
  17. package/package.json +2 -1
  18. package/pyproject.toml +1 -1
  19. package/src/deepscientist/__init__.py +1 -1
  20. package/src/deepscientist/bash_exec/monitor.py +7 -5
  21. package/src/deepscientist/bash_exec/service.py +84 -21
  22. package/src/deepscientist/channels/local.py +3 -3
  23. package/src/deepscientist/channels/qq.py +7 -7
  24. package/src/deepscientist/channels/relay.py +7 -7
  25. package/src/deepscientist/channels/weixin_ilink.py +90 -19
  26. package/src/deepscientist/cli.py +3 -0
  27. package/src/deepscientist/codex_cli_compat.py +117 -0
  28. package/src/deepscientist/config/models.py +1 -0
  29. package/src/deepscientist/config/service.py +173 -25
  30. package/src/deepscientist/daemon/app.py +314 -6
  31. package/src/deepscientist/doctor.py +1 -5
  32. package/src/deepscientist/mcp/server.py +124 -3
  33. package/src/deepscientist/prompts/builder.py +113 -11
  34. package/src/deepscientist/quest/service.py +247 -31
  35. package/src/deepscientist/runners/codex.py +132 -24
  36. package/src/deepscientist/runners/runtime_overrides.py +9 -0
  37. package/src/deepscientist/shared.py +33 -14
  38. package/src/prompts/connectors/qq.md +2 -1
  39. package/src/prompts/connectors/weixin.md +2 -1
  40. package/src/prompts/contracts/shared_interaction.md +4 -1
  41. package/src/prompts/system.md +59 -9
  42. package/src/skills/analysis-campaign/SKILL.md +46 -6
  43. package/src/skills/analysis-campaign/references/campaign-plan-template.md +21 -8
  44. package/src/skills/baseline/SKILL.md +1 -1
  45. package/src/skills/baseline/references/artifact-payload-examples.md +39 -0
  46. package/src/skills/decision/SKILL.md +1 -1
  47. package/src/skills/experiment/SKILL.md +1 -1
  48. package/src/skills/finalize/SKILL.md +1 -1
  49. package/src/skills/idea/SKILL.md +1 -1
  50. package/src/skills/intake-audit/SKILL.md +1 -1
  51. package/src/skills/rebuttal/SKILL.md +74 -1
  52. package/src/skills/rebuttal/references/response-letter-template.md +55 -11
  53. package/src/skills/review/SKILL.md +118 -1
  54. package/src/skills/review/references/experiment-todo-template.md +23 -0
  55. package/src/skills/review/references/review-report-template.md +16 -0
  56. package/src/skills/review/references/revision-log-template.md +4 -0
  57. package/src/skills/scout/SKILL.md +1 -1
  58. package/src/skills/write/SKILL.md +168 -7
  59. package/src/skills/write/references/paper-experiment-matrix-template.md +131 -0
  60. package/src/tui/dist/lib/connectorConfig.js +90 -0
  61. package/src/tui/dist/lib/qr.js +21 -0
  62. package/src/tui/package.json +2 -1
  63. package/src/ui/dist/assets/{AiManusChatView-D0mTXG4-.js → AiManusChatView-CnJcXynW.js} +12 -12
  64. package/src/ui/dist/assets/{AnalysisPlugin-Db0cTXxm.js → AnalysisPlugin-DeyzPEhV.js} +1 -1
  65. package/src/ui/dist/assets/{CliPlugin-DrV8je02.js → CliPlugin-CB1YODQn.js} +9 -9
  66. package/src/ui/dist/assets/{CodeEditorPlugin-QXMSCH71.js → CodeEditorPlugin-B-xicq1e.js} +8 -8
  67. package/src/ui/dist/assets/{CodeViewerPlugin-7hhtWj_E.js → CodeViewerPlugin-DT54ysXa.js} +5 -5
  68. package/src/ui/dist/assets/{DocViewerPlugin-BWMSnRJe.js → DocViewerPlugin-DQtKT-VD.js} +3 -3
  69. package/src/ui/dist/assets/{GitDiffViewerPlugin-7J9h9Vy_.js → GitDiffViewerPlugin-hqHbCfnv.js} +20 -20
  70. package/src/ui/dist/assets/{ImageViewerPlugin-CHJl_0lr.js → ImageViewerPlugin-OcVo33jV.js} +5 -5
  71. package/src/ui/dist/assets/{LabCopilotPanel-1qSow1es.js → LabCopilotPanel-DdGwhEUV.js} +11 -11
  72. package/src/ui/dist/assets/{LabPlugin-eQpPPCEp.js → LabPlugin-Ciz1gDaX.js} +2 -2
  73. package/src/ui/dist/assets/{LatexPlugin-BwRfi89Z.js → LatexPlugin-BhmjNQRC.js} +37 -11
  74. package/src/ui/dist/assets/{MarkdownViewerPlugin-836PVQWV.js → MarkdownViewerPlugin-BzdVH9Bx.js} +4 -4
  75. package/src/ui/dist/assets/{MarketplacePlugin-C2y_556i.js → MarketplacePlugin-DmyHspXt.js} +3 -3
  76. package/src/ui/dist/assets/{NotebookEditor-DIX7Mlzu.js → NotebookEditor-BMXKrDRk.js} +1 -1
  77. package/src/ui/dist/assets/{NotebookEditor-BRzJbGsn.js → NotebookEditor-BTVYRGkm.js} +11 -11
  78. package/src/ui/dist/assets/{PdfLoader-DzRaTAlq.js → PdfLoader-CvcjJHXv.js} +1 -1
  79. package/src/ui/dist/assets/{PdfMarkdownPlugin-DZUfIUnp.js → PdfMarkdownPlugin-DW2ej8Vk.js} +2 -2
  80. package/src/ui/dist/assets/{PdfViewerPlugin-BwtICzue.js → PdfViewerPlugin-CmlDxbhU.js} +10 -10
  81. package/src/ui/dist/assets/{SearchPlugin-DHeIAMsx.js → SearchPlugin-DAjQZPSv.js} +1 -1
  82. package/src/ui/dist/assets/{TextViewerPlugin-C3tCmFox.js → TextViewerPlugin-C-nVAZb_.js} +5 -5
  83. package/src/ui/dist/assets/{VNCViewer-CQsKVm3t.js → VNCViewer-D7-dIYon.js} +10 -10
  84. package/src/ui/dist/assets/{bot-BEA2vWuK.js → bot-C_G4WtNI.js} +1 -1
  85. package/src/ui/dist/assets/{code-XfbSR8K2.js → code-Cd7WfiWq.js} +1 -1
  86. package/src/ui/dist/assets/{file-content-BjxNaIfy.js → file-content-B57zsL9y.js} +1 -1
  87. package/src/ui/dist/assets/{file-diff-panel-D_lLVQk0.js → file-diff-panel-DVoheLFq.js} +1 -1
  88. package/src/ui/dist/assets/{file-socket-D9x_5vlY.js → file-socket-B5kXFxZP.js} +1 -1
  89. package/src/ui/dist/assets/{image-BhWT33W1.js → image-LLOjkMHF.js} +1 -1
  90. package/src/ui/dist/assets/{index-Dqj-Mjb4.css → index-BQG-1s2o.css} +40 -2
  91. package/src/ui/dist/assets/{index--c4iXtuy.js → index-C3r2iGrp.js} +12 -12
  92. package/src/ui/dist/assets/{index-DZTZ8mWP.js → index-CLQauncb.js} +911 -120
  93. package/src/ui/dist/assets/{index-PJbSbPTy.js → index-Dxa2eYMY.js} +1 -1
  94. package/src/ui/dist/assets/{index-BDxipwrC.js → index-hOUOWbW2.js} +2 -2
  95. package/src/ui/dist/assets/{monaco-K8izTGgo.js → monaco-BGGAEii3.js} +1 -1
  96. package/src/ui/dist/assets/{pdf-effect-queue-DfBors6y.js → pdf-effect-queue-DlEr1_y5.js} +1 -1
  97. package/src/ui/dist/assets/{popover-yFK1J4fL.js → popover-CWJbJuYY.js} +1 -1
  98. package/src/ui/dist/assets/{project-sync-PENr2zcz.js → project-sync-CRJiucYO.js} +18 -4
  99. package/src/ui/dist/assets/{select-CAbJDfYv.js → select-CoHB7pvH.js} +2 -2
  100. package/src/ui/dist/assets/{sigma-DEuYJqTl.js → sigma-D5aJWR8J.js} +1 -1
  101. package/src/ui/dist/assets/{square-check-big-omoSUmcd.js → square-check-big-DUK_mnkS.js} +1 -1
  102. package/src/ui/dist/assets/{trash--F119N47.js → trash-ChU3SEE3.js} +1 -1
  103. package/src/ui/dist/assets/{useCliAccess-D31UR23I.js → useCliAccess-BrJBV3tY.js} +1 -1
  104. package/src/ui/dist/assets/{useFileDiffOverlay-BH6KcMzq.js → useFileDiffOverlay-C2OQaVWc.js} +1 -1
  105. package/src/ui/dist/assets/{wrap-text-CZ613PM5.js → wrap-text-C7Qqh-om.js} +1 -1
  106. package/src/ui/dist/assets/{zoom-out-BgDLAv3z.js → zoom-out-rtX0FKya.js} +1 -1
  107. package/src/ui/dist/index.html +2 -2
--- a/package/src/deepscientist/runners/codex.py
+++ b/package/src/deepscientist/runners/codex.py
@@ -11,14 +11,20 @@ from pathlib import Path
 from typing import Any
 
 from ..artifact import ArtifactService
+from ..codex_cli_compat import adapt_profile_only_provider_config, normalize_codex_reasoning_effort
 from ..config import ConfigManager
 from ..gitops import export_git_graph
 from ..prompts import PromptBuilder
 from ..runtime_logs import JsonlLogger
-from ..shared import append_jsonl, ensure_dir, generate_id, read_yaml, resolve_runner_binary, utc_now, write_json, write_text
+from ..shared import append_jsonl, ensure_dir, generate_id, read_text, read_yaml, resolve_runner_binary, utc_now, write_json, write_text
 from ..web_search import extract_web_search_payload
 from .base import RunRequest, RunResult
 
+_TOOL_EVENT_ARGS_TEXT_LIMIT = 8_000
+_TOOL_EVENT_OUTPUT_TEXT_LIMIT = 16_000
+_MAX_QUEST_EVENT_JSON_BYTES = 2_000_000
+_OVERSIZED_EVENT_PREVIEW_TEXT_LIMIT = 12_000
+
 
 def _compact_text(value: object, *, limit: int = 1200) -> str:
     if value is None:
@@ -35,15 +41,94 @@ def _compact_text(value: object, *, limit: int = 1200) -> str:
     return text[: limit - 1].rstrip() + "…"
 
 
-def _structured_text(value: object) -> str:
+def _truncate_leaf_text(text: str, *, limit: int) -> str:
+    if limit <= 0 or len(text) <= limit:
+        return text
+    head = max(int(limit * 0.7), 256)
+    tail = max(limit - head - 64, 128)
+    omitted = max(len(text) - head - tail, 0)
+    return f"{text[:head].rstrip()}\n...[truncated {omitted} chars]...\n{text[-tail:].lstrip()}"
+
+
+def _truncate_structured_value(value: object, *, string_limit: int) -> object:
+    if isinstance(value, str):
+        return _truncate_leaf_text(value.strip(), limit=string_limit)
+    if isinstance(value, list):
+        return [_truncate_structured_value(item, string_limit=string_limit) for item in value[:200]]
+    if isinstance(value, dict):
+        truncated: dict[object, object] = {}
+        for index, (key, item) in enumerate(value.items()):
+            if index >= 200:
+                truncated["__truncated__"] = f"truncated remaining {len(value) - 200} item(s)"
+                break
+            truncated[key] = _truncate_structured_value(item, string_limit=string_limit)
+        return truncated
+    return value
+
+
+def _structured_text(value: object, *, limit: int | None = None) -> str:
     if value is None:
         return ""
     if isinstance(value, str):
-        return value.strip()
+        return _truncate_leaf_text(value.strip(), limit=limit or len(value))
+    normalized_value = _truncate_structured_value(value, string_limit=max(limit or _TOOL_EVENT_OUTPUT_TEXT_LIMIT, 512))
     try:
-        return json.dumps(value, ensure_ascii=False, indent=2)
+        return json.dumps(normalized_value, ensure_ascii=False, indent=2)
     except TypeError:
-        return str(value)
+        return _truncate_leaf_text(str(value), limit=limit or _TOOL_EVENT_OUTPUT_TEXT_LIMIT)
+
+
+def _encoded_json_size(value: object) -> int:
+    try:
+        return len(json.dumps(value, ensure_ascii=False).encode("utf-8"))
+    except Exception:
+        return len(str(value).encode("utf-8", errors="ignore"))
+
+
+def _compact_tool_event_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    if _encoded_json_size(payload) <= _MAX_QUEST_EVENT_JSON_BYTES:
+        return payload
+
+    compacted = dict(payload)
+    output_text = str(compacted.get("output") or "")
+    if output_text:
+        compacted["output_bytes"] = len(output_text.encode("utf-8", errors="ignore"))
+        compacted["output"] = _truncate_leaf_text(
+            output_text,
+            limit=_OVERSIZED_EVENT_PREVIEW_TEXT_LIMIT,
+        )
+        compacted["output_truncated"] = True
+    args_text = str(compacted.get("args") or "")
+    if args_text and _encoded_json_size(compacted) > _MAX_QUEST_EVENT_JSON_BYTES:
+        compacted["args"] = _truncate_leaf_text(args_text, limit=4_000)
+        compacted["args_truncated"] = True
+    if _encoded_json_size(compacted) > _MAX_QUEST_EVENT_JSON_BYTES:
+        metadata = compacted.get("metadata")
+        if isinstance(metadata, dict):
+            allowed_keys = {
+                "mcp_server",
+                "mcp_tool",
+                "bash_id",
+                "status",
+                "command",
+                "workdir",
+                "cwd",
+                "started_at",
+                "finished_at",
+                "exit_code",
+                "stop_reason",
+                "log_path",
+            }
+            compacted["metadata"] = {
+                key: metadata.get(key)
+                for key in allowed_keys
+                if key in metadata
+            }
+            compacted["metadata_truncated"] = True
+    if _encoded_json_size(compacted) > _MAX_QUEST_EVENT_JSON_BYTES:
+        compacted["output"] = _compact_text(compacted.get("output"), limit=2_000)
+        compacted["output_truncated"] = True
+    return compacted
 
 
 def _iter_event_texts(event: dict[str, Any]) -> list[str]:
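The new head/tail truncation keeps both ends of an oversized string and records how much was dropped in between. A standalone sketch of the added helper (renamed without the leading underscore; behavior copied from the hunk above):

```python
def truncate_leaf_text(text: str, *, limit: int) -> str:
    # Mirrors the diff's _truncate_leaf_text: keep ~70% of the budget as head,
    # reserve the rest (minus marker overhead) for the tail, note what was cut.
    if limit <= 0 or len(text) <= limit:
        return text
    head = max(int(limit * 0.7), 256)
    tail = max(limit - head - 64, 128)
    omitted = max(len(text) - head - tail, 0)
    return f"{text[:head].rstrip()}\n...[truncated {omitted} chars]...\n{text[-tail:].lstrip()}"

short = truncate_leaf_text("hello", limit=100)           # under the limit: unchanged
long_out = truncate_leaf_text("x" * 10_000, limit=1_000)  # head + marker + tail
```

Note that for small limits the head/tail floors (256 and 128 characters) can make the result slightly longer than `limit`; the helper trades strict budgeting for always keeping context from both ends.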
@@ -209,7 +294,7 @@ def _tool_args(event: dict[str, Any], item: dict[str, Any]) -> str:
         item.get("input"),
         event.get("input"),
     ):
-        text = _structured_text(value)
+        text = _structured_text(value, limit=_TOOL_EVENT_ARGS_TEXT_LIMIT)
         if text:
             return text
     return ""
@@ -243,7 +328,7 @@ def _tool_output(event: dict[str, Any], item: dict[str, Any]) -> str:
         item.get("aggregated_output"),
         event.get("aggregated_output"),
     ):
-        text = _structured_text(value)
+        text = _structured_text(value, limit=_TOOL_EVENT_OUTPUT_TEXT_LIMIT)
         if text:
             return text
     return ""
@@ -361,7 +446,7 @@ def _tool_event(
                 "raw_event_type": event_type,
                 "created_at": created_at,
             }
-        return {
+        return _compact_tool_event_payload({
             "event_id": generate_id("evt"),
             "type": "runner.tool_result",
             "quest_id": quest_id,
@@ -375,7 +460,7 @@
             "output": _tool_output(event, item),
             "raw_event_type": event_type,
             "created_at": created_at,
-        }
+        })
 
     if item_type == "web_search":
         tool_call_id = _tool_call_id(event, item)
@@ -399,7 +484,7 @@
                 "raw_event_type": event_type,
                 "created_at": created_at,
             }
-        return {
+        return _compact_tool_event_payload({
             "event_id": generate_id("evt"),
             "type": "runner.tool_result",
             "quest_id": quest_id,
@@ -414,13 +499,13 @@
             "metadata": metadata,
             "raw_event_type": event_type,
             "created_at": created_at,
-        }
+        })
 
     if item_type == "file_change":
         tool_call_id = _tool_call_id(event, item)
         tool_name = "file_change"
         known_tool_names[tool_call_id] = tool_name
-        return {
+        return _compact_tool_event_payload({
             "event_id": generate_id("evt"),
             "type": "runner.tool_result",
             "quest_id": quest_id,
@@ -433,7 +518,7 @@
             "output": _tool_output(event, item),
             "raw_event_type": event_type,
             "created_at": created_at,
-        }
+        })
 
     if item_type == "mcp_tool_call":
         tool_call_id = _tool_call_id(event, item)
@@ -466,7 +551,7 @@
                 "raw_event_type": event_type,
                 "created_at": created_at,
             }
-        return {
+        return _compact_tool_event_payload({
             "event_id": generate_id("evt"),
             "type": "runner.tool_result",
             "quest_id": quest_id,
@@ -483,7 +568,7 @@
             "metadata": metadata,
             "raw_event_type": event_type,
             "created_at": created_at,
-        }
+        })
 
     if item_type in {"function_call", "custom_tool_call", "tool_call"} or "function_call" in event_type or "tool_call" in event_type:
         tool_call_id = _tool_call_id(event, item)
@@ -507,7 +592,7 @@
     if item_type in {"function_call_output", "custom_tool_call_output", "tool_result", "tool_call_output"} or "function_call_output" in event_type or "tool_result" in event_type:
         tool_call_id = _tool_call_id(event, item)
         tool_name = known_tool_names.get(tool_call_id) or _tool_name(event, item)
-        return {
+        return _compact_tool_event_payload({
            "event_id": generate_id("evt"),
            "type": "runner.tool_result",
            "quest_id": quest_id,
@@ -521,7 +606,7 @@
             "output": _tool_output(event, item),
             "raw_event_type": event_type,
             "created_at": created_at,
-        }
+        })
 
     return None
 
@@ -582,6 +667,12 @@ class CodexRunner:
         )
 
         env = dict(**os.environ)
+        runner_env = runner_config.get("env") if isinstance(runner_config.get("env"), dict) else {}
+        for key, value in runner_env.items():
+            env_key = str(key or "").strip()
+            if not env_key or value is None:
+                continue
+            env[env_key] = str(value)
         env["CODEX_HOME"] = str(codex_home)
         env["DEEPSCIENTIST_HOME"] = str(self.home)
         env["DS_HOME"] = str(self.home)
@@ -809,21 +900,31 @@
         workspace_root = request.worktree_root or request.quest_root
         resolved_binary = resolve_runner_binary(self.binary, runner_name="codex")
         resolved_runner_config = runner_config if isinstance(runner_config, dict) else self._load_runner_config()
+        profile = str(resolved_runner_config.get("profile") or "").strip()
         normalized_model = str(request.model or "").strip()
         command = [
             resolved_binary or self.binary,
             "--search",
-            "exec",
-            "--json",
-            "--cd",
-            str(workspace_root),
-            "--skip-git-repo-check",
         ]
+        if profile:
+            command.extend(["--profile", profile])
+        command.extend(
+            [
+                "exec",
+                "--json",
+                "--cd",
+                str(workspace_root),
+                "--skip-git-repo-check",
+            ]
+        )
         if normalized_model.lower() not in {"", "inherit", "default", "codex-default"}:
             command.extend(["--model", normalized_model])
         if request.approval_policy:
             command.extend(["-c", f'approval_policy="{request.approval_policy}"'])
-        reasoning_effort = request.reasoning_effort
+        reasoning_effort, _ = normalize_codex_reasoning_effort(
+            request.reasoning_effort,
+            resolved_binary=resolved_binary or self.binary,
+        )
         if reasoning_effort:
             command.extend(["-c", f'model_reasoning_effort="{reasoning_effort}"'])
         tool_timeout_sec = self._positive_timeout_seconds(resolved_runner_config.get("mcp_tool_timeout_sec"))
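With a profile configured, the global `--profile` flag now lands before the `exec` subcommand rather than after it. A minimal sketch of the resulting argv, assuming the flag ordering shown in the hunk above (`build_command`, the profile name `work`, and the model name are hypothetical stand-ins, not the package's API):

```python
def build_command(binary: str, profile: str, workspace_root: str, model: str = "") -> list[str]:
    # Global flags (--search, --profile) must precede the exec subcommand.
    command = [binary, "--search"]
    if profile:
        command.extend(["--profile", profile])
    command.extend(["exec", "--json", "--cd", str(workspace_root), "--skip-git-repo-check"])
    # "inherit"-style sentinels fall through to the runner's default model.
    if model.strip().lower() not in {"", "inherit", "default", "codex-default"}:
        command.extend(["--model", model])
    return command

with_profile = build_command("codex", "work", "/tmp/ws", model="inherit")
no_profile = build_command("codex", "", "/tmp/ws", model="my-model")
```

The ordering matters because subcommand parsers typically stop accepting global flags once `exec` is seen, which is why the diff moves the `exec ...` tail into a second `extend` after the optional profile flags.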
@@ -846,7 +947,10 @@
         runner_config: dict[str, Any] | None = None,
     ) -> Path:
         target = ensure_dir(workspace_root / ".codex")
-        source = Path(os.environ.get("CODEX_HOME", str(Path.home() / ".codex"))).expanduser()
+        resolved_runner_config = runner_config if isinstance(runner_config, dict) else self._load_runner_config()
+        configured_home = str(resolved_runner_config.get("config_dir") or os.environ.get("CODEX_HOME") or str(Path.home() / ".codex"))
+        profile = str(resolved_runner_config.get("profile") or "").strip()
+        source = Path(configured_home).expanduser()
         for filename in ("config.toml", "auth.json"):
             source_path = source / filename
             target_path = target / filename
@@ -854,6 +958,10 @@
             if source_path.resolve() == target_path.resolve():
                 continue
             shutil.copy2(source_path, target_path)
+        config_path = target / "config.toml"
+        if profile and config_path.exists():
+            adapted_text, _ = adapt_profile_only_provider_config(read_text(config_path), profile=profile)
+            write_text(config_path, adapted_text)
         ensure_dir(target / "skills")
         quest_skills_root = quest_root / ".codex" / "skills"
         if quest_skills_root.exists():
--- a/package/src/deepscientist/runners/runtime_overrides.py
+++ b/package/src/deepscientist/runners/runtime_overrides.py
@@ -18,18 +18,27 @@ def _as_bool_env(name: str) -> bool:
 
 
 def codex_runtime_overrides() -> dict[str, str]:
+    binary = _as_text(os.environ.get("DEEPSCIENTIST_CODEX_BINARY") or os.environ.get("DS_CODEX_BINARY"))
     approval_policy = _as_text(os.environ.get("DEEPSCIENTIST_CODEX_APPROVAL_POLICY"))
     sandbox_mode = _as_text(os.environ.get("DEEPSCIENTIST_CODEX_SANDBOX_MODE"))
+    profile = _as_text(os.environ.get("DEEPSCIENTIST_CODEX_PROFILE"))
+    model = _as_text(os.environ.get("DEEPSCIENTIST_CODEX_MODEL"))
 
     if _as_bool_env("DEEPSCIENTIST_CODEX_YOLO"):
         approval_policy = approval_policy or "never"
         sandbox_mode = sandbox_mode or "danger-full-access"
 
     overrides: dict[str, str] = {}
+    if binary:
+        overrides["binary"] = binary
     if approval_policy:
         overrides["approval_policy"] = approval_policy
     if sandbox_mode:
         overrides["sandbox_mode"] = sandbox_mode
+    if profile:
+        overrides["profile"] = profile
+    if model:
+        overrides["model"] = model
     return overrides
 
 
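The override resolution above can be exercised in isolation. This sketch mirrors the new environment handling but takes the environment mapping as a parameter for testability (an assumption for the demo; the real `codex_runtime_overrides()` reads `os.environ` directly and also handles approval/sandbox keys):

```python
def _as_text(value: object) -> str:
    # Same normalization as the diff: None and whitespace collapse to "".
    return str(value or "").strip()

def codex_runtime_overrides(environ: dict[str, str]) -> dict[str, str]:
    # Only variables that are set and non-empty produce an override entry.
    overrides: dict[str, str] = {}
    binary = _as_text(environ.get("DEEPSCIENTIST_CODEX_BINARY") or environ.get("DS_CODEX_BINARY"))
    if binary:
        overrides["binary"] = binary
    profile = _as_text(environ.get("DEEPSCIENTIST_CODEX_PROFILE"))
    if profile:
        overrides["profile"] = profile
    model = _as_text(environ.get("DEEPSCIENTIST_CODEX_MODEL"))
    if model:
        overrides["model"] = model
    return overrides

# A whitespace-only value is treated the same as unset.
demo = codex_runtime_overrides({"DEEPSCIENTIST_CODEX_PROFILE": "work", "DEEPSCIENTIST_CODEX_MODEL": "  "})
```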
--- a/package/src/deepscientist/shared.py
+++ b/package/src/deepscientist/shared.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import deque
 import hashlib
 import json
 import os
@@ -9,7 +10,7 @@ import subprocess
 import sys
 from datetime import UTC, datetime
 from pathlib import Path
-from typing import Any
+from typing import Any, Iterator
 from uuid import uuid4
 
 try:
@@ -90,21 +91,39 @@ def append_jsonl(path: Path, payload: dict[str, Any]) -> None:
         handle.write(json.dumps(payload, ensure_ascii=False) + "\n")
 
 
-def read_jsonl(path: Path) -> list[dict[str, Any]]:
+def iter_jsonl(path: Path | str) -> Iterator[dict[str, Any]]:
+    path = Path(path)
     if not path.exists():
+        return
+    with path.open("r", encoding="utf-8") as handle:
+        for raw_line in handle:
+            line = raw_line.strip()
+            if not line:
+                continue
+            try:
+                payload = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(payload, dict):
+                yield payload
+
+
+def read_jsonl(path: Path) -> list[dict[str, Any]]:
+    return list(iter_jsonl(path))
+
+
+def count_jsonl(path: Path | str) -> int:
+    return sum(1 for _ in iter_jsonl(path))
+
+
+def read_jsonl_tail(path: Path | str, limit: int) -> list[dict[str, Any]]:
+    normalized_limit = max(int(limit or 0), 0)
+    if normalized_limit <= 0:
         return []
-    items: list[dict[str, Any]] = []
-    for line in path.read_text(encoding="utf-8").splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            payload = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-        if isinstance(payload, dict):
-            items.append(payload)
-    return items
+    items: deque[dict[str, Any]] = deque(maxlen=normalized_limit)
+    for payload in iter_jsonl(path):
+        items.append(payload)
+    return list(items)
 
 
 def read_yaml(path: Path, default: Any = None) -> Any:
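The new `read_jsonl_tail` keeps memory bounded by streaming the file line by line and retaining only the last `limit` records in a `deque(maxlen=...)`, instead of materializing the whole file as the old `read_jsonl` did. A self-contained sketch of the same pattern:

```python
import json
from collections import deque
from pathlib import Path
from tempfile import TemporaryDirectory

def read_jsonl_tail(path: Path, limit: int) -> list[dict]:
    # Bounded-memory tail read: the deque silently drops the oldest record
    # once it holds `limit` items, so only the tail survives the scan.
    items: deque = deque(maxlen=max(int(limit or 0), 0))
    if not path.exists() or items.maxlen == 0:
        return []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            try:
                payload = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines, matching the diff's behavior
            if isinstance(payload, dict):
                items.append(payload)
    return list(items)

with TemporaryDirectory() as tmp:
    log = Path(tmp) / "events.jsonl"
    log.write_text("\n".join(json.dumps({"i": i}) for i in range(100)), encoding="utf-8")
    tail = read_jsonl_tail(log, 3)
```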
--- a/package/src/prompts/connectors/qq.md
+++ b/package/src/prompts/connectors/qq.md
@@ -10,7 +10,8 @@
 - qq_summary_first_rule: start with the conclusion the user cares about, then what it means, then the next action
 - qq_progress_shape_rule: make the current task, the main difficulty or latest real progress, and the next concrete measure explicit whenever possible
 - qq_eta_rule: for baseline reproduction, main experiments, analysis experiments, and other important long-running research phases, include a rough ETA for the next meaningful result or the next update; if uncertain, say that and still give the next check-in window
-- qq_tool_call_keepalive_rule: for ordinary active work, prefer one concise QQ progress update after roughly 10 tool calls when there is already a human-meaningful delta, and do not let work drift beyond roughly 20 tool calls or about 15 minutes without a user-visible checkpoint
+- qq_tool_call_keepalive_rule: for ordinary active work, prefer one concise QQ progress update after roughly 6 tool calls when there is already a human-meaningful delta, and do not let work drift beyond roughly 12 tool calls or about 8 minutes without a user-visible checkpoint
+- qq_read_plan_keepalive_rule: if the active work is still mostly reading, comparison, or planning, do not wait too long for a "big result"; send a short QQ-facing checkpoint after about 5 consecutive tool calls if the user would otherwise see silence
 - qq_internal_detail_rule: omit worker names, heartbeat timestamps, retry counters, pending/running/completed counts, file names, and monitor-window narration unless the user asked for them or the detail changes the recommended action
 - qq_translation_rule: convert internal execution and file-management work into user value, such as saying the baseline record is now organized for easier later comparison instead of listing touched files
 - qq_preflight_rule: before sending a QQ progress update, rewrite it if it still sounds like a monitoring log, execution diary, or file inventory
--- a/package/src/prompts/connectors/weixin.md
+++ b/package/src/prompts/connectors/weixin.md
@@ -10,7 +10,8 @@
 - weixin_summary_first_rule: start with the user-facing conclusion, then what it means, then the next action
 - weixin_progress_shape_rule: make the current task, the main difficulty or latest real progress, and the next concrete measure explicit whenever possible
 - weixin_eta_rule: for important long-running phases such as baseline reproduction, main experiments, analysis, or paper packaging, include a rough ETA or next check-in window when you can
-- weixin_tool_call_keepalive_rule: for ordinary active work, prefer one concise Weixin progress update after roughly 10 tool calls when there is already a human-meaningful delta, and do not let work drift beyond roughly 20 tool calls or about 15 minutes without a user-visible checkpoint
+- weixin_tool_call_keepalive_rule: for ordinary active work, prefer one concise Weixin progress update after roughly 6 tool calls when there is already a human-meaningful delta, and do not let work drift beyond roughly 12 tool calls or about 8 minutes without a user-visible checkpoint
+- weixin_read_plan_keepalive_rule: if the active work is still mostly reading, comparison, or planning, do not wait too long for a "big result"; send a short Weixin-facing checkpoint after about 5 consecutive tool calls if the user would otherwise see silence
 - weixin_internal_detail_rule: omit worker names, retry counters, pending/running/completed counts, low-level file listings, and monitor-window narration unless the user explicitly asked for them or they change the recommended action
 - weixin_translation_rule: translate internal execution and file-management work into user value instead of narrating tool or filesystem churn
 - weixin_preflight_rule: before sending a Weixin-facing progress update, rewrite it if it still reads like a monitor log, execution diary, or file inventory
--- a/package/src/prompts/contracts/shared_interaction.md
+++ b/package/src/prompts/contracts/shared_interaction.md
@@ -7,7 +7,10 @@ This shared contract is injected once per turn and applies across the stage and
 - Treat `artifact.interact(...)` as the main long-lived communication thread across TUI, web, and bound connectors.
 - If `artifact.interact(...)` returns queued user requirements, treat them as the highest-priority user instruction bundle before continuing the current stage or companion-skill task.
 - Immediately follow any non-empty mailbox poll with another `artifact.interact(...)` update that confirms receipt; if the request is directly answerable, answer there, otherwise say the current subtask is paused, give a short plan plus nearest report-back point, and handle that request first.
-- Emit `artifact.interact(kind='progress', reply_mode='threaded', ...)` when there is real user-visible progress: a meaningful checkpoint, route-shaping update, or a concise keepalive once active work has crossed roughly 10 tool calls with a human-meaningful delta. Do not let ordinary active work drift beyond roughly 20 tool calls or about 15 minutes without a user-visible update.
+- Stage-kickoff rule: after entering any stage or companion skill, send one `artifact.interact(kind='progress', reply_mode='threaded', ...)` update within the first 3 tool calls of substantial work.
+- Reading/planning keepalive rule: if you spend 5 consecutive tool calls on reading, searching, comparison, or planning without a user-visible update, send one concise checkpoint even if the route is not finalized yet.
+- Subtask-boundary rule: send a user-visible update whenever the active subtask changes materially, especially across intake -> audit, audit -> experiment planning, experiment planning -> run launch, run result -> drafting, or drafting -> review/rebuttal.
+- Emit `artifact.interact(kind='progress', reply_mode='threaded', ...)` when there is real user-visible progress: a meaningful checkpoint, route-shaping update, or a concise keepalive once active work has crossed roughly 6 tool calls with a human-meaningful delta. Do not let ordinary active work drift beyond roughly 12 tool calls or about 8 minutes without a user-visible update.
 - Keep progress updates chat-like and easy to understand: say what changed, what it means, and what happens next.
 - Default to plain-language summaries. Do not mention file paths, artifact ids, branch/worktree ids, session ids, raw commands, or raw logs unless the user asks or needs them to act.
 - Use `reply_mode='blocking'` only for real user decisions that cannot be resolved from local evidence.
@@ -53,7 +53,7 @@ Your job is to keep a research quest moving forward in a durable, auditable, evi
53
53
  - for ordinary progress replies, usually stay within 2 to 4 short sentences or 3 short bullets at most
54
54
  - start with the conclusion the user cares about, then what it means, then the next action
55
55
  - for baseline reproduction, main experiments, analysis experiments, and similar long-running research phases, also tell the user roughly how long until the next meaningful result, next step, or next update
56
- - for ordinary active multi-step work, prefer a concise update once active work has crossed about 10 tool calls and there is already a human-meaningful delta, and do not disappear for more than about 20 tool calls or about 15 minutes of active foreground work without a user-visible update unless a real milestone is imminent
56
+ - for ordinary active multi-step work, prefer a concise update once active work has crossed about 6 tool calls and there is already a human-meaningful delta, and do not disappear for more than about 12 tool calls or about 8 minutes of active foreground work without a user-visible update unless a real milestone is imminent
57
57
  - do not spam internal tool chatter, raw diffs, or every small checkpoint
58
58
  - do not proactively enumerate file paths, file inventories, or low-level file details unless the user explicitly asks
59
59
  - do not proactively expose worker names, heartbeat timestamps, retry counters, pending/running/completed counts, or monitor-window narration unless that detail changes the recommended action or is required for honesty about risk
@@ -203,7 +203,7 @@ When you send user-facing updates (especially via `artifact.interact(...)`), wri
  - what task you are currently working on
  - what the main difficulty, risk, or latest real progress is
  - what concrete next step or mitigation you will take
- - for ordinary active multi-step work, if no natural milestone arrives, prefer a short progress update once active work has crossed about 10 tool calls and there is already a human-meaningful delta, and do not drift beyond about 20 tool calls or about 15 minutes of active foreground work without any user-visible checkpoint
+ - for ordinary active multi-step work, if no natural milestone arrives, prefer a short progress update once active work has crossed about 6 tool calls and there is already a human-meaningful delta, and do not drift beyond about 12 tool calls or about 8 minutes of active foreground work without any user-visible checkpoint
  - for baseline reproduction, main experiments, analysis experiments, and similar long-running phases, also make the timing expectation explicit:
  - roughly how long until the next meaningful result, next milestone, or next update, usually within a 10 to 30 minute window
  - if runtime is uncertain, say that directly and give the next check-in window instead of pretending to know an exact ETA
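The three-part update shape described above can be sketched as a tiny formatter. This is an illustrative helper only: `artifact.interact(kind='progress', reply_mode='threaded', ...)` appears elsewhere in this prompt, but the function name and its message layout here are invented.

```python
def progress_update(task: str, status: str, next_step: str) -> dict:
    """Compose the three-part status the guidance asks for:
    current task, main difficulty or latest real progress, and
    the concrete next step or mitigation."""
    text = (
        f"Working on: {task}\n"
        f"Status: {status}\n"
        f"Next: {next_step}"
    )
    # In this sketch the dict would be splatted into the call site,
    # e.g. artifact.interact(**progress_update(...)).
    return {"kind": "progress", "reply_mode": "threaded", "text": text}
```

The point is that an ordinary update stays this small; richer milestone reports carry more structure.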
@@ -463,9 +463,12 @@ Each milestone update should usually state:
  Cadence defaults for ordinary active work:

  - treat `artifact.interact(...)` as the default user-visible heartbeat rather than an optional extra
- - soft trigger: after about 10 tool calls, if there is already a human-meaningful delta, send `artifact.interact(kind='progress', reply_mode='threaded', ...)`
- - hard trigger: do not exceed about 20 tool calls without a user-visible `artifact.interact(...)` update during active foreground work
- - time trigger: do not exceed about 15 minutes of active foreground work without a user-visible update, even if the tool-call count stayed low
+ - stage-kickoff trigger: after entering any stage or companion skill, send one `artifact.interact(kind='progress', reply_mode='threaded', ...)` update within the first 3 tool calls of substantial work
+ - reading/planning trigger: if you spend about 5 consecutive tool calls on reading, searching, comparison, or planning without a user-visible update, send one concise checkpoint even if the route is not finalized yet
+ - boundary trigger: send a user-visible update whenever the active subtask changes materially, especially across intake -> audit, audit -> experiment planning, experiment planning -> run launch, run result -> drafting, or drafting -> review/rebuttal
+ - soft trigger: after about 6 tool calls, if there is already a human-meaningful delta, send `artifact.interact(kind='progress', reply_mode='threaded', ...)`
+ - hard trigger: do not exceed about 12 tool calls without a user-visible `artifact.interact(...)` update during active foreground work
+ - time trigger: do not exceed about 8 minutes of active foreground work without a user-visible update, even if the tool-call count stayed low
  - immediate trigger: send a user-visible update as soon as a real blocker, recovery, route change, branch/worktree switch, baseline gate change, selected idea, recorded main experiment, or user-priority interruption becomes clear
  - de-duplication rule: do not send another ordinary progress update within about 2 additional tool calls or about 90 seconds unless a real milestone, blocker, route change, or new user message makes that extra update genuinely useful
  - keep ordinary subtask completions short; reserve richer milestone reports for stage-significant deliverables and route-changing checkpoints instead of narrating every small setup step
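The numeric triggers in the cadence defaults can be sketched as a small counter-and-clock tracker. This is a minimal illustration, not the package's implementation: the class, constant names, and the exact reading of the de-duplication window (suppress until either 2 more calls or 90 seconds have passed) are all assumptions.

```python
import time

# Thresholds taken from the cadence defaults above; names are invented.
SOFT_CALLS, HARD_CALLS, TIME_LIMIT_S = 6, 12, 8 * 60
DEDUP_CALLS, DEDUP_S = 2, 90


class CadenceTracker:
    """Track tool calls and wall time since the last user-visible update."""

    def __init__(self):
        self.calls_since_update = 0
        self.last_update_ts = time.monotonic()

    def record_tool_call(self):
        self.calls_since_update += 1

    def should_update(self, has_meaningful_delta: bool) -> bool:
        elapsed = time.monotonic() - self.last_update_ts
        # De-duplication: suppress back-to-back ordinary updates until
        # either a couple more calls or the time window has passed.
        if self.calls_since_update < DEDUP_CALLS and elapsed < DEDUP_S:
            return False
        # Hard and time triggers fire regardless of delta quality.
        if self.calls_since_update >= HARD_CALLS or elapsed >= TIME_LIMIT_S:
            return True
        # Soft trigger additionally needs a human-meaningful delta.
        return self.calls_since_update >= SOFT_CALLS and has_meaningful_delta

    def mark_updated(self):
        self.calls_since_update = 0
        self.last_update_ts = time.monotonic()
```

Immediate triggers (blockers, route changes, and the like) would bypass this tracker entirely in such a design.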
@@ -1080,9 +1083,10 @@ For `artifact.interact(...)` specifically:
  - raw logs
  - internal tool names
  - mention those details only if the user asked for them or needs them to act on the message
- - during active work, emit `artifact.interact(kind='progress', ...)` at real human-meaningful checkpoints; if no natural checkpoint appears, prefer sending one once active work has crossed about 10 tool calls and there is already a human-meaningful delta, and do not drift beyond about 20 tool calls or about 15 minutes of active foreground work without a user-visible update
+ - during active work, emit `artifact.interact(kind='progress', ...)` at real human-meaningful checkpoints; if no natural checkpoint appears, prefer sending one once active work has crossed about 6 tool calls and there is already a human-meaningful delta, and do not drift beyond about 12 tool calls or about 8 minutes of active foreground work without a user-visible update
  - during long active execution, after the first meaningful signal from long-running work, keep the user informed and never let active user-relevant work go more than 30 minutes without a real progress inspection and, if still running, a user-visible keepalive
- - do not send another ordinary progress update within about 2 additional tool calls or about 90 seconds unless a milestone, blocker, route change, or new user message makes it genuinely useful
+ - if the active work is still mostly reading, comparison, synthesis, or planning, do not hide behind "no result yet"; send a short user-visible checkpoint after about 5 consecutive tool calls if the user would otherwise see silence
+ - do not send another ordinary progress update within about 2 additional tool calls or about 60 seconds unless a milestone, blocker, route change, or new user message makes it genuinely useful
  - each ordinary progress update should usually answer only:
  - what changed
  - what it means now
@@ -1321,7 +1325,7 @@ If the field is absent, default to `freeform`.
  When `launch_mode = custom`:

  - do not force the quest back into the canonical full-research path if the custom brief is narrower
- - treat `entry_state_summary`, `review_summary`, and `custom_brief` as real startup context rather than decorative metadata
+ - treat `entry_state_summary`, `review_summary`, `review_materials`, and `custom_brief` as real startup context rather than decorative metadata
  - if the quest clearly starts from existing baseline / result / draft state, open `intake-audit` before restarting baseline discovery or fresh experimentation
  - if the quest clearly starts from reviewer comments, a revision request, or a rebuttal packet, open `rebuttal` before ordinary `write`
  - after the custom entry skill stabilizes the route, continue through the normal stage skills as needed
@@ -1331,12 +1335,58 @@ When `custom_profile = continue_existing_state`:
  - assume the quest may already contain reusable baselines, measured results, analysis assets, or writing assets
  - audit and trust-rank those assets first instead of reflexively rerunning everything

+ When `custom_profile = review_audit`:
+
+ - assume the active contract is a substantial draft or paper package that needs an independent skeptical audit
+ - open `review` before more writing or finalization
+ - if the audit finds real gaps, route to the needed downstream skill instead of polishing blindly
+
+ When `startup_contract.review_followup_policy = auto_execute_followups`:
+
+ - after review artifacts are durable, continue automatically into the required experiments, manuscript deltas, and review-closure work
+ - do not stop at the audit report if the route is already clear
+
+ When `startup_contract.review_followup_policy = user_gated_followups`:
+
+ - finish the review artifacts first
+ - then raise one structured decision before expensive experiments or manuscript revisions continue
+
+ When `startup_contract.review_followup_policy = audit_only`:
+
+ - stop after the durable audit artifacts and route recommendation unless the user later asks for execution follow-up
+
  When `custom_profile = revision_rebuttal`:

  - assume the active contract is a paper-review workflow rather than a blank research loop
  - preserve the existing paper, results, and reviewer package as the starting state
  - route supplementary experiments through `analysis-campaign` and manuscript deltas through `write`, but let `rebuttal` orchestrate that mapping

+ When `startup_contract.baseline_execution_policy = must_reproduce_or_verify`:
+
+ - explicitly verify or recover the rebuttal-critical baseline or comparator before reviewer-linked follow-up work
+
+ When `startup_contract.baseline_execution_policy = reuse_existing_only`:
+
+ - trust the current confirmed baseline/results unless you find concrete inconsistency, corruption, or missing-evidence problems
+
+ When `startup_contract.baseline_execution_policy = skip_unless_blocking`:
+
+ - do not spend time rerunning baselines by default
+ - only open `baseline` if a named review/rebuttal issue truly depends on a missing comparator or unusable prior evidence
+
+ When `startup_contract.manuscript_edit_mode = latex_required`:
+
+ - if manuscript revision is required, treat the provided LaTeX tree or `paper/latex/` as the writing surface
+ - if LaTeX source is unavailable, do not pretend the manuscript was edited; produce LaTeX-ready replacement text and state the blocker explicitly
+
+ When `startup_contract.manuscript_edit_mode = copy_ready_text`:
+
+ - provide section-level copy-ready replacement text and explicit deltas when manuscript revision is required
+
+ When `startup_contract.manuscript_edit_mode = none`:
+
+ - revision planning artifacts are sufficient unless the user later broadens scope
+
  When `custom_profile = freeform`:

  - treat the custom brief as the primary scope contract
@@ -2078,7 +2128,7 @@ When summarizing long logs, campaigns, or multi-agent work:
  - the estimated next reply time (usually the next sleep interval you are about to use)
  - If the run still looks healthy but there is no human-meaningful delta yet, continue monitoring silently instead of sending a no-change keepalive just because a sleep finished.
  - For baseline reproduction, main experiments, analysis experiments, and similar user-relevant long runs, translate that monitoring ETA into user-facing language such as how long until the next meaningful result or the next expected update.
- - Outside those detached experiment waits, prefer sending a concise `artifact.interact(kind='progress', ...)` once active work has crossed about 10 tool calls and there is already a human-meaningful delta, and do not let active foreground work drift beyond about 20 tool calls or about 15 minutes without a user-visible checkpoint.
+ - Outside those detached experiment waits, prefer sending a concise `artifact.interact(kind='progress', ...)` once active work has crossed about 6 tool calls and there is already a human-meaningful delta, and do not let active foreground work drift beyond about 12 tool calls or about 8 minutes without a user-visible checkpoint.
  - If you forget a bash id, do not guess. Use `bash_exec(mode='history')` or `bash_exec(mode='list')` and recover it from the reverse-chronological session list.
  - If the long-running command or wrapper code can emit structured progress markers, prefer a concise `__DS_PROGRESS__ { ... }` JSON line with fields such as:
  - `current`