npm - @researai/deepscientist - Versions diffs - 1.5.16 → 1.5.17 - Mend

@researai/deepscientist 1.5.16 → 1.5.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

package/src/deepscientist/diagnostics/runner_failures.py ADDED Viewed

@@ -0,0 +1,130 @@
+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class FailureDiagnosis:
+    """Immutable record describing one recognized runner failure pattern."""
+    # Machine-readable identifier of the failure category (snake_case in every use below).
+    code: str
+    # One-sentence statement of what went wrong.
+    problem: str
+    # Explanation of the likely cause.
+    why: str
+    # Ordered remediation hints.
+    guidance: tuple[str, ...]
+    # Presumably signals whether retrying unchanged could help; every diagnosis built in this file sets False.
+    retriable: bool
+    # Short label for the pattern that produced the diagnosis; optional.
+    matched_text: str | None = None
+# Lowercase substrings searched for in the lowercased failure text; any hit is
+# reported by diagnose_runner_failure as "runner_model_unavailable".
+_MODEL_UNAVAILABLE_MARKERS = (
+    "unknown model",
+    "invalid model",
+    "model not found",
+    "unsupported model",
+    "model is not available",
+    "not authorized to use model",
+    "you do not have access",
+    "access to model",
+    "model access",
+    "unrecognized model",
+)
+def _build_haystack(*values: object) -> str:
+    """Join the string forms of *values* with newlines, skipping blank ones.
+
+    Falsy values (``None``, ``""``, ``0``) are treated as ``""``; values whose
+    string form is empty or whitespace-only are dropped. Kept values are not
+    stripped.
+    """
+    return "\n".join(str(value or "") for value in values if str(value or "").strip())
+def _contains(text: str, marker: str) -> bool:
+    """Return True when *marker* occurs in the lowercased *text*.
+
+    Only *text* is lowercased, so *marker* must already be lowercase to match.
+    """
+    # NOTE(review): no caller is visible in this diff — confirm whether this helper is still needed.
+    return marker in text.lower()
+def diagnose_runner_failure(
+    *,
+    runner_name: str,
+    summary: str = "",
+    stderr_text: str = "",
+    output_text: str = "",
+) -> FailureDiagnosis | None:
+    """Classify runner failure text against a fixed, ordered list of known patterns.
+
+    The three text arguments are merged into one string and lowercased; the
+    checks below run in order and the first one that matches wins.
+
+    Args:
+        runner_name: Runner identifier. Only the final "invalid params" check
+            uses it, and only matches when it normalizes to ``"codex"``.
+        summary: Failure summary text.
+        stderr_text: Standard-error text of the failed run.
+        output_text: Output text of the failed run.
+
+    Returns:
+        The ``FailureDiagnosis`` for the first matching pattern, or ``None``
+        when no pattern matches.
+    """
+    haystack = _build_haystack(summary, stderr_text, output_text)
+    lower = haystack.lower()
+    normalized_runner = str(runner_name or "").strip().lower()
+    # Tool result out of sequence / stale tool call id (provider error 2013).
+    if (
+        "tool call result does not follow tool call (2013)" in lower
+        or "tool result's tool id" in lower
+    ):
+        return FailureDiagnosis(
+            code="minimax_tool_result_sequence_error",
+            problem="MiniMax rejected the tool result sequence.",
+            why=(
+                "The tool result did not immediately follow the corresponding tool call, "
+                "or the tool result referenced a tool call id that was no longer valid."
+            ),
+            guidance=(
+                "Keep each tool result immediately after its matching tool call.",
+                "Do not insert an extra assistant message between a tool call and its tool result.",
+                "For MiniMax chat-wire sessions, serialize tool use one call at a time.",
+            ),
+            retriable=False,
+            # NOTE(review): fixed label; it is reported even when only the "tool result's tool id" marker matched.
+            matched_text="2013",
+        )
+    # Tool-call arguments that could not be parsed as JSON.
+    if (
+        "invalid function arguments json string" in lower
+        or "failed to parse tool call arguments" in lower
+        or "trailing characters at line 1 column" in lower
+    ):
+        return FailureDiagnosis(
+            code="chat_wire_tool_argument_parse_error",
+            problem="The runner emitted malformed tool-call arguments.",
+            why=(
+                "The tool-call arguments were not a single valid JSON object. "
+                "This usually happens when multiple tool calls are batched into one response "
+                "or when the arguments string contains trailing characters."
+            ),
+            guidance=(
+                "Serialize tool calls one at a time instead of batching multiple MCP calls together.",
+                "Make sure each tool call emits exactly one complete JSON object for its arguments.",
+                "If this is a MiniMax chat-wire path, stay on the serialized single-tool compatibility route.",
+            ),
+            retriable=False,
+            # NOTE(review): descriptive label, not a substring that appears in the failure text.
+            matched_text="tool-call arguments",
+        )
+    # Provider credential / environment variable not present.
+    if "missing environment variable" in lower:
+        return FailureDiagnosis(
+            code="provider_env_missing",
+            problem="A required provider environment variable is missing.",
+            why="The configured model provider expects an API key or env var that was not present in the runner environment.",
+            guidance=(
+                "Set the required key in `~/DeepScientist/config/runners.yaml` under `runners.codex.env`.",
+                "If you launch from a shell, export the provider key in that same shell before starting `ds`.",
+            ),
+            retriable=False,
+            matched_text="missing environment variable",
+        )
+    # Requested model id unknown to, or not permitted for, the provider/account.
+    if any(marker in lower for marker in _MODEL_UNAVAILABLE_MARKERS):
+        return FailureDiagnosis(
+            code="runner_model_unavailable",
+            problem="The configured runner model is not available.",
+            why="The selected provider or Codex account could not access the requested model id.",
+            guidance=(
+                "Set `model: inherit` for provider-backed Codex profiles unless the provider explicitly supports the model id.",
+                "If you need a fixed model, verify that the same model works in plain `codex exec` before retrying DeepScientist.",
+            ),
+            retriable=False,
+            # NOTE(review): fixed label, not the marker from _MODEL_UNAVAILABLE_MARKERS that actually matched.
+            matched_text="model unavailable",
+        )
+    # Only the codex runner is checked here; both substrings must be present.
+    if normalized_runner == "codex" and "invalid params" in lower and "bad_request_error" in lower:
+        return FailureDiagnosis(
+            code="provider_invalid_params",
+            problem="The provider rejected the request parameters.",
+            why="The upstream provider returned a deterministic request-shape error instead of a transient transport failure.",
+            guidance=(
+                "Inspect the immediately preceding tool call / tool result sequence for protocol ordering or JSON-shape mistakes.",
+                "Do not keep retrying the same request until the request payload or provider config is corrected.",
+            ),
+            retriable=False,
+            matched_text="invalid params",
+        )
+    # No known pattern matched.
+    return None

package/src/deepscientist/doctor.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+from datetime import UTC, datetime, timedelta
 import os
 import socket
 import subprocess
@@ -13,9 +14,13 @@ from urllib.request import Request, urlopen
 from .bash_exec.shells import build_exec_shell_launch, build_terminal_shell_launch
 from .config import ConfigManager
+from .diagnostics import diagnose_runner_failure
 from .home import ensure_home_layout
 from .runtime_tools import RuntimeToolService
-from .shared import resolve_runner_binary, utc_now
+from .shared import read_json, read_jsonl_tail, resolve_runner_binary, utc_now
+# Window used by _read_runtime_failure_record: failure events older than this are ignored.
+_RUNTIME_FAILURE_LOOKBACK = timedelta(hours=24)
 def _browser_ui_url(host: str, port: int) -> str:
@@ -42,6 +47,10 @@ def _make_check(
     errors: list[str] | None = None,
     guidance: list[str] | None = None,
     details: dict[str, Any] | None = None,
+    problem: str | None = None,
+    why: str | None = None,
+    fix: list[str] | None = None,
+    evidence: list[str] | None = None,
 ) -> dict[str, Any]:
     normalized_warnings = list(warnings or [])
     normalized_errors = list(errors or [])
@@ -55,6 +64,10 @@ def _make_check(
         "errors": normalized_errors,
         "guidance": list(guidance or []),
         "details": dict(details or {}),
+        "problem": str(problem or "").strip() or None,
+        "why": str(why or "").strip() or None,
+        "fix": [str(line) for line in (fix or []) if str(line).strip()],
+        "evidence": [str(line) for line in (evidence or []) if str(line).strip()],
     }
@@ -273,6 +286,13 @@ def _check_codex(config_manager: ConfigManager) -> dict[str, Any]:
     probe_warnings = [str(value) for value in probe.get("warnings") or []]
     probe_guidance = [str(value) for value in probe.get("guidance") or []]
     summary = str(probe.get("summary") or "Codex startup probe completed.")
+    probe_details = probe.get("details") if isinstance(probe.get("details"), dict) else {}
+    diagnosis = diagnose_runner_failure(
+        runner_name="codex",
+        summary="\n".join([summary, *probe_errors]),
+        stderr_text=str(probe_details.get("stderr_excerpt") or ""),
+        output_text=str(probe_details.get("stdout_excerpt") or ""),
+    )
     if probe.get("ok"):
         return _make_check(
             check_id="codex",
@@ -284,17 +304,188 @@ def _check_codex(config_manager: ConfigManager) -> dict[str, Any]:
         )
     if not probe_guidance:
         probe_guidance = [
-            "Run `codex --login` (or `codex`) manually once and complete login, then retry `ds doctor`.",
+            "Run `codex login` (or just `codex`) manually once and complete login, then retry `ds doctor`.",
         ]
     return _make_check(
         check_id="codex",
         label="Codex CLI",
         ok=False,
-        summary=summary,
+        summary=diagnosis.problem if diagnosis is not None else summary,
         warnings=probe_warnings,
         errors=probe_errors or ["Codex startup probe did not succeed."],
         guidance=probe_guidance,
         details={"resolved_binary": resolved_binary},
+        problem=diagnosis.problem if diagnosis is not None else None,
+        why=diagnosis.why if diagnosis is not None else None,
+        fix=list(diagnosis.guidance) if diagnosis is not None else None,
+        evidence=(
+            [f"matched: {diagnosis.matched_text}"] if diagnosis is not None and diagnosis.matched_text else None
+        ),
+    )
+def _parse_timestamp(value: object) -> datetime | None:
+    """Parse an ISO-8601 timestamp into a timezone-aware UTC datetime.
+
+    Args:
+        value: Anything whose string form may be an ISO-8601 timestamp.
+
+    Returns:
+        The parsed moment converted to UTC, or ``None`` when *value* is blank
+        or cannot be parsed.
+    """
+    normalized = str(value or "").strip()
+    if not normalized:
+        return None
+    # Rewrite the "Z" (Zulu) designator as an explicit UTC offset before parsing.
+    candidate = normalized.replace("Z", "+00:00")
+    try:
+        parsed = datetime.fromisoformat(candidate)
+    except ValueError:
+        return None
+    # Timestamps without an offset are assumed to already be in UTC.
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=UTC)
+    return parsed.astimezone(UTC)
+def _read_runtime_failure_record(home: Path) -> dict[str, Any] | None:
+    """Find the most recent runtime failure event across all quests under *home*.
+
+    Each quest contributes at most one candidate (the first qualifying event
+    met while walking its event tail in reverse); the candidate with the
+    latest ``created_at`` wins.
+
+    Args:
+        home: Directory expected to contain a ``quests`` subdirectory.
+
+    Returns:
+        A dict with ``quest_id``, ``run_id``, ``event_type``, ``summary``,
+        ``created_at``, ``stderr_text``, ``output_text`` and
+        ``recent_attempts``, or ``None`` when ``quests`` is missing or no
+        qualifying event lies inside ``_RUNTIME_FAILURE_LOOKBACK``.
+    """
+    quests_root = home / "quests"
+    if not quests_root.exists():
+        return None
+    latest: dict[str, Any] | None = None
+    latest_at: datetime | None = None
+    cutoff = datetime.now(UTC) - _RUNTIME_FAILURE_LOOKBACK
+    interesting_types = {
+        "runner.turn_error",
+        "runner.turn_retry_exhausted",
+        "quest.runtime_auto_resume_suppressed",
+    }
+    for quest_root in sorted(quests_root.glob("*/")):
+        # Presumably returns up to the last 300 journal records in file order — TODO confirm against read_jsonl_tail.
+        # NOTE(review): each record is used as a dict below; confirm the helper never yields other JSON types.
+        events = read_jsonl_tail(quest_root / ".ds" / "events.jsonl", 300)
+        for event in reversed(events):
+            event_type = str(event.get("type") or "").strip()
+            if event_type not in interesting_types:
+                continue
+            created_at = _parse_timestamp(event.get("created_at"))
+            # Skip events with an unreadable timestamp or outside the lookback window.
+            if created_at is None or created_at < cutoff:
+                continue
+            run_id = str(event.get("run_id") or "").strip() or None
+            stderr_text = ""
+            output_text = ""
+            if run_id:
+                run_root = quest_root / ".ds" / "runs" / run_id
+                result_payload = read_json(run_root / "result.json", {})
+                if isinstance(result_payload, dict):
+                    stderr_text = str(result_payload.get("stderr_text") or "").strip()
+                    output_text = str(result_payload.get("output_text") or "").strip()
+                # Fall back to the raw stderr file when result.json carried no stderr text.
+                stderr_path = run_root / "stderr.txt"
+                if not stderr_text and stderr_path.exists():
+                    try:
+                        stderr_text = stderr_path.read_text(encoding="utf-8")
+                    except OSError:
+                        stderr_text = ""
+            candidate = {
+                "quest_id": quest_root.name,
+                "run_id": run_id,
+                "event_type": event_type,
+                "summary": str(event.get("summary") or "").strip(),
+                "created_at": created_at.isoformat(),
+                "stderr_text": stderr_text,
+                "output_text": output_text,
+                "recent_attempts": event.get("recent_attempts"),
+            }
+            if latest_at is None or created_at > latest_at:
+                latest = candidate
+                latest_at = created_at
+            # Only the first qualifying event of each quest is considered.
+            break
+    return latest
+def _check_recent_runtime_failures(home: Path) -> dict[str, Any]:
+    """Build the doctor check entry describing the latest quest runtime failure.
+
+    Args:
+        home: Home directory handed to ``_read_runtime_failure_record``.
+
+    Returns:
+        The dict produced by ``_make_check``. Every branch passes ``ok=True``;
+        a detected failure is reported through the warnings, guidance, problem,
+        why, fix and evidence fields instead.
+    """
+    record = _read_runtime_failure_record(home)
+    if record is None:
+        return _make_check(
+            check_id="recent_runtime_failures",
+            label="Recent runtime failures",
+            ok=True,
+            summary="No recent quest runtime failures were found.",
+        )
+    event_type = str(record.get("event_type") or "").strip()
+    quest_id = str(record.get("quest_id") or "").strip() or None
+    run_id = str(record.get("run_id") or "").strip() or None
+    summary = str(record.get("summary") or "").strip()
+    details = {
+        "quest_id": quest_id,
+        "run_id": run_id,
+        "event_type": event_type,
+        "observed_at": record.get("created_at"),
+    }
+    if event_type == "quest.runtime_auto_resume_suppressed":
+        # The value comes straight from the events journal, so coerce it
+        # defensively: a malformed entry must not abort the whole doctor run.
+        raw_attempts = record.get("recent_attempts")
+        try:
+            recent_attempts = int(raw_attempts or 0)
+        except (TypeError, ValueError):
+            # A list-shaped field is counted; anything else unreadable becomes 0.
+            recent_attempts = len(raw_attempts) if isinstance(raw_attempts, (list, tuple)) else 0
+        return _make_check(
+            check_id="recent_runtime_failures",
+            label="Recent runtime failures",
+            ok=True,
+            summary="DeepScientist recently suppressed auto-resume to avoid a crash loop.",
+            warnings=["Automatic continuation was paused after repeated recovery attempts in a short window."],
+            guidance=[
+                "Inspect the most recent failing runner path before using `/resume` again.",
+                "If the failure was a provider-side 400/protocol error, fix that request path first instead of retrying immediately.",
+            ],
+            details=details,
+            problem="Automatic crash recovery was suppressed.",
+            why="The same quest hit repeated recovery attempts in a short window, so DeepScientist parked it instead of looping forever.",
+            fix=[
+                "Open the latest failing quest logs and identify the deterministic runner/provider error.",
+                "Resume manually only after the underlying runner or provider issue is corrected.",
+            ],
+            evidence=[
+                *( [f"quest: {quest_id}"] if quest_id else [] ),
+                f"recent recovery attempts: {recent_attempts}",
+            ],
+        )
+    # The runner name is fixed to "codex" for classification.
+    diagnosis = diagnose_runner_failure(
+        runner_name="codex",
+        summary=summary,
+        stderr_text=str(record.get("stderr_text") or ""),
+        output_text=str(record.get("output_text") or ""),
+    )
+    if diagnosis is None:
+        # A failure event exists but matches no known pattern: report it generically.
+        return _make_check(
+            check_id="recent_runtime_failures",
+            label="Recent runtime failures",
+            ok=True,
+            summary="A recent quest runtime failure was found.",
+            warnings=[summary or "The latest quest run failed, but doctor could not classify it precisely yet."],
+            guidance=[
+                "Open the latest run stderr and events journal for the failing quest.",
+                "If the same failure repeats, capture the run_id and provider response text before retrying again.",
+            ],
+            details=details,
+            problem="A recent quest run failed.",
+            why="Doctor found a recent runtime failure event but could not match it to a known deterministic error pattern.",
+            fix=[
+                "Inspect the failing run's stderr and provider response text.",
+                "If the error is deterministic, avoid burning the retry budget until the request shape or config is fixed.",
+            ],
+            evidence=[
+                *( [f"quest: {quest_id}"] if quest_id else [] ),
+                *( [f"run: {run_id}"] if run_id else [] ),
+                *( [f"summary: {summary}"] if summary else [] ),
+            ],
+        )
+    # Classified failure: the diagnosis supplies summary, guidance and fix text.
+    return _make_check(
+        check_id="recent_runtime_failures",
+        label="Recent runtime failures",
+        ok=True,
+        summary=diagnosis.problem,
+        warnings=[summary] if summary and summary != diagnosis.problem else [],
+        guidance=list(diagnosis.guidance),
+        details=details,
+        problem=diagnosis.problem,
+        why=diagnosis.why,
+        fix=list(diagnosis.guidance),
+        evidence=[
+            *( [f"quest: {quest_id}"] if quest_id else [] ),
+            *( [f"run: {run_id}"] if run_id else [] ),
+            *( [f"matched: {diagnosis.matched_text}"] if diagnosis.matched_text else [] ),
+        ],
     )
@@ -491,6 +682,7 @@ def run_doctor(home: Path, *, repo_root: Path) -> dict[str, Any]:
         _check_config_validation(config_manager),
         _check_runner_support(config_manager),
         _check_codex(config_manager),
+        _check_recent_runtime_failures(home),
         _check_latex_runtime(home),
         _check_bundles(repo_root),
         _check_ui_port(home, config_manager),
@@ -519,6 +711,18 @@ def render_doctor_report(report: dict[str, Any]) -> str:
         status = str(item.get("status") or "ok").upper()
         icon = {"OK": "[ok]", "WARN": "[warn]", "ERROR": "[fail]"}.get(status, "[info]")
         lines.append(f"{icon} {item.get('label')}: {item.get('summary')}")
+        problem = str(item.get("problem") or "").strip()
+        why = str(item.get("why") or "").strip()
+        fix_lines = [str(line) for line in item.get("fix") or [] if str(line).strip()]
+        evidence_lines = [str(line) for line in item.get("evidence") or [] if str(line).strip()]
+        if problem:
+            lines.append(f"  problem: {problem}")
+        if why:
+            lines.append(f"  why: {why}")
+        for line in fix_lines:
+            lines.append(f"  fix: {line}")
+        for line in evidence_lines:
+            lines.append(f"  evidence: {line}")
         for warning in item.get("warnings") or []:
             lines.append(f"  warning: {warning}")
         for error in item.get("errors") or []:

package/src/deepscientist/prompts/builder.py CHANGED Viewed

@@ -1192,6 +1192,12 @@ class PromptBuilder:
                 "- collaboration_mode: user-directed copilot",
                 "- freeform_task_rule: if the user asks for a concrete research task, solve that task directly before introducing stage-routing language.",
                 "- requested_skill_hint_rule: in copilot mode, treat `requested_skill` as a lightweight routing hint, not as an instruction to default into `decision` for ordinary direct tasks.",
+                "- turn_self_routing_rule: before substantial work, classify the current turn as `direct_answer`, `direct_action`, `stage_continue`, or `route_decision`.",
+                "- direct_answer_rule: if the user mainly wants an answer or clarification, answer with the narrowest sufficient context and avoid reading large stage state unless needed.",
+                "- direct_action_rule: if the user mainly wants one concrete task, execute the smallest useful unit first and do not expand into background research continuation in the same turn unless the user asked for it.",
+                "- stage_continue_rule: if the user mainly wants the quest to keep moving, continue from the active durable stage state after acknowledging the request.",
+                "- route_decision_rule: switch into `decision`-style reasoning only when safe continuation depends on a real route, scope, cost, branch, or scientific-direction judgment.",
+                "- decision_skill_escalation_rule: if a turn upgrades into `route_decision`, explicitly read the `decision` skill before substantial route-changing work.",
                 "- response_pattern: say what changed -> say what it means -> say what happens next",
                 "- mailbox_protocol: artifact.interact(include_recent_inbound_messages=True) remains the queued human-message mailbox and should be checked whenever human continuity matters.",
                 "- planning_rule: before non-trivial execution, make the immediate plan explicit and keep the first step small.",
@@ -1201,13 +1207,18 @@ class PromptBuilder:
                 "- git_tool_mandate: for git work inside the current quest repository or worktree, prefer `artifact.git(...)` before raw shell git commands.",
                 "- git_test_rule: if the user wants a generic git smoke test rather than a quest-repo mutation, use `bash_exec(...)` in an isolated scratch repository.",
                 "- decision_entry_rule: use `decision` only for real route, scope, cost, branch, or scientific-direction judgments; do not default to it for ordinary repo, code, environment, or execution tasks.",
+                "- micro_task_stop_rule: after finishing a `direct_answer` or `direct_action` turn, report the result plainly and wait instead of auto-continuing.",
                 "- stop_rule: once the current requested unit is done, send a concise update and wait for the next message or `/resume`.",
                 "- escalation_rule: if a route change materially affects cost, scope, or scientific direction, ask before proceeding.",
             ]
             if chinese_turn:
-                lines.append("- tone_hint: 使用自然、礼貌、专业的中文，先解释结论，再说明下一步。")
+                lines.append(
+                    "- tone_hint: 使用自然、礼貌、专业、带一点活泼感的中文；像靠谱又主动汇报进展的研究搭子，不要冷冰冰或官话腔；对真实好消息可自然用“都搞定啦”“有结果了”这种轻微庆祝开头，但下一句要立刻说清具体结果。"
+                )
             else:
-                lines.append("- tone_hint: use concise, natural, professional English and lead with the conclusion.")
+                lines.append(
+                    "- tone_hint: use concise, natural, warm English, lead with the conclusion, and avoid sounding cold, bureaucratic, or log-like."
+                )
             return "\n".join(lines)
         bound_conversations = snapshot.get("bound_conversations") or []
         need_research_paper = self._need_research_paper(snapshot)
@@ -1224,6 +1235,12 @@ class PromptBuilder:
             f"- standard_profile: {standard_profile if launch_mode == 'standard' else 'n/a'}",
             f"- custom_profile: {custom_profile if launch_mode == 'custom' else 'n/a'}",
             "- collaboration_mode: long-horizon, continuity-first, artifact-aware",
+            "- user_turn_self_routing_rule: on a fresh user message, first classify the turn as `direct_answer`, `direct_action`, `stage_continue`, or `route_decision` before reading additional skills or large quest context.",
+            "- direct_answer_rule: if the user mainly wants an answer or clarification, answer with the narrowest sufficient context and avoid reading large stage state unless needed.",
+            "- direct_action_rule: if the user mainly wants one concrete task, execute the smallest useful unit first and do not silently expand into broader autonomous continuation in the same turn unless the user asked for it.",
+            "- stage_continue_rule: if the user is clearly asking to continue quest progress, resume from the active durable stage state.",
+            "- route_decision_rule: open `decision`-style reasoning only when safe continuation genuinely depends on a real route, scope, cost, branch, or scientific-direction judgment.",
+            "- decision_skill_escalation_rule: if a fresh user-message turn upgrades into `route_decision`, explicitly read the `decision` skill before substantial route-changing work.",
             "- response_pattern: say what changed -> say what it means -> say what happens next",
             "- interaction_protocol: first message may be plain conversation; after that, treat artifact.interact threads and mailbox polls as the main continuity spine across TUI, web, and connectors",
             "- shared_interaction_contract_precedence: use the shared interaction contract as the default user-facing cadence; the rules below add runtime-specific execution behavior instead of restating the same chat cadence",
@@ -1251,6 +1268,7 @@ class PromptBuilder:
             "- example_and_numbers_protocol: when it materially improves understanding, include one short example or 1 to 3 key numbers or comparisons instead of relying only on vague adjectives such as better, slower, or more stable.",
             "- omission_protocol: for ordinary user-facing updates, omit file paths, file names, artifact ids, branch/worktree ids, session ids, raw commands, raw logs, and internal tool names unless the user asked for them or needs them to act",
             "- compaction_protocol: ordinary artifact.interact progress updates should usually fit in 2 to 4 short sentences and should not read like a monitoring transcript or execution diary",
+            "- micro_task_stop_rule: after a fresh user-message turn that was only `direct_answer` or `direct_action`, finish that unit and do not silently turn the same turn into a broader autonomous stage pass unless the user asked for it.",
             "- watchdog_payload_protocol: if a tool result includes `watchdog_notes`, `progress_watchdog_note`, `visibility_watchdog_note`, or `state_change_watchdog_note`, treat that as an action item to inspect state and decide whether a fresh user-visible update is actually needed; do not emit duplicate progress by reflex",
             "- human_progress_shape_protocol: ordinary progress updates should usually make three things explicit in human language: the current task, the main difficulty or latest real progress, and the concrete next measure you will take",
             "- stage_contract_protocol: stage-specific plan/checklist rules, milestone rules, literature rules, and writing rules belong in the requested skill; do not expect this runtime block to restate them",
@@ -1292,14 +1310,14 @@ class PromptBuilder:
         if chinese_turn:
             lines.extend(
                 [
-                    "- tone_hint: 使用自然、礼貌、专业、偏正式的中文；必要时可自然称呼用户为“老师”，但不要每句重复；避免机械模板腔。",
+                    "- tone_hint: 使用自然、礼貌、专业、带一点活泼感的中文；必要时可自然称呼用户为“老师”，但不要每句重复；像靠谱又主动汇报进展的研究搭子，避免冷冰冰、官话化、机械模板腔；对真实好消息可自然用“都搞定啦”“有结果了”这种轻微庆祝开头，但下一句要立刻说清结果。",
                     "- connector_reply_hint: 在聊天面里优先简明说明当前状态、下一步动作、预计回传内容。",
                 ]
             )
         else:
             lines.extend(
                 [
-                    "- tone_hint: use a polite, professional, gentlemanly English tone.",
+                    "- tone_hint: use a polite, professional, warm English tone; avoid sounding cold, bureaucratic, or like a monitoring log.",
                     "- connector_reply_hint: keep chat replies concise but operational, with explicit next steps and evidence targets.",
                 ]
             )