@researai/deepscientist 1.5.16 → 1.5.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. package/README.md +66 -23
  2. package/bin/ds.js +550 -19
  3. package/docs/en/00_QUICK_START.md +65 -5
  4. package/docs/en/01_SETTINGS_REFERENCE.md +1 -1
  5. package/docs/en/09_DOCTOR.md +14 -3
  6. package/docs/en/15_CODEX_PROVIDER_SETUP.md +12 -3
  7. package/docs/en/21_LOCAL_MODEL_BACKENDS_GUIDE.md +283 -0
  8. package/docs/en/91_DEVELOPMENT.md +237 -0
  9. package/docs/en/README.md +7 -3
  10. package/docs/zh/00_QUICK_START.md +54 -5
  11. package/docs/zh/01_SETTINGS_REFERENCE.md +1 -1
  12. package/docs/zh/09_DOCTOR.md +15 -4
  13. package/docs/zh/15_CODEX_PROVIDER_SETUP.md +12 -3
  14. package/docs/zh/21_LOCAL_MODEL_BACKENDS_GUIDE.md +281 -0
  15. package/docs/zh/README.md +7 -3
  16. package/install.sh +46 -4
  17. package/package.json +2 -1
  18. package/pyproject.toml +1 -1
  19. package/src/deepscientist/__init__.py +1 -1
  20. package/src/deepscientist/bridges/connectors.py +8 -2
  21. package/src/deepscientist/codex_cli_compat.py +185 -72
  22. package/src/deepscientist/config/service.py +154 -6
  23. package/src/deepscientist/daemon/api/handlers.py +130 -25
  24. package/src/deepscientist/daemon/api/router.py +5 -0
  25. package/src/deepscientist/daemon/app.py +446 -22
  26. package/src/deepscientist/diagnostics/__init__.py +6 -0
  27. package/src/deepscientist/diagnostics/runner_failures.py +130 -0
  28. package/src/deepscientist/doctor.py +207 -3
  29. package/src/deepscientist/prompts/builder.py +22 -4
  30. package/src/deepscientist/quest/service.py +413 -13
  31. package/src/deepscientist/runners/codex.py +59 -14
  32. package/src/deepscientist/shared.py +19 -0
  33. package/src/prompts/contracts/shared_interaction.md +3 -2
  34. package/src/prompts/system.md +13 -0
  35. package/src/prompts/system_copilot.md +13 -0
  36. package/src/tui/package.json +1 -1
  37. package/src/ui/dist/assets/{AiManusChatView-COFACy7V.js → AiManusChatView-Bv-Z8YpU.js} +44 -44
  38. package/src/ui/dist/assets/{AnalysisPlugin-DnSm0GZn.js → AnalysisPlugin-BCKAfjba.js} +1 -1
  39. package/src/ui/dist/assets/{CliPlugin-CvwCmDQ5.js → CliPlugin-BCKcpc35.js} +4 -4
  40. package/src/ui/dist/assets/{CodeEditorPlugin-cOqSa0xq.js → CodeEditorPlugin-DbOfSJ8K.js} +1 -1
  41. package/src/ui/dist/assets/{CodeViewerPlugin-itb0tltR.js → CodeViewerPlugin-CbaFRrUU.js} +3 -3
  42. package/src/ui/dist/assets/{DocViewerPlugin-DqKkiCI6.js → DocViewerPlugin-DAjLVeQD.js} +3 -3
  43. package/src/ui/dist/assets/{GitCommitViewerPlugin-DVgNHBCS.js → GitCommitViewerPlugin-CIUqbUDO.js} +1 -1
  44. package/src/ui/dist/assets/{GitDiffViewerPlugin-DxL2ezFG.js → GitDiffViewerPlugin-CQACjoAA.js} +1 -1
  45. package/src/ui/dist/assets/{GitSnapshotViewer-B_RQm1YZ.js → GitSnapshotViewer-0r4nLPke.js} +1 -1
  46. package/src/ui/dist/assets/{ImageViewerPlugin-tHqlXY3n.js → ImageViewerPlugin-nBOmI2v_.js} +3 -3
  47. package/src/ui/dist/assets/{LabCopilotPanel-ClMbq5Yu.js → LabCopilotPanel-BHxOxF4z.js} +1 -1
  48. package/src/ui/dist/assets/{LabPlugin-L_SuE8ow.js → LabPlugin-BKoZGs95.js} +1 -1
  49. package/src/ui/dist/assets/{LatexPlugin-B495DTXC.js → LatexPlugin-ZwtV8pIp.js} +1 -1
  50. package/src/ui/dist/assets/{MarkdownViewerPlugin-DG28-61B.js → MarkdownViewerPlugin-DKqVfKyW.js} +3 -3
  51. package/src/ui/dist/assets/{MarketplacePlugin-BiOGT-Kj.js → MarketplacePlugin-BwxStZ9D.js} +1 -1
  52. package/src/ui/dist/assets/{NotebookEditor-C-4Kt1p9.js → NotebookEditor-BEQhaQbt.js} +1 -1
  53. package/src/ui/dist/assets/{NotebookEditor-CVsj8h_T.js → NotebookEditor-DB9N_T9q.js} +23 -23
  54. package/src/ui/dist/assets/{PdfLoader-CASDQmxJ.js → PdfLoader-eWBONbQP.js} +1 -1
  55. package/src/ui/dist/assets/{PdfMarkdownPlugin-BFhwoKsY.js → PdfMarkdownPlugin-D22YOZL3.js} +1 -1
  56. package/src/ui/dist/assets/{PdfViewerPlugin-DcOzU9vd.js → PdfViewerPlugin-c-RK9DLM.js} +3 -3
  57. package/src/ui/dist/assets/{SearchPlugin-CHj7M58O.js → SearchPlugin-CxF9ytAx.js} +1 -1
  58. package/src/ui/dist/assets/{TextViewerPlugin-CB4DYfWO.js → TextViewerPlugin-C5xqeeUH.js} +2 -2
  59. package/src/ui/dist/assets/{VNCViewer-CjlbyCB3.js → VNCViewer-BoLGLnHz.js} +1 -1
  60. package/src/ui/dist/assets/{bot-CFkZY-JP.js → bot-DREQOxzP.js} +1 -1
  61. package/src/ui/dist/assets/{chevron-up-Dq5ofbht.js → chevron-up-C9Qpx4DE.js} +1 -1
  62. package/src/ui/dist/assets/{code-DLC6G24T.js → code-WlFHE7z_.js} +1 -1
  63. package/src/ui/dist/assets/{file-content-Dv4LoZec.js → file-content-BZMz3RYp.js} +1 -1
  64. package/src/ui/dist/assets/{file-diff-panel-Denq-lC3.js → file-diff-panel-CQhw0jS2.js} +1 -1
  65. package/src/ui/dist/assets/{file-socket-Cu4Qln7Y.js → file-socket-CfQPKQKj.js} +1 -1
  66. package/src/ui/dist/assets/{git-commit-horizontal-BUh6G52n.js → git-commit-horizontal-DxZ8DCZh.js} +1 -1
  67. package/src/ui/dist/assets/{image-B9HUUddG.js → image-Bgl4VIyx.js} +1 -1
  68. package/src/ui/dist/assets/{index-Cgla8biy.css → index-BpV6lusQ.css} +1 -1
  69. package/src/ui/dist/assets/{index-Gbl53BNp.js → index-CBNVuWcP.js} +363 -363
  70. package/src/ui/dist/assets/{index-wQ7RIIRd.js → index-CwNu1aH4.js} +1 -1
  71. package/src/ui/dist/assets/{index-B2B1sg-M.js → index-DrUnlf6K.js} +1 -1
  72. package/src/ui/dist/assets/{index-DRyx7vAc.js → index-NW-h8VzN.js} +1 -1
  73. package/src/ui/dist/assets/{pdf-effect-queue-ZtnHFCAi.js → pdf-effect-queue-J8OnM0jE.js} +1 -1
  74. package/src/ui/dist/assets/{popover-DL6h35vr.js → popover-CLc0pPP8.js} +1 -1
  75. package/src/ui/dist/assets/{project-sync-CsX08Qno.js → project-sync-C9IdzdZW.js} +1 -1
  76. package/src/ui/dist/assets/{select-DvmXt1yY.js → select-Cs2PmzwL.js} +1 -1
  77. package/src/ui/dist/assets/{sigma-7jpXazui.js → sigma-ClKcHAXm.js} +1 -1
  78. package/src/ui/dist/assets/{trash-xA7kFt8i.js → trash-DwpbFr3w.js} +1 -1
  79. package/src/ui/dist/assets/{useCliAccess-DsMwDjOp.js → useCliAccess-NQ8m0Let.js} +1 -1
  80. package/src/ui/dist/assets/{wrap-text-CwMn-iqb.js → wrap-text-BC-Hltpd.js} +1 -1
  81. package/src/ui/dist/assets/{zoom-out-R-GWEhzS.js → zoom-out-E_gaeAxL.js} +1 -1
  82. package/src/ui/dist/index.html +2 -2
@@ -55,6 +55,7 @@ from ..connector.connector_profiles import (
55
55
  from ..connector_runtime import conversation_identity_key, format_conversation_id, normalize_conversation_id, parse_conversation_id
56
56
  from ..config import ConfigManager
57
57
  from ..config.models import SYSTEM_CONNECTOR_NAMES
58
+ from ..diagnostics import FailureDiagnosis, diagnose_runner_failure
58
59
  from ..home import repo_root
59
60
  from ..memory import MemoryService
60
61
  from ..network import urlopen_with_proxy as urlopen
@@ -105,6 +106,8 @@ TERMINAL_STREAM_IDLE_SLEEP_SECONDS = 0.02
105
106
  _AUTO_CONTINUE_DELAY_SECONDS = 240.0
106
107
  _AUTO_CONTINUE_ACTIVE_WORK_DELAY_SECONDS = 0.2
107
108
  _TERMINAL_PREWARM_DEBOUNCE_SECONDS = 20.0
109
+ _STALLED_RUNNING_TURN_INACTIVITY_SECONDS = 30 * 60
110
+ _STALLED_RUNNING_TURN_INTERRUPT_TIMEOUT_SECONDS = 5.0
108
111
  CODEX_RETRY_DEFAULT_MAX_ATTEMPTS = 5
109
112
  CODEX_RETRY_DEFAULT_INITIAL_BACKOFF_SEC = 10.0
110
113
  CODEX_RETRY_DEFAULT_BACKOFF_MULTIPLIER = 6.0
@@ -755,7 +758,23 @@ class DaemonApp:
755
758
  if int(snapshot.get("pending_user_message_count") or 0) > 0
756
759
  else "auto_continue"
757
760
  )
758
- scheduled = self.schedule_turn(quest_id, reason=reason)
761
+ retry_delay_seconds = self._recovery_retry_delay_seconds(snapshot) if reason == "auto_continue" else None
762
+ if retry_delay_seconds is not None and retry_delay_seconds > 0:
763
+ self._schedule_turn_later(
764
+ quest_id,
765
+ reason=reason,
766
+ delay_seconds=retry_delay_seconds,
767
+ )
768
+ scheduled = {
769
+ "scheduled": True,
770
+ "started": False,
771
+ "queued": True,
772
+ "reason": reason,
773
+ "delayed": True,
774
+ "delay_seconds": retry_delay_seconds,
775
+ }
776
+ else:
777
+ scheduled = self.schedule_turn(quest_id, reason=reason)
759
778
  event = {
760
779
  "event_id": generate_id("evt"),
761
780
  "type": "quest.runtime_auto_resumed",
@@ -767,6 +786,8 @@ class DaemonApp:
767
786
  "scheduled": bool(scheduled.get("scheduled")),
768
787
  "started": bool(scheduled.get("started")),
769
788
  "queued": bool(scheduled.get("queued")),
789
+ "delayed": bool(scheduled.get("delayed")),
790
+ "delay_seconds": scheduled.get("delay_seconds"),
770
791
  "created_at": utc_now(),
771
792
  }
772
793
  append_jsonl(self.home / "quests" / quest_id / ".ds" / "events.jsonl", event)
@@ -781,6 +802,8 @@ class DaemonApp:
781
802
  scheduled=bool(scheduled.get("scheduled")),
782
803
  started=bool(scheduled.get("started")),
783
804
  queued=bool(scheduled.get("queued")),
805
+ delayed=bool(scheduled.get("delayed")),
806
+ delay_seconds=scheduled.get("delay_seconds"),
784
807
  )
785
808
  self._recovered_quest_ids.add(quest_id)
786
809
  resumed.append(
@@ -834,6 +857,63 @@ class DaemonApp:
834
857
  count += 1
835
858
  return count
836
859
 
860
+ def _recovery_retry_delay_seconds(self, snapshot: dict[str, Any]) -> float | None:
861
+ retry_state = snapshot.get("retry_state") if isinstance(snapshot.get("retry_state"), dict) else None
862
+ if not retry_state:
863
+ return None
864
+ next_retry_at = str(retry_state.get("next_retry_at") or "").strip()
865
+ if not next_retry_at:
866
+ return None
867
+ parsed = self._parse_event_timestamp(next_retry_at)
868
+ if parsed is None:
869
+ return None
870
+ return max((parsed - datetime.now(UTC)).total_seconds(), 0.0)
871
+
872
+ def _resume_retry_state(
873
+ self,
874
+ snapshot: dict[str, Any],
875
+ *,
876
+ max_attempts: int,
877
+ ) -> tuple[int, str | None, dict[str, Any] | None]:
878
+ retry_state = snapshot.get("retry_state") if isinstance(snapshot.get("retry_state"), dict) else None
879
+ resume_source = str(snapshot.get("last_resume_source") or "").strip()
880
+ if not retry_state or not resume_source.startswith("auto:daemon-recovery"):
881
+ return 1, None, None
882
+
883
+ try:
884
+ recorded_attempt = int(retry_state.get("attempt_index") or 0)
885
+ except (TypeError, ValueError):
886
+ recorded_attempt = 0
887
+ if recorded_attempt <= 0:
888
+ return 1, None, None
889
+
890
+ next_retry_at = str(retry_state.get("next_retry_at") or "").strip()
891
+ start_attempt = recorded_attempt + 1 if next_retry_at else recorded_attempt
892
+ if start_attempt > max_attempts:
893
+ start_attempt = max_attempts
894
+ if start_attempt <= 1:
895
+ return 1, None, None
896
+
897
+ turn_id = str(retry_state.get("turn_id") or "").strip() or None
898
+ previous_run_id = str(retry_state.get("last_run_id") or "").strip() or None
899
+ failure_summary = str(retry_state.get("last_error") or "").strip() or None
900
+ retry_context = {
901
+ "turn_id": turn_id,
902
+ "attempt_index": recorded_attempt,
903
+ "max_attempts": max_attempts,
904
+ "previous_run_id": previous_run_id,
905
+ "failure_kind": "daemon_recovery",
906
+ "failure_summary": failure_summary or "Recovered retry state after daemon restart.",
907
+ "previous_exit_code": None,
908
+ "previous_output_text": "",
909
+ "stderr_tail": "",
910
+ "recent_messages": [],
911
+ "tool_progress": [],
912
+ "workspace_summary": {},
913
+ "recent_artifacts": [],
914
+ }
915
+ return start_attempt, turn_id, retry_context
916
+
837
917
  def _record_auto_resume_suppressed(
838
918
  self,
839
919
  *,
@@ -1543,7 +1623,13 @@ class DaemonApp:
1543
1623
  )
1544
1624
  turn_state = self._refresh_turn_worker_state(quest_id)
1545
1625
  has_live_turn = bool(turn_state.get("running"))
1546
- if runtime_status == "running" and has_live_turn:
1626
+ stalled_details = self._stalled_running_turn_details(
1627
+ quest_id,
1628
+ snapshot=snapshot,
1629
+ turn_state=turn_state,
1630
+ turn_reason="user_message",
1631
+ )
1632
+ if runtime_status == "running" and has_live_turn and stalled_details is None:
1547
1633
  scheduled = {
1548
1634
  "scheduled": True,
1549
1635
  "started": False,
@@ -1712,18 +1798,30 @@ class DaemonApp:
1712
1798
  return snapshot
1713
1799
 
1714
1800
  def schedule_turn(self, quest_id: str, *, reason: str = "user_message") -> dict:
1801
+ snapshot = self.quest_service.snapshot(quest_id)
1802
+ snapshot = self._reconcile_stale_active_turn(quest_id, snapshot=snapshot)
1803
+ recovery = self._recover_stalled_running_turn(quest_id, snapshot=snapshot, turn_reason=reason)
1804
+ snapshot = dict(recovery.get("snapshot") or snapshot)
1805
+ if recovery.get("blocked"):
1806
+ return {
1807
+ "scheduled": True,
1808
+ "started": False,
1809
+ "queued": True,
1810
+ "reason": "stalled_turn_recovery_pending",
1811
+ }
1715
1812
  self._refresh_turn_worker_state(quest_id)
1716
1813
  with self._turn_lock:
1717
1814
  state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1718
1815
  state["pending"] = True
1719
1816
  state["stop_requested"] = False
1817
+ state.pop("recovery_pending", None)
1720
1818
  state["reason"] = reason
1721
1819
  if state.get("running"):
1722
1820
  return {
1723
1821
  "scheduled": True,
1724
1822
  "started": False,
1725
1823
  "queued": True,
1726
- "reason": reason,
1824
+ "reason": "queued_for_artifact_interact" if reason == "user_message" else reason,
1727
1825
  }
1728
1826
  state["running"] = True
1729
1827
  worker = threading.Thread(
@@ -1804,6 +1902,110 @@ class DaemonApp:
1804
1902
  state.pop("worker", None)
1805
1903
  return dict(state)
1806
1904
 
1905
+ def _wait_for_turn_worker_exit(self, quest_id: str, *, timeout_seconds: float) -> dict[str, object]:
1906
+ deadline = time.monotonic() + max(0.0, float(timeout_seconds))
1907
+ state = self._refresh_turn_worker_state(quest_id)
1908
+ while state.get("running") and time.monotonic() < deadline:
1909
+ time.sleep(0.05)
1910
+ state = self._refresh_turn_worker_state(quest_id)
1911
+ return state
1912
+
1913
+ def _ensure_recovery_resume_watch(self, quest_id: str, *, turn_reason: str) -> None:
1914
+ with self._turn_lock:
1915
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1916
+ if state.get("recovery_watch_active"):
1917
+ return
1918
+ state["recovery_watch_active"] = True
1919
+ watcher = threading.Thread(
1920
+ target=self._wait_and_resume_recovered_turn,
1921
+ args=(quest_id, turn_reason),
1922
+ daemon=True,
1923
+ name=f"deepscientist-recovery-watch-{quest_id}",
1924
+ )
1925
+ watcher.start()
1926
+
1927
+ def _wait_and_resume_recovered_turn(self, quest_id: str, turn_reason: str) -> None:
1928
+ try:
1929
+ while True:
1930
+ state = self._refresh_turn_worker_state(quest_id)
1931
+ if not state.get("recovery_pending"):
1932
+ return
1933
+ if not state.get("running"):
1934
+ break
1935
+ time.sleep(0.1)
1936
+
1937
+ snapshot = self.quest_service.snapshot(quest_id)
1938
+ runtime_status = str(snapshot.get("runtime_status") or snapshot.get("status") or "").strip().lower()
1939
+ if runtime_status in {"paused", "stopped", "completed", "error"}:
1940
+ with self._turn_lock:
1941
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1942
+ state.pop("recovery_pending", None)
1943
+ state["stop_requested"] = runtime_status in {"paused", "stopped"}
1944
+ return
1945
+ pending_user_count = int(snapshot.get("pending_user_message_count") or 0)
1946
+ if pending_user_count > 0:
1947
+ with self._turn_lock:
1948
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1949
+ state.pop("recovery_pending", None)
1950
+ self.schedule_turn(quest_id, reason=turn_reason)
1951
+ return
1952
+
1953
+ with self._turn_lock:
1954
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1955
+ state.pop("recovery_pending", None)
1956
+ state["stop_requested"] = False
1957
+ except Exception as exc:
1958
+ self.logger.log(
1959
+ "warning",
1960
+ "quest.turn_state_recovery_watch_failed",
1961
+ quest_id=quest_id,
1962
+ reason=turn_reason,
1963
+ error=str(exc),
1964
+ )
1965
+ finally:
1966
+ with self._turn_lock:
1967
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
1968
+ state.pop("recovery_watch_active", None)
1969
+
1970
+ def _stalled_running_turn_details(
1971
+ self,
1972
+ quest_id: str,
1973
+ *,
1974
+ snapshot: dict | None = None,
1975
+ turn_state: dict[str, object] | None = None,
1976
+ turn_reason: str,
1977
+ ) -> dict[str, int] | None:
1978
+ if str(turn_reason or "").strip() not in {"user_message", "queued_user_messages"}:
1979
+ return None
1980
+ snapshot = dict(snapshot or self.quest_service.snapshot(quest_id))
1981
+ runtime_status = str(snapshot.get("runtime_status") or snapshot.get("status") or "").strip().lower()
1982
+ active_run_id = str(snapshot.get("active_run_id") or "").strip()
1983
+ if runtime_status != "running" or not active_run_id:
1984
+ return None
1985
+ state = dict(turn_state or self._refresh_turn_worker_state(quest_id))
1986
+ if not state.get("running"):
1987
+ return None
1988
+ pending_user_count = int(snapshot.get("pending_user_message_count") or 0)
1989
+ if pending_user_count <= 0:
1990
+ return None
1991
+ counts = snapshot.get("counts") if isinstance(snapshot.get("counts"), dict) else {}
1992
+ if int(counts.get("bash_running_count") or 0) > 0:
1993
+ return None
1994
+ silent_seconds = snapshot.get("seconds_since_last_tool_activity")
1995
+ if silent_seconds is None:
1996
+ watchdog = snapshot.get("interaction_watchdog") if isinstance(snapshot.get("interaction_watchdog"), dict) else {}
1997
+ silent_seconds = watchdog.get("seconds_since_last_tool_activity")
1998
+ try:
1999
+ silent_seconds_int = int(silent_seconds or 0)
2000
+ except (TypeError, ValueError):
2001
+ return None
2002
+ if silent_seconds_int < _STALLED_RUNNING_TURN_INACTIVITY_SECONDS:
2003
+ return None
2004
+ return {
2005
+ "pending_user_count": pending_user_count,
2006
+ "silent_seconds": silent_seconds_int,
2007
+ }
2008
+
1807
2009
  def _reconcile_stale_active_turn(self, quest_id: str, *, snapshot: dict | None = None) -> dict:
1808
2010
  snapshot = dict(snapshot or self.quest_service.snapshot(quest_id))
1809
2011
  active_run_id = str(snapshot.get("active_run_id") or "").strip()
@@ -1855,6 +2057,139 @@ class DaemonApp:
1855
2057
  )
1856
2058
  return self.quest_service.mark_turn_finished(quest_id, status=normalized_status)
1857
2059
 
2060
+ def _recover_stalled_running_turn(
2061
+ self,
2062
+ quest_id: str,
2063
+ *,
2064
+ snapshot: dict | None = None,
2065
+ turn_reason: str,
2066
+ ) -> dict[str, object]:
2067
+ snapshot = dict(snapshot or self.quest_service.snapshot(quest_id))
2068
+ turn_state = self._refresh_turn_worker_state(quest_id)
2069
+ with self._turn_lock:
2070
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
2071
+ if state.get("recovery_pending"):
2072
+ return {
2073
+ "snapshot": snapshot,
2074
+ "blocked": True,
2075
+ }
2076
+ details = self._stalled_running_turn_details(
2077
+ quest_id,
2078
+ snapshot=snapshot,
2079
+ turn_state=turn_state,
2080
+ turn_reason=turn_reason,
2081
+ )
2082
+ if details is None:
2083
+ return {
2084
+ "snapshot": snapshot,
2085
+ "blocked": False,
2086
+ }
2087
+
2088
+ active_run_id = str(snapshot.get("active_run_id") or "").strip()
2089
+ runner_name = self._runner_name_for(snapshot)
2090
+ with self._turn_lock:
2091
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
2092
+ if state.get("recovery_pending"):
2093
+ return {
2094
+ "snapshot": snapshot,
2095
+ "blocked": True,
2096
+ }
2097
+ state["pending"] = False
2098
+ state["stop_requested"] = True
2099
+ state["recovery_pending"] = True
2100
+ interrupted = False
2101
+ try:
2102
+ try:
2103
+ runner = self.get_runner(runner_name)
2104
+ except KeyError:
2105
+ runner = None
2106
+ if runner is not None and hasattr(runner, "interrupt"):
2107
+ interrupted = bool(getattr(runner, "interrupt")(quest_id))
2108
+ stopped_bash_session_ids = self._stop_active_bash_exec_sessions(
2109
+ quest_id,
2110
+ run_id=active_run_id or None,
2111
+ reason="stalled_turn_recovery",
2112
+ user_id="auto:stalled-turn-recovery",
2113
+ )
2114
+ turn_state = self._wait_for_turn_worker_exit(
2115
+ quest_id,
2116
+ timeout_seconds=_STALLED_RUNNING_TURN_INTERRUPT_TIMEOUT_SECONDS,
2117
+ )
2118
+ if turn_state.get("running"):
2119
+ self._ensure_recovery_resume_watch(quest_id, turn_reason="queued_user_messages")
2120
+ self.logger.log(
2121
+ "warning",
2122
+ "quest.turn_state_recovery_pending",
2123
+ quest_id=quest_id,
2124
+ abandoned_run_id=active_run_id or None,
2125
+ reason=turn_reason,
2126
+ silent_seconds=int(details.get("silent_seconds") or 0),
2127
+ pending_user_message_count=int(details.get("pending_user_count") or 0),
2128
+ interrupted=interrupted,
2129
+ )
2130
+ return {
2131
+ "snapshot": snapshot,
2132
+ "blocked": True,
2133
+ }
2134
+
2135
+ previous_status = (
2136
+ str(snapshot.get("runtime_status") or snapshot.get("status") or snapshot.get("display_status") or "running").strip()
2137
+ or "running"
2138
+ )
2139
+ normalized_status = "active" if previous_status == "running" else previous_status
2140
+ summary = (
2141
+ f"Recovered stalled running turn `{active_run_id}` after "
2142
+ f"{int(details.get('silent_seconds') or 0)} seconds without tool activity while "
2143
+ f"{int(details.get('pending_user_count') or 0)} queued user message(s) were waiting."
2144
+ )
2145
+ if interrupted:
2146
+ summary = f"{summary} The active runner process was interrupted."
2147
+ if stopped_bash_session_ids:
2148
+ summary = f"{summary} Stopped {len(stopped_bash_session_ids)} bash_exec session(s)."
2149
+ quest_root = self.quest_service._quest_root(quest_id)
2150
+ append_jsonl(
2151
+ quest_root / ".ds" / "events.jsonl",
2152
+ {
2153
+ "event_id": generate_id("evt"),
2154
+ "type": "quest.turn_state_reconciled",
2155
+ "quest_id": quest_id,
2156
+ "abandoned_run_id": active_run_id or None,
2157
+ "previous_status": previous_status,
2158
+ "status": normalized_status,
2159
+ "completed_at": None,
2160
+ "exit_code": None,
2161
+ "summary": summary,
2162
+ "recovery_kind": "stalled_live_turn",
2163
+ "interrupted": interrupted,
2164
+ "stopped_bash_session_ids": stopped_bash_session_ids,
2165
+ "created_at": utc_now(),
2166
+ },
2167
+ )
2168
+ self.logger.log(
2169
+ "warning",
2170
+ "quest.turn_state_reconciled",
2171
+ quest_id=quest_id,
2172
+ abandoned_run_id=active_run_id or None,
2173
+ previous_status=previous_status,
2174
+ status=normalized_status,
2175
+ recovery_kind="stalled_live_turn",
2176
+ interrupted=interrupted,
2177
+ stopped_bash_session_count=len(stopped_bash_session_ids),
2178
+ )
2179
+ snapshot = self.quest_service.mark_turn_finished(quest_id, status=normalized_status)
2180
+ with self._turn_lock:
2181
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
2182
+ state.pop("recovery_pending", None)
2183
+ return {
2184
+ "snapshot": snapshot,
2185
+ "blocked": False,
2186
+ }
2187
+ except Exception:
2188
+ with self._turn_lock:
2189
+ state = self._turn_state.setdefault(quest_id, {"running": False, "pending": False})
2190
+ state.pop("recovery_pending", None)
2191
+ raise
2192
+
1858
2193
  def control_quest(self, quest_id: str, *, action: str, source: str = "local") -> dict:
1859
2194
  normalized_action = str(action or "").strip().lower()
1860
2195
  if normalized_action == "pause":
@@ -2321,12 +2656,15 @@ class DaemonApp:
2321
2656
  )
2322
2657
  retry_policy = self._runner_retry_policy(runner_name, runner_cfg if isinstance(runner_cfg, dict) else {})
2323
2658
  max_attempts = int(retry_policy.get("max_attempts") or 1)
2324
- turn_id = generate_id("turn")
2325
- retry_context: dict[str, Any] | None = None
2659
+ resumed_start_attempt, resumed_turn_id, retry_context = self._resume_retry_state(
2660
+ snapshot,
2661
+ max_attempts=max_attempts,
2662
+ )
2663
+ turn_id = resumed_turn_id or generate_id("turn")
2326
2664
  quest_root = Path(snapshot["quest_root"])
2327
2665
  worktree_root = Path(str(snapshot["current_workspace_root"])) if snapshot.get("current_workspace_root") else None
2328
2666
 
2329
- for attempt_index in range(1, max_attempts + 1):
2667
+ for attempt_index in range(resumed_start_attempt, max_attempts + 1):
2330
2668
  current_run_id = run_id if attempt_index == 1 else generate_id("run")
2331
2669
  if attempt_index > 1:
2332
2670
  self._append_retry_event(
@@ -2395,6 +2733,31 @@ class DaemonApp:
2395
2733
  previous_output_text="",
2396
2734
  stderr_text=str(exc),
2397
2735
  )
2736
+ diagnosis = self._non_retryable_failure_diagnosis(
2737
+ runner_name=runner_name,
2738
+ summary=failure_summary,
2739
+ stderr_text=str(exc),
2740
+ output_text="",
2741
+ )
2742
+ if diagnosis is not None:
2743
+ self.quest_service.update_runtime_state(
2744
+ quest_root=quest_root,
2745
+ continuation_policy="wait_for_user_or_resume",
2746
+ continuation_reason="non_retryable_runner_error",
2747
+ continuation_updated_at=utc_now(),
2748
+ )
2749
+ self._record_turn_error(
2750
+ quest_id=quest_id,
2751
+ runner_name=runner_name,
2752
+ run_id=current_run_id,
2753
+ skill_id=skill_id,
2754
+ model=model,
2755
+ summary=f"{diagnosis.problem} {failure_summary}".strip(),
2756
+ retry_state=None,
2757
+ diagnosis_code=diagnosis.code,
2758
+ guidance=list(diagnosis.guidance),
2759
+ )
2760
+ return
2398
2761
  if bool(retry_policy.get("enabled")) and attempt_index < max_attempts:
2399
2762
  delay_seconds = self._retry_delay_seconds(retry_policy, attempt_index=attempt_index + 1)
2400
2763
  next_retry_at = self._retry_next_timestamp(delay_seconds)
@@ -2543,6 +2906,31 @@ class DaemonApp:
2543
2906
  previous_output_text=result.output_text,
2544
2907
  stderr_text=result.stderr_text,
2545
2908
  )
2909
+ diagnosis = self._non_retryable_failure_diagnosis(
2910
+ runner_name=runner_name,
2911
+ summary=failure_summary,
2912
+ stderr_text=result.stderr_text,
2913
+ output_text=result.output_text,
2914
+ )
2915
+ if diagnosis is not None:
2916
+ self.quest_service.update_runtime_state(
2917
+ quest_root=quest_root,
2918
+ continuation_policy="wait_for_user_or_resume",
2919
+ continuation_reason="non_retryable_runner_error",
2920
+ continuation_updated_at=utc_now(),
2921
+ )
2922
+ self._record_turn_error(
2923
+ quest_id=quest_id,
2924
+ runner_name=runner_name,
2925
+ run_id=result.run_id,
2926
+ skill_id=skill_id,
2927
+ model=model,
2928
+ summary=f"{diagnosis.problem} {failure_summary}".strip(),
2929
+ retry_state=None,
2930
+ diagnosis_code=diagnosis.code,
2931
+ guidance=list(diagnosis.guidance),
2932
+ )
2933
+ return
2546
2934
  if bool(retry_policy.get("enabled")) and attempt_index < max_attempts:
2547
2935
  delay_seconds = self._retry_delay_seconds(retry_policy, attempt_index=attempt_index + 1)
2548
2936
  next_retry_at = self._retry_next_timestamp(delay_seconds)
@@ -2736,6 +3124,18 @@ class DaemonApp:
2736
3124
 
2737
3125
  return skill
2738
3126
 
3127
@staticmethod
def _direct_user_turn_skill(snapshot: dict) -> str:
    """Choose the stage skill for a turn that directly answers the user.

    The snapshot's ``active_anchor`` is preferred, then its
    ``continuation_anchor``; an anchor is usable only if it is one of the
    currently available standard skills and is not ``"decision"``. When
    neither qualifies the fallback is ``"baseline"`` if available, else
    ``"scout"``. The chosen name is passed through
    ``DaemonApp._turn_skill_stage_gate`` before being returned.
    """
    stage_skills = current_standard_skills(repo_root())
    anchors = [
        str(snapshot.get(key) or "").strip()
        for key in ("active_anchor", "continuation_anchor")
    ]
    chosen = next(
        (name for name in anchors if name in stage_skills and name != "decision"),
        None,
    )
    if chosen is None:
        chosen = "baseline" if "baseline" in stage_skills else "scout"
    return DaemonApp._turn_skill_stage_gate(snapshot, chosen)
3138
+
2739
3139
  @staticmethod
2740
3140
  def _turn_skill_for(
2741
3141
  snapshot: dict,
@@ -2747,16 +3147,6 @@ class DaemonApp:
2747
3147
  available_stage_skills = current_standard_skills(repo_root())
2748
3148
  workspace_mode = DaemonApp._workspace_mode_for(snapshot)
2749
3149
 
2750
- def copilot_default_skill() -> str:
2751
- active_anchor = str(snapshot.get("active_anchor") or "").strip()
2752
- if active_anchor in available_stage_skills and active_anchor != "decision":
2753
- return DaemonApp._turn_skill_stage_gate(snapshot, active_anchor)
2754
- continuation_anchor = str(snapshot.get("continuation_anchor") or "").strip()
2755
- if continuation_anchor in available_stage_skills and continuation_anchor != "decision":
2756
- return DaemonApp._turn_skill_stage_gate(snapshot, continuation_anchor)
2757
- fallback = "baseline" if "baseline" in available_stage_skills else "scout"
2758
- return DaemonApp._turn_skill_stage_gate(snapshot, fallback)
2759
-
2760
3150
  reply_target = str((latest_user_message or {}).get("reply_to_interaction_id") or "").strip()
2761
3151
  if reply_target:
2762
3152
  for item in (snapshot.get("active_interactions") or []):
@@ -2783,8 +3173,8 @@ class DaemonApp:
2783
3173
  ):
2784
3174
  return "decision"
2785
3175
  if str(item.get("reply_mode") or "") == "threaded":
2786
- if workspace_mode == "copilot":
2787
- return copilot_default_skill()
3176
+ if workspace_mode == "copilot" or turn_mode in {"answering", "command_execution"}:
3177
+ return DaemonApp._direct_user_turn_skill(snapshot)
2788
3178
  return DaemonApp._turn_skill_stage_gate(
2789
3179
  snapshot,
2790
3180
  DaemonApp._continuation_anchor_for(snapshot),
@@ -2792,9 +3182,9 @@ class DaemonApp:
2792
3182
  if turn_mode == "recovering":
2793
3183
  return "decision"
2794
3184
  if workspace_mode == "copilot" and latest_user_message is not None:
2795
- return copilot_default_skill()
3185
+ return DaemonApp._direct_user_turn_skill(snapshot)
2796
3186
  if turn_mode in {"answering", "command_execution"}:
2797
- return "decision"
3187
+ return DaemonApp._direct_user_turn_skill(snapshot)
2798
3188
  if str(turn_reason or "").strip() == "auto_continue" or latest_user_message is None:
2799
3189
  return DaemonApp._turn_skill_stage_gate(
2800
3190
  snapshot,
@@ -3108,8 +3498,11 @@ class DaemonApp:
3108
3498
  summary: str,
3109
3499
  display_status: str = "error",
3110
3500
  retry_state: dict[str, Any] | None = None,
3501
+ diagnosis_code: str | None = None,
3502
+ guidance: list[str] | None = None,
3111
3503
  ) -> None:
3112
3504
  quest_root = self.home / "quests" / quest_id
3505
+ normalized_guidance = [str(line) for line in (guidance or []) if str(line).strip()]
3113
3506
  append_jsonl(
3114
3507
  quest_root / ".ds" / "events.jsonl",
3115
3508
  {
@@ -3121,6 +3514,8 @@ class DaemonApp:
3121
3514
  "skill_id": skill_id,
3122
3515
  "model": model,
3123
3516
  "summary": summary,
3517
+ "diagnosis_code": str(diagnosis_code or "").strip() or None,
3518
+ "guidance": normalized_guidance,
3124
3519
  "created_at": utc_now(),
3125
3520
  },
3126
3521
  )
@@ -3131,6 +3526,16 @@ class DaemonApp:
3131
3526
  active_run_id=None,
3132
3527
  retry_state=retry_state,
3133
3528
  )
3529
+ notice_message = summary
3530
+ if normalized_guidance:
3531
+ notice_message = "\n".join(
3532
+ [
3533
+ summary,
3534
+ "",
3535
+ "Suggested fix:",
3536
+ *[f"- {line}" for line in normalized_guidance[:3]],
3537
+ ]
3538
+ ).strip()
3134
3539
  self.logger.log(
3135
3540
  "error",
3136
3541
  "runner.turn_error",
@@ -3143,7 +3548,7 @@ class DaemonApp:
3143
3548
  )
3144
3549
  self._relay_quest_message_to_bound_connectors(
3145
3550
  quest_id,
3146
- message=summary,
3551
+ message=notice_message,
3147
3552
  kind="error",
3148
3553
  response_phase="final",
3149
3554
  importance="warning",
@@ -3154,10 +3559,29 @@ class DaemonApp:
3154
3559
  "skill_id": skill_id,
3155
3560
  "runner": runner_name,
3156
3561
  "model": model,
3562
+ "diagnosis_code": str(diagnosis_code or "").strip() or None,
3157
3563
  }
3158
3564
  ],
3159
3565
  )
3160
3566
 
3567
@staticmethod
def _non_retryable_failure_diagnosis(
    *,
    runner_name: str,
    summary: str,
    stderr_text: str,
    output_text: str,
) -> FailureDiagnosis | None:
    """Return the failure diagnosis only when it is marked as not retriable.

    All arguments are forwarded unchanged to ``diagnose_runner_failure``.

    Returns:
        The diagnosis when one is produced and its ``retriable`` attribute
        is falsy; ``None`` when there is no diagnosis or it is retriable.
    """
    verdict = diagnose_runner_failure(
        runner_name=runner_name,
        summary=summary,
        stderr_text=stderr_text,
        output_text=output_text,
    )
    if verdict is not None and not verdict.retriable:
        return verdict
    return None
3584
+
3161
3585
  def _record_turn_postprocess_warning(
3162
3586
  self,
3163
3587
  *,
@@ -7375,7 +7799,7 @@ class DaemonApp:
7375
7799
  payload = result(**params, path=self.path)
7376
7800
  elif method == "GET":
7377
7801
  payload = result(**params) if params else result()
7378
- elif route_name in {"document_open", "document_asset_upload", "chat", "command", "quest_control", "config_save", "quest_create", "quest_baseline_binding", "run_create", "qq_inbound", "connector_inbound", "docs_open", "admin_shutdown", "bash_stop", "quest_settings", "quest_bindings", "quest_delete", "quest_layout_update", "terminal_session_ensure", "terminal_attach", "terminal_input", "stage_view", "latex_init", "latex_compile", "system_update_action", "weixin_login_qr_start", "weixin_login_qr_wait", "arxiv_import", "annotation_create", "auth_login", "auth_rotate"}:
7802
+ elif route_name in {"document_open", "document_asset_upload", "quest_file_create_folder", "quest_file_upload", "quest_file_rename", "quest_file_move", "quest_file_delete", "chat", "command", "quest_control", "config_save", "quest_create", "quest_baseline_binding", "run_create", "qq_inbound", "connector_inbound", "docs_open", "admin_shutdown", "bash_stop", "quest_settings", "quest_bindings", "quest_delete", "quest_layout_update", "terminal_session_ensure", "terminal_attach", "terminal_input", "stage_view", "latex_init", "latex_compile", "system_update_action", "weixin_login_qr_start", "weixin_login_qr_wait", "arxiv_import", "annotation_create", "auth_login", "auth_rotate"}:
7379
7803
  payload = result(**params, body=body)
7380
7804
  elif route_name == "config_validate":
7381
7805
  payload = result(body)
@@ -0,0 +1,6 @@
1
+ from .runner_failures import FailureDiagnosis, diagnose_runner_failure
2
+
3
+ __all__ = [
4
+ "FailureDiagnosis",
5
+ "diagnose_runner_failure",
6
+ ]