optio-claudecode 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/PKG-INFO +1 -1
  2. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/pyproject.toml +1 -1
  3. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/host_actions.py +84 -9
  4. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/session.py +106 -32
  5. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode.egg-info/PKG-INFO +1 -1
  6. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode.egg-info/SOURCES.txt +3 -0
  7. optio_claudecode-0.2.3/tests/test_kill_ttyd_by_socket.py +45 -0
  8. optio_claudecode-0.2.3/tests/test_rescue_orphan.py +270 -0
  9. optio_claudecode-0.2.3/tests/test_teardown_session_tree.py +86 -0
  10. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/README.md +0 -0
  11. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/setup.cfg +0 -0
  12. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/__init__.py +0 -0
  13. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/account.py +0 -0
  14. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/cred_watcher.py +0 -0
  15. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/oauth.py +0 -0
  16. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/oauth_redirect.py +0 -0
  17. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/prompt.py +0 -0
  18. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/seed_manifest.py +0 -0
  19. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/snapshots.py +0 -0
  20. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode/types.py +0 -0
  21. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode.egg-info/dependency_links.txt +0 -0
  22. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode.egg-info/requires.txt +0 -0
  23. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/src/optio_claudecode.egg-info/top_level.txt +0 -0
  24. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_account_summary.py +0 -0
  25. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_agent_sender_claudecode.py +0 -0
  26. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_await_claude_gone.py +0 -0
  27. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_cred_watcher.py +0 -0
  28. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_home_isolation.py +0 -0
  29. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_host_actions.py +0 -0
  30. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_launch_detached_checked.py +0 -0
  31. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_oauth.py +0 -0
  32. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_oauth_redirect.py +0 -0
  33. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_on_resume_refresh.py +0 -0
  34. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_prompt.py +0 -0
  35. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_purge_seed.py +0 -0
  36. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_rekey_projects.py +0 -0
  37. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_resume_prompt.py +0 -0
  38. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_resume_sentence_claudecode.py +0 -0
  39. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_runtime_cache.py +0 -0
  40. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_sanity.py +0 -0
  41. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_seed_config.py +0 -0
  42. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_seed_provider.py +0 -0
  43. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_send_text_to_claude.py +0 -0
  44. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_blob_hooks.py +0 -0
  45. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_hooks.py +0 -0
  46. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_local.py +0 -0
  47. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_resume.py +0 -0
  48. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_resume_decrypt_failure.py +0 -0
  49. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_seed_capture.py +0 -0
  50. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_seed_consume.py +0 -0
  51. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_seed_saveback.py +0 -0
  52. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_session_seed_unknown_id.py +0 -0
  53. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_snapshots.py +0 -0
  54. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_tmux_persistence.py +0 -0
  55. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_tmux_socket_path.py +0 -0
  56. {optio_claudecode-0.2.2 → optio_claudecode-0.2.3}/tests/test_types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: optio-claudecode
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Run Anthropic Claude Code as an optio task; local subprocess or remote via SSH; ttyd-served TUI iframe.
5
5
  Author-email: Kristof Csillag <kristof.csillag@deai-labs.com>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "optio-claudecode"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Run Anthropic Claude Code as an optio task; local subprocess or remote via SSH; ttyd-served TUI iframe."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -676,18 +676,41 @@ def _claude_pgrep_pattern(claude_path: str) -> str:
676
676
  return "^" + body
677
677
 
678
678
 
679
def _socket_pkill_pattern(socket_path: str) -> str:
    """Build the ``pkill -f`` regex selecting the ttyd that serves ``socket_path``.

    An orphan ttyd is launched as ``ttyd ... -- tmux -S <socket> attach``, so
    its cmdline carries both the ``ttyd`` token and the private socket path.

    ``ttyd`` is spelled ``[t]tyd`` so that the pkill invocation, whose own argv
    contains this pattern, does not match itself. This is the same trick
    ``_claude_pgrep_pattern`` plays with ``[c]laude``. The socket path is
    appended verbatim, which scopes the match to this task's private socket
    rather than some other ttyd.

    A falsy ``socket_path`` is handed back unchanged, i.e. no pattern is built.
    """
    return f"[t]tyd.*{socket_path}" if socket_path else socket_path
690
+
691
+
692
async def _kill_ttyd_by_socket(host: "Host", socket_path: str) -> None:
    """Reap a detached orphan ttyd that has no tracked launch handle.

    Normal teardown kills ttyd via ``terminate_subprocess(launched_handle)``.
    A crash orphan's ttyd is re-parented to init with no handle, so it is
    reaped host-side by an anchored ``pkill -f`` on the private socket path it
    carries in its cmdline.

    Best-effort: the command ends in ``|| true`` because pkill exits non-zero
    when nothing matches.

    Args:
        host: Host on which the pkill command is run.
        socket_path: The task's private tmux socket path. When empty, nothing
            is run (see below).
    """
    # An empty socket path produces an empty pattern, and an empty regex
    # matches every cmdline: ``pkill -KILL -f ''`` would SIGKILL every process
    # this user is allowed to signal. Without a socket there is no orphan ttyd
    # that can be identified, so do nothing.
    if not socket_path:
        return
    pattern = _socket_pkill_pattern(socket_path)
    await host.run_command(f"pkill -KILL -f {shlex.quote(pattern)} || true")
702
+
703
+
679
704
  async def kill_claude_processes(
680
705
  host: "Host", claude_path: str, *, signal: str = "KILL",
681
706
  ) -> None:
682
- """Force the per-task claude process tree to exit.
683
-
684
- Teardown SIGKILLs ttyd and kill-sessions tmux, but claude runs under pasta
685
- in its own process group and ignores the tmux pane's SIGHUP, so it is
686
- orphaned and survives ``await_claude_gone`` then waits for a process
687
- nothing kills, blowing the cancel grace. pasta isolates only the network
688
- namespace (not PID), so a host-side ``pkill`` on the anchored argv[0] path
689
- reaches it. Best-effort: pkill exits non-zero when nothing matches.
690
- """
707
+ """Kill the per-task claude via an anchored host-side ``pkill``.
708
+
709
+ claude ignores the tmux pane SIGHUP, and MAY run under a pasta netns
710
+ wrapper (only when ``OPTIO_CLAUDECODE_NETNS`` is set AND the host is
711
+ local). The anchored pkill on claude's argv[0] reaches it regardless of
712
+ whether pasta wraps it, because pasta isolates the network namespace, not
713
+ PID. Best-effort: pkill exits non-zero when nothing matches."""
691
714
  pattern = _claude_pgrep_pattern(claude_path)
692
715
  await host.run_command(f"pkill -{signal} -f {shlex.quote(pattern)} || true")
693
716
 
@@ -721,6 +744,58 @@ async def await_claude_gone(
721
744
  waited += poll_s
722
745
 
723
746
 
747
async def teardown_session_tree(
    host: "Host",
    *,
    tmux_path: str,
    tmux_socket: str,
    tmux_session: str,
    claude_path: str,
    ttyd_handle: "ProcessHandle | None" = None,
    aggressive: bool,
) -> None:
    """Tear down one claudecode session tree: ttyd, then tmux, then claude.

    Shared by normal teardown and by crash-orphan rescue. Each step is
    attempted on its own: an exception is logged with its traceback and the
    following steps still run.

    Steps, in order:

    1. ttyd. With a tracked ``ttyd_handle`` it is stopped through
       ``host.terminate_subprocess(ttyd_handle, aggressive=aggressive)``.
       Without one (a crash orphan re-parented to init) it is reaped by an
       anchored host-side pkill keyed on ``tmux_socket``.
    2. tmux ``kill-session``, which SIGHUPs the pane.
    3. ``kill_claude_processes``. claude ignores the pane SIGHUP and may run
       under a pasta netns wrapper, so it is killed explicitly by an anchored
       host-side pkill on its argv[0]. pasta isolates the network namespace,
       not PID, so the pkill reaches claude either way.
    4. ``await_claude_gone``. Waits for quiescence so that a subsequent
       capture tar does not race a dying claude.
    """

    async def _attempt(step, failure_msg, *log_args):
        # Run a single step; a failure is logged rather than propagated so
        # the remaining steps are still attempted.
        try:
            await step()
        except Exception:
            _LOG.exception(failure_msg, *log_args)

    # Step 1: ttyd, by socket for a handle-less orphan, else by handle.
    if ttyd_handle is None:
        await _attempt(
            lambda: _kill_ttyd_by_socket(host, tmux_socket),
            "orphan ttyd reap failed (socket=%s)",
            tmux_socket,
        )
    else:
        await _attempt(
            lambda: host.terminate_subprocess(ttyd_handle, aggressive=aggressive),
            "terminate_subprocess (ttyd) failed",
        )

    # Step 2: tmux session.
    await _attempt(
        lambda: _kill_tmux_session(host, tmux_path, tmux_socket, tmux_session),
        "tmux session teardown failed",
    )

    # Step 3: claude itself.
    await _attempt(
        lambda: kill_claude_processes(host, claude_path),
        "kill_claude_processes failed",
    )

    # Step 4: wait until claude is really gone.
    await _attempt(
        lambda: await_claude_gone(host, claude_path),
        "await_claude_gone failed; proceeding",
    )
797
+
798
+
724
799
  async def tmux_session_alive(
725
800
  host: "Host", tmux_path: str, socket_path: str, session_name: str,
726
801
  ) -> bool:
@@ -110,6 +110,11 @@ async def run_claudecode_session(
110
110
 
111
111
  await host.connect()
112
112
 
113
+ # Crash-orphan rescue: if a non-graceful host death left this task's
114
+ # tmux/ttyd/claude tree running with unsaved state, harvest it into a fresh
115
+ # snapshot and kill it BEFORE the driver wipes the workdir. No-op otherwise.
116
+ await _rescue_orphan_if_present(ctx, config=config, host=host)
117
+
113
118
  async def _prepare(host: Host, hook_ctx: HookContext) -> None:
114
119
  """Install the claude+ttyd runtime and restore a resume snapshot.
115
120
 
@@ -358,41 +363,27 @@ async def run_claudecode_session(
358
363
  if not ctx.should_continue():
359
364
  cancelled = True
360
365
  _trace("finally: ENTER cancelled=%s resuming=%s", cancelled, resuming)
361
- if launched_handle is not None:
362
- _trace("finally: terminate_subprocess START aggressive=%s", cancelled)
363
- try:
364
- await host.terminate_subprocess(launched_handle, aggressive=cancelled)
365
- except Exception:
366
- _LOG.exception("terminate_subprocess failed")
367
- _trace("finally: terminate_subprocess DONE")
368
-
369
- if tmux_path is not None and tmux_socket is not None and tmux_session is not None:
366
+ if (
367
+ launched_handle is not None
368
+ and tmux_path is not None
369
+ and tmux_socket is not None
370
+ and tmux_session is not None
371
+ and claude_path
372
+ ):
373
+ _trace("finally: teardown_session_tree START aggressive=%s", cancelled)
370
374
  try:
371
- await host_actions._kill_tmux_session(
372
- host, tmux_path, tmux_socket, tmux_session,
375
+ await host_actions.teardown_session_tree(
376
+ host,
377
+ tmux_path=tmux_path,
378
+ tmux_socket=tmux_socket,
379
+ tmux_session=tmux_session,
380
+ claude_path=claude_path,
381
+ ttyd_handle=launched_handle,
382
+ aggressive=cancelled,
373
383
  )
374
384
  except Exception:
375
- _LOG.exception("tmux session teardown failed")
376
-
377
- # Wait for claude to fully exit before snapshotting, so the tar of
378
- # home/.claude reads a quiescent tree. kill-session only SIGHUPs
379
- # claude and returns; claude may still be flushing settings / mcp-cache
380
- # / locks as it dies, which would make the capture tar fail with
381
- # "file changed as we read it". Best-effort + bounded; the strict tar
382
- # exit check in _archive_home_claude is the backstop.
383
- if launched_handle is not None and claude_path:
384
- # claude runs under pasta in its own process group and ignores the
385
- # tmux pane's SIGHUP from kill-session, so it is orphaned by the
386
- # ttyd/tmux teardown above. Kill it explicitly, else await_claude_gone
387
- # waits on a process nothing kills and the cancel grace is exceeded.
388
- try:
389
- await host_actions.kill_claude_processes(host, claude_path)
390
- except Exception:
391
- _LOG.exception("kill_claude_processes failed")
392
- try:
393
- await host_actions.await_claude_gone(host, claude_path)
394
- except Exception:
395
- _LOG.exception("await_claude_gone failed; proceeding to capture")
385
+ _LOG.exception("teardown_session_tree failed")
386
+ _trace("finally: teardown_session_tree DONE")
396
387
 
397
388
  if cred_watch_task is not None:
398
389
  cred_watch_task.cancel()
@@ -566,6 +557,89 @@ async def _extract_home_claude(host: Host, plain: bytes) -> None:
566
557
  await host.run_command(f"rm -f {shlex.quote(tmpfile)}")
567
558
 
568
559
 
560
+ _RESCUE_MARKER = ".optio-rescue-pending"
561
+
562
+
563
def _claude_bin_path(host: "Host") -> str:
    """Return the fixed path claude is launched from inside the isolated HOME.

    The path is ``<workdir>/home/.local/bin/claude``; trailing slashes on
    ``host.workdir`` are dropped before joining.
    """
    workdir = host.workdir.rstrip("/")
    return workdir + "/home/.local/bin/claude"
566
+
567
+
568
async def _marker_present(host: "Host", marker_path: str) -> bool:
    """Report whether ``marker_path`` exists on ``host``.

    Runs a shell ``test -e`` probe that prints ``YES`` when the path exists.
    The trailing ``|| true`` keeps the command's exit status zero when it
    does not.
    """
    probe = "test -e " + shlex.quote(marker_path) + " && echo YES || true"
    result = await host.run_command(probe)
    return "YES" in result.stdout
573
+
574
+
575
async def _rescue_orphan_if_present(
    ctx: ProcessContext, host: Host, config: ClaudeCodeTaskConfig,
) -> None:
    """Recover a crash-surviving orphan session before the workdir is wiped.

    A non-graceful host death (disk-full, OOM, power loss) leaves the
    tmux/ttyd/claude sub-tree running, re-parented to init, with unsaved state
    on disk and no snapshot. This runs before ``run_log_protocol_session``
    (hence before ``setup_workdir``). It looks for that orphan on the
    deterministic per-task socket, kills it, and captures its state into a
    fresh snapshot for the unchanged resume path to restore.

    It does nothing when the config does not support resume, when ``ctx`` is
    not resuming, or when neither a live session nor a leftover rescue marker
    is found.

    The orphan is killed before capture on purpose. A dead, static workdir
    stops a live claude from repopulating ``home/.claude`` into the plaintext
    workdir blob after the expunge, and gives a race-free tar. See spec
    decisions D3/D4.

    An exception raised by the capture propagates, and the marker is then
    left in place so a retried resume re-enters the rescue.
    """
    resumable = getattr(config, "supports_resume", True)
    if not resumable or not bool(getattr(ctx, "resume", False)):
        return

    tmux_socket = host_actions._tmux_socket_path(host)
    tmux_session = "optio"
    workdir = host.workdir.rstrip("/")
    pending_marker = f"{workdir}/{_RESCUE_MARKER}"

    tmux_path = await host_actions._require_tmux(host)
    session_alive = await host_actions.tmux_session_alive(
        host, tmux_path, tmux_socket, tmux_session,
    )
    # The marker is only probed when no live session was found.
    if not (session_alive or await _marker_present(host, pending_marker)):
        return  # normal resume; nothing to rescue

    _LOG.warning(
        "crash-orphan rescue: live=%s socket=%s — capturing live state before wipe",
        session_alive, tmux_socket,
    )

    # Step 1: durable marker. Once the tree is killed the has-session signal
    # is gone, so the marker is what lets a retry re-enter the rescue.
    await host.write_text(_RESCUE_MARKER, "")

    # Step 2: kill the orphan tree. There is no launch handle, so the orphan
    # ttyd is reaped by its socket path.
    await host_actions.teardown_session_tree(
        host,
        tmux_path=tmux_path,
        tmux_socket=tmux_socket,
        tmux_session=tmux_session,
        claude_path=_claude_bin_path(host),
        ttyd_handle=None,
        aggressive=True,
    )

    # Step 3: capture the now-static workdir, producing the same artifacts as
    # a normal teardown capture. The marker is excluded so a restored workdir
    # cannot trigger the rescue again in a loop.
    snapshot_exclude = list(config.workdir_exclude or [])
    snapshot_exclude.append(_RESCUE_MARKER)
    await _capture_snapshot(
        ctx, host,
        end_state="rescued",
        workdir_exclude=snapshot_exclude,
        session_blob_encrypt=config.session_blob_encrypt,
    )

    # Step 4: the capture is durable, so the marker can go.
    await host.run_command("rm -f " + shlex.quote(pending_marker))
    _LOG.warning("crash-orphan rescue: fresh snapshot captured; orphan killed")
641
+
642
+
569
643
  async def _capture_snapshot(
570
644
  ctx: ProcessContext,
571
645
  host: Host,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: optio-claudecode
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Run Anthropic Claude Code as an optio task; local subprocess or remote via SSH; ttyd-served TUI iframe.
5
5
  Author-email: Kristof Csillag <kristof.csillag@deai-labs.com>
6
6
  License-Expression: Apache-2.0
@@ -22,6 +22,7 @@ tests/test_await_claude_gone.py
22
22
  tests/test_cred_watcher.py
23
23
  tests/test_home_isolation.py
24
24
  tests/test_host_actions.py
25
+ tests/test_kill_ttyd_by_socket.py
25
26
  tests/test_launch_detached_checked.py
26
27
  tests/test_oauth.py
27
28
  tests/test_oauth_redirect.py
@@ -29,6 +30,7 @@ tests/test_on_resume_refresh.py
29
30
  tests/test_prompt.py
30
31
  tests/test_purge_seed.py
31
32
  tests/test_rekey_projects.py
33
+ tests/test_rescue_orphan.py
32
34
  tests/test_resume_prompt.py
33
35
  tests/test_resume_sentence_claudecode.py
34
36
  tests/test_runtime_cache.py
@@ -46,6 +48,7 @@ tests/test_session_seed_consume.py
46
48
  tests/test_session_seed_saveback.py
47
49
  tests/test_session_seed_unknown_id.py
48
50
  tests/test_snapshots.py
51
+ tests/test_teardown_session_tree.py
49
52
  tests/test_tmux_persistence.py
50
53
  tests/test_tmux_socket_path.py
51
54
  tests/test_types.py
@@ -0,0 +1,45 @@
1
+ import pytest
2
+
3
+ import optio_claudecode.host_actions as H
4
+
5
+
6
class _Result:
    """Minimal command result: the given stdout, empty stderr, exit code 0."""

    def __init__(self, stdout=""):
        self.stdout, self.stderr, self.exit_code = stdout, "", 0
11
+
12
+
13
class _Host:
    """Stub host that records every command and returns an empty result."""

    def __init__(self):
        self.commands = []

    async def run_command(self, cmd, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.commands += [cmd]
        return _Result()
20
+
21
+
22
@pytest.mark.asyncio
async def test_kill_ttyd_by_socket_anchored_pkill():
    """A single best-effort pkill is issued, scoped to the socket path."""
    socket = "/tmp/optio-cc-deadbeef0badcafe.sock"
    host = _Host()
    await H._kill_ttyd_by_socket(host, socket)
    assert len(host.commands) == 1
    cmd = host.commands[0]
    # Targets ttyd processes by the socket path they carry in their cmdline.
    assert "pkill" in cmd
    assert socket in cmd
    # The ttyd token is bracket-escaped so the rescue's own command line is
    # not matched (mirrors the [c]laude self-match guard used for claude).
    assert "[t]tyd" in cmd
    # Best-effort: never fails the caller when nothing matches.
    assert "|| true" in cmd
36
+
37
+
38
@pytest.mark.asyncio
async def test_kill_ttyd_by_socket_does_not_self_match():
    """The emitted command must not contain the literal token it matches."""
    # pkill -f matches against full command lines, including its own. The
    # pattern therefore spells the binary token as ``[t]tyd``: it still
    # matches a real ``ttyd`` process, but the literal text ``ttyd`` never
    # appears in pkill's own argv.
    host = _Host()
    await H._kill_ttyd_by_socket(host, "/tmp/optio-cc-abc123.sock")
    cmd = host.commands[0]
    assert "[t]tyd" in cmd
    assert "ttyd" not in cmd
@@ -0,0 +1,270 @@
1
+ import pytest
2
+
3
+ import optio_claudecode.session as S
4
+
5
+
6
class _Result:
    """Minimal command result: the given stdout, empty stderr, exit code 0."""

    def __init__(self, stdout=""):
        self.stdout, self.stderr, self.exit_code = stdout, "", 0
11
+
12
+
13
class _Host:
    """Stub host. ``existing`` is a set of marker paths reported present."""

    _PROBE = "test -e "
    _REMOVE = "rm -f "

    def __init__(self, workdir, existing=None):
        self.workdir = workdir
        self.existing = set(existing or ())
        # Bookkeeping inspected by the tests.
        self.written, self.removed, self.commands = [], [], []

    async def run_command(self, cmd, **kwargs):
        self.commands.append(cmd)
        # Emulate `test -e <path> && echo YES || true` marker probes.
        if cmd.startswith(self._PROBE):
            target = cmd[len(self._PROBE):].partition(" ")[0].strip("'\"")
            found = target in self.existing
            return _Result("YES\n" if found else "")
        # Emulate `rm -f <path>`: record the removal and forget the path.
        if cmd.startswith(self._REMOVE):
            target = cmd[len(self._REMOVE):].strip().strip("'\"")
            self.removed.append(target)
            self.existing.discard(target)
        return _Result()

    async def write_text(self, rel, text):
        # ``rel`` is recorded as given and marked present under the workdir.
        self.written.append(rel)
        self.existing.add(f"{self.workdir.rstrip('/')}/{rel}")
38
+
39
+
40
class _Config:
    """Stub task config: no workdir excludes and no session-blob encryption."""

    # Deliberately has no ``supports_resume`` attribute.
    workdir_exclude = None
    session_blob_encrypt = None
43
+
44
+
45
class _Ctx:
    """Stub process context for a task that is being resumed."""

    process_id = "pid-1"
    resume = True
48
+
49
+
50
@pytest.fixture
def patched(monkeypatch):
    """Replace the rescue's collaborators with recording stubs.

    Returns the shared state dict. Tests set ``alive`` / ``fail_capture`` to
    steer the stubs, and read ``teardown`` / ``capture`` to see the keyword
    arguments each stub was called with.
    """
    state = {"alive": False, "teardown": [], "capture": [], "fail_capture": False}

    async def fake_alive(host, tmux_path, socket, session):
        return state["alive"]

    async def fake_require_tmux(host):
        return "tmux"

    def fake_socket_path(host):
        return "/tmp/optio-cc-deadbeef.sock"

    async def fake_teardown(host, **kwargs):
        state["teardown"].append(kwargs)

    async def fake_capture(ctx, host, **kwargs):
        # The call is recorded before the optional failure is raised.
        state["capture"].append(kwargs)
        if state["fail_capture"]:
            raise RuntimeError("capture failed")

    replacements = (
        (S.host_actions, "tmux_session_alive", fake_alive),
        (S.host_actions, "_require_tmux", fake_require_tmux),
        (S.host_actions, "_tmux_socket_path", fake_socket_path),
        (S.host_actions, "teardown_session_tree", fake_teardown),
        (S, "_capture_snapshot", fake_capture),
    )
    for target, attr, stub in replacements:
        monkeypatch.setattr(target, attr, stub)
    return state
77
+
78
+
79
@pytest.mark.asyncio
async def test_noop_when_no_session_and_no_marker(patched, tmp_path):
    """With no live session and no marker, the rescue does nothing at all."""
    patched["alive"] = False
    stub_host = _Host(str(tmp_path))

    await S._rescue_orphan_if_present(_Ctx(), stub_host, _Config())

    # Nothing killed, nothing captured, no marker written.
    observed = (patched["teardown"], patched["capture"], stub_host.written)
    assert observed == ([], [], [])
87
+
88
+
89
@pytest.mark.asyncio
async def test_triggers_on_live_session_kill_before_capture(
    patched, tmp_path, monkeypatch,
):
    """A live orphan is marked, killed and then captured, in that order."""
    patched["alive"] = True
    host = _Host(str(tmp_path))

    # The fixture records teardown and capture in separate lists, which says
    # nothing about their relative order. Wrap the fixture's stubs so each
    # call is appended to one shared timeline, together with the markers
    # already written at that moment.
    events = []
    stub_teardown = S.host_actions.teardown_session_tree
    stub_capture = S._capture_snapshot

    async def _ordered_teardown(host, **kw):
        events.append(("teardown", list(host.written)))
        await stub_teardown(host, **kw)

    async def _ordered_capture(ctx, host, **kw):
        events.append(("capture", list(host.written)))
        await stub_capture(ctx, host, **kw)

    monkeypatch.setattr(S.host_actions, "teardown_session_tree", _ordered_teardown)
    monkeypatch.setattr(S, "_capture_snapshot", _ordered_capture)

    await S._rescue_orphan_if_present(_Ctx(), host, _Config())

    # Marker written, orphan killed, then captured: kill strictly before capture.
    assert host.written == [".optio-rescue-pending"]
    assert [name for name, _ in events] == ["teardown", "capture"]
    # The marker was already durable when the kill started.
    assert events[0][1] == [".optio-rescue-pending"]
    assert len(patched["teardown"]) == 1
    assert patched["teardown"][0]["ttyd_handle"] is None
    assert len(patched["capture"]) == 1
    # end_state marks the rescue for forensics.
    assert patched["capture"][0]["end_state"] == "rescued"
    # Marker excluded from the snapshot so a restored workdir does not loop.
    assert ".optio-rescue-pending" in (patched["capture"][0]["workdir_exclude"] or [])
    # Marker cleared after capture success.
    marker = f"{str(tmp_path).rstrip('/')}/.optio-rescue-pending"
    assert marker in host.removed
106
+
107
+
108
@pytest.mark.asyncio
async def test_triggers_on_marker_even_without_session(patched, tmp_path):
    """A leftover marker alone re-enters the rescue (mid-rescue retry)."""
    patched["alive"] = False
    workdir = str(tmp_path)
    leftover = f"{workdir.rstrip('/')}/.optio-rescue-pending"
    stub_host = _Host(workdir, existing={leftover})

    await S._rescue_orphan_if_present(_Ctx(), stub_host, _Config())

    # Detect-by-marker path still kills and captures exactly once.
    counts = (len(patched["teardown"]), len(patched["capture"]))
    assert counts == (1, 1)
117
+
118
+
119
@pytest.mark.asyncio
async def test_capture_failure_persists_marker_and_reraises(patched, tmp_path):
    """A failing capture propagates and leaves the marker for a retry."""
    patched.update(alive=True, fail_capture=True)
    workdir = str(tmp_path)
    stub_host = _Host(workdir)

    with pytest.raises(RuntimeError):
        await S._rescue_orphan_if_present(_Ctx(), stub_host, _Config())

    # Marker NOT removed, so a retried resume re-enters rescue.
    pending = f"{workdir.rstrip('/')}/.optio-rescue-pending"
    assert pending not in stub_host.removed
129
+
130
+
131
@pytest.mark.asyncio
async def test_skipped_when_not_resuming(patched, tmp_path):
    """A fresh (non-resume) run never rescues, even with a live session."""
    patched["alive"] = True

    class _FreshCtx:
        # Same process as _Ctx, but not resuming.
        process_id = "pid-1"
        resume = False

    stub_host = _Host(str(tmp_path))
    await S._rescue_orphan_if_present(_FreshCtx(), stub_host, _Config())

    observed = (patched["teardown"], patched["capture"])
    assert observed == ([], [])
143
+
144
+
145
+ # --------------------------------------------------------------------------
146
+ # Integration: real (shim) detached session -> orphan -> rescue.
147
+ #
148
+ # Exercises the REAL teardown_session_tree + _capture_snapshot against a real
149
+ # local tmux tree: launch a detached tmux session whose "claude" is a long-lived
150
+ # sleep shim at the deterministic <workdir>/home/.local/bin/claude path, abandon
151
+ # it (no teardown) so it is the orphan, then run _rescue_orphan_if_present and
152
+ # assert the orphan is gone, a fresh "rescued" snapshot exists, and the marker is
153
+ # cleared. Mirrors the launch scaffolding in test_tmux_persistence.py.
154
+ # --------------------------------------------------------------------------
155
+
156
+ import os # noqa: E402
157
+ import shutil # noqa: E402
158
+
159
+ from optio_claudecode import ClaudeCodeTaskConfig # noqa: E402
160
+ from optio_claudecode import host_actions as H # noqa: E402
161
+
162
+
163
+ _NO_TMUX = shutil.which("tmux") is None
164
+
165
+
166
async def _launch_orphan_session(host):
    """Start a detached tmux session whose 'claude' records its pid then sleeps.

    The session is created on the deterministic per-task socket that the
    rescue probes. The 'claude' is a bash shim written to
    ``<workdir>/home/.local/bin/claude``: it writes its own pid to
    ``<workdir>/claude.pid`` and then ``exec``s ``sleep 60``.

    Returns ``(tmux_path, socket)``.
    """
    import shlex

    workdir = host.workdir
    bin_dir = f"{workdir}/home/.local/bin"
    os.makedirs(bin_dir, exist_ok=True)
    claude = f"{bin_dir}/claude"
    pid_file = f"{workdir}/claude.pid"
    with open(claude, "w") as f:
        # The pid-file path is shell-quoted: it is interpolated into a bash
        # script, and the workdir may contain spaces or shell metacharacters.
        f.write(
            f"#!/bin/bash\necho $$ > {shlex.quote(pid_file)}\nexec sleep 60\n"
        )
    os.chmod(claude, 0o755)

    # Plant credentials so the capture credentials-present guard passes.
    os.makedirs(f"{workdir}/home/.claude", exist_ok=True)
    with open(f"{workdir}/home/.claude/.credentials.json", "w") as f:
        f.write('{"token": "test"}')

    tmux_path = await H._require_tmux(host)
    socket = H._tmux_socket_path(host)
    argv = H.build_tmux_session_argv(
        tmux_path=tmux_path, claude_path=claude, workdir=workdir,
        socket_path=socket, session_name="optio",
        extra_env=None, claude_flags=[],
    )
    cmd = " ".join(shlex.quote(a) for a in argv)
    r = await host.run_command(cmd)
    assert r.exit_code == 0, r.stderr
    return tmux_path, socket
195
+
196
+
197
@pytest.mark.skipif(_NO_TMUX, reason="tmux not installed on the worker")
@pytest.mark.asyncio
async def test_rescue_end_to_end_kills_orphan_and_snapshots(
    mongo_db, tmp_path, ctx_and_captures,
):
    """Launch a real (shim) detached session, abandon it so it becomes an
    orphan, then run _rescue_orphan_if_present and assert: the tmux session is
    gone, no claude shim remains, and a fresh 'rescued' snapshot was inserted.

    Manual run:
    .venv/bin/python -m pytest \\
    packages/optio-claudecode/tests/test_rescue_orphan.py \\
    -k end_to_end -v
    """
    import asyncio
    import shlex

    from optio_claudecode.snapshots import load_latest_snapshot
    from optio_host.host import LocalHost

    ctx, _cap, _flag = ctx_and_captures
    ctx.resume = True

    # (a)+(b) Build the LocalHost and launch the detached (shim) session on the
    # deterministic per-task socket.
    taskdir = str(tmp_path / "task")
    os.makedirs(taskdir, exist_ok=True)
    host = LocalHost(taskdir=taskdir)
    os.makedirs(host.workdir, exist_ok=True)

    tmux_path, socket = await _launch_orphan_session(host)
    # Confirm the orphan is actually alive before we rescue it.
    assert await H.tmux_session_alive(host, tmux_path, socket, "optio")
    pid_file = f"{host.workdir}/claude.pid"
    # The shim writes its pid asynchronously; poll for up to ~3 seconds.
    for _ in range(30):
        if os.path.exists(pid_file):
            break
        await asyncio.sleep(0.1)
    assert os.path.exists(pid_file)
    # Close the file deterministically instead of leaking the handle.
    with open(pid_file) as f:
        child_pid = int(f.read().strip())

    # (c) Crash simulation: we never captured an in-process handle, and we do
    # not tear the session down. The tmux/claude tree is the orphan.

    config = ClaudeCodeTaskConfig(
        consumer_instructions="(rescue e2e)",
        supports_resume=True,
        session_blob_encrypt=lambda b: b,
        session_blob_decrypt=lambda b: b,
    )

    # (d) Rescue.
    await S._rescue_orphan_if_present(ctx, host, config)

    # (e) Assertions.
    # The tmux session is gone (orphan killed before capture).
    assert (await H.tmux_session_alive(host, tmux_path, socket, "optio")) is False
    # The sleep shim child is reaped (kill-session SIGHUPs the pane tree).
    await asyncio.sleep(0.3)
    with pytest.raises(ProcessLookupError):
        os.kill(child_pid, 0)

    # A fresh 'rescued' snapshot was inserted.
    snap = await load_latest_snapshot(
        mongo_db, prefix=ctx._prefix, process_id=ctx.process_id,
    )
    assert snap is not None
    assert snap["endState"] == "rescued"

    # Marker cleared on success. The path is shell-quoted because it is
    # derived from tmp_path and goes through a shell.
    marker = f"{host.workdir.rstrip('/')}/.optio-rescue-pending"
    r = await host.run_command(
        f"test -e {shlex.quote(marker)} && echo Y || true"
    )
    assert "Y" not in r.stdout
@@ -0,0 +1,86 @@
1
+ import pytest
2
+
3
+ import optio_claudecode.host_actions as H
4
+
5
+
6
@pytest.fixture
def calls(monkeypatch):
    """Stub the four teardown steps and return the shared call log.

    Each stub appends a ``(step_name, argument)`` tuple, so the log reflects
    both which steps ran and in what order.
    """
    log = []

    async def fake_kill_ttyd(host, socket):
        log.append(("ttyd_socket", socket))

    async def fake_kill_session(host, tmux_path, socket, session):
        log.append(("kill_session", session))

    async def fake_kill_claude(host, claude_path, **kw):
        log.append(("kill_claude", claude_path))

    async def fake_await_gone(host, claude_path, **kw):
        log.append(("await_gone", claude_path))
        return True

    stubs = {
        "_kill_ttyd_by_socket": fake_kill_ttyd,
        "_kill_tmux_session": fake_kill_session,
        "kill_claude_processes": fake_kill_claude,
        "await_claude_gone": fake_await_gone,
    }
    for attr, stub in stubs.items():
        monkeypatch.setattr(H, attr, stub)
    return log
28
+
29
+
30
class _FakeHandle:
    """Opaque placeholder standing in for a tracked ttyd launch handle."""
32
+
33
+
34
class _Host:
    """Stub host recording each ``terminate_subprocess`` call."""

    def __init__(self):
        self.terminated = []

    async def terminate_subprocess(self, handle, *, aggressive):
        # Stored as (handle, aggressive) pairs in call order.
        self.terminated += [(handle, aggressive)]
40
+
41
+
42
@pytest.mark.asyncio
async def test_orphan_branch_uses_kill_ttyd_by_socket(calls):
    """Without a handle, ttyd is reaped by socket and all four steps run."""
    stub_host = _Host()
    session_kwargs = dict(
        tmux_path="tmux",
        tmux_socket="/tmp/s.sock",
        tmux_session="optio",
        claude_path="/w/home/.local/bin/claude",
    )

    await H.teardown_session_tree(
        stub_host, ttyd_handle=None, aggressive=True, **session_kwargs,
    )

    # All four steps, in order; orphan ttyd path; no terminate_subprocess.
    step_names = [name for name, _ in calls]
    assert step_names == [
        "ttyd_socket", "kill_session", "kill_claude", "await_gone",
    ]
    assert stub_host.terminated == []
55
+
56
+
57
@pytest.mark.asyncio
async def test_handle_branch_uses_terminate_subprocess(calls):
    """With a handle, ttyd goes through terminate_subprocess, not pkill."""
    stub_host = _Host()
    ttyd = _FakeHandle()
    session_kwargs = dict(
        tmux_path="tmux",
        tmux_socket="/tmp/s.sock",
        tmux_session="optio",
        claude_path="/w/home/.local/bin/claude",
    )

    await H.teardown_session_tree(
        stub_host, ttyd_handle=ttyd, aggressive=False, **session_kwargs,
    )

    assert stub_host.terminated == [(ttyd, False)]
    # ttyd-by-socket NOT used when a handle is present.
    step_names = [name for name, _ in calls]
    assert step_names == ["kill_session", "kill_claude", "await_gone"]
69
+
70
+
71
@pytest.mark.asyncio
async def test_steps_are_best_effort(calls, monkeypatch):
    """A failing step is swallowed and the later steps still run."""

    async def exploding_ttyd_kill(host, socket):
        raise RuntimeError("ttyd kill blew up")

    monkeypatch.setattr(H, "_kill_ttyd_by_socket", exploding_ttyd_kill)
    stub_host = _Host()
    session_kwargs = dict(
        tmux_path="tmux",
        tmux_socket="/tmp/s.sock",
        tmux_session="optio",
        claude_path="/w/home/.local/bin/claude",
    )

    # Should not raise.
    await H.teardown_session_tree(
        stub_host, ttyd_handle=None, aggressive=True, **session_kwargs,
    )

    # The remaining three steps still ran.
    step_names = [name for name, _ in calls]
    assert step_names == ["kill_session", "kill_claude", "await_gone"]