react-agent-harness 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {react_agent_harness-0.0.1/react_agent_harness.egg-info → react_agent_harness-0.0.2}/PKG-INFO +1 -1
  2. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/README.md +93 -25
  3. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/agents/base.py +104 -26
  4. react_agent_harness-0.0.2/harness/checkpoint.py +165 -0
  5. react_agent_harness-0.0.2/harness/hitl.py +195 -0
  6. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/runtime.py +175 -22
  7. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/orchestrator/planner.py +117 -12
  8. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/pyproject.toml +1 -1
  9. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2/react_agent_harness.egg-info}/PKG-INFO +1 -1
  10. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/react_agent_harness.egg-info/SOURCES.txt +2 -0
  11. react_agent_harness-0.0.2/tests/test_checkpoint_resume.py +1119 -0
  12. react_agent_harness-0.0.1/harness/hitl.py +0 -282
  13. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/LICENSE +0 -0
  14. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/agents/__init__.py +0 -0
  15. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/__init__.py +0 -0
  16. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/annotation.py +0 -0
  17. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/events.py +0 -0
  18. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/executor_bridge.py +0 -0
  19. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/llm/__init__.py +0 -0
  20. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/llm/openai.py +0 -0
  21. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/otel.py +0 -0
  22. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/harness/utils.py +0 -0
  23. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/__init__.py +0 -0
  24. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/episodic_lance.py +0 -0
  25. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/manager.py +0 -0
  26. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/redis_store.py +0 -0
  27. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/stores.py +0 -0
  28. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/memory/working.py +0 -0
  29. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/orchestrator/__init__.py +0 -0
  30. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/react_agent_harness.egg-info/dependency_links.txt +0 -0
  31. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/react_agent_harness.egg-info/requires.txt +0 -0
  32. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/react_agent_harness.egg-info/top_level.txt +0 -0
  33. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/setup.cfg +0 -0
  34. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_agents_base.py +0 -0
  35. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_annotation.py +0 -0
  36. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_executor_bridge.py +0 -0
  37. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_http_fetch.py +0 -0
  38. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_mcp_adapter.py +0 -0
  39. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_memory.py +0 -0
  40. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_openai_llm.py +0 -0
  41. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_orchestrator.py +0 -0
  42. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_otel.py +0 -0
  43. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_parse_action_json.py +0 -0
  44. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_redis_store.py +0 -0
  45. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_streaming.py +0 -0
  46. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_vision.py +0 -0
  47. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tests/test_working_memory.py +0 -0
  48. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/__init__.py +0 -0
  49. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/builtin/__init__.py +0 -0
  50. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/builtin/fetch_image.py +0 -0
  51. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/builtin/http_fetch.py +0 -0
  52. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/mcp/__init__.py +0 -0
  53. {react_agent_harness-0.0.1 → react_agent_harness-0.0.2}/tools/mcp/adapter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: react-agent-harness
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE
@@ -37,7 +37,8 @@ harness/runtime.py AgentRuntime — single entry point, wire once run a
37
37
  harness/events.py BusEvent + EventType — canonical event vocabulary
38
38
  harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
39
39
  harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
40
- harness/hitl.py HITL approval gate — interactive CLI, Redis checkpoint/resume
40
+ harness/hitl.py HITL approval gate — interactive CLI, session-allow list
41
+ harness/checkpoint.py CheckpointStore + _ResumeHint + maybe_resume_key — pluggable run-state persistence (file + Redis); auto-resume built into dispatch_stream / run_stream
41
42
  harness/otel.py OTELHook — OpenTelemetry span exporter (opt-in)
42
43
  harness/executor_bridge.py ExecutorBridge + ExecutorTool — controlled subprocess launcher with optional Docker sandboxing
43
44
  orchestrator/planner.py Hybrid DAG orchestrator — plan, replan, synthesize
@@ -564,7 +565,7 @@ agents.register(AgentConfig(
564
565
  hitl_tools=["write_file", "delete_file"], # these two require human approval
565
566
  ))
566
567
 
567
- # AgentRuntime auto-creates a FileApprovalStore when hitl_tools are present.
568
+ # AgentRuntime auto-creates a FileCheckpointStore when hitl_tools are present.
568
569
  runtime = AgentRuntime(...)
569
570
  await runtime.run_agent("file_agent", "clean up the logs directory")
570
571
  ```
@@ -573,19 +574,19 @@ Checkpoints are written to `~/.agent-harness/checkpoints/` by default.
573
574
  Override the directory:
574
575
 
575
576
  ```python
576
- from harness.hitl import FileApprovalStore
577
+ from harness.checkpoint import FileCheckpointStore
577
578
 
578
- runtime = AgentRuntime(..., approval_store=FileApprovalStore("/var/lib/myapp/hitl"))
579
+ runtime = AgentRuntime(..., checkpoint_store=FileCheckpointStore("/var/lib/myapp/ckp"))
579
580
  ```
580
581
 
581
582
  For Redis-backed storage (shared across processes or machines):
582
583
 
583
584
  ```python
584
585
  import redis.asyncio as aioredis
585
- from harness.hitl import RedisApprovalStore
586
+ from harness.checkpoint import RedisCheckpointStore
586
587
 
587
588
  client = aioredis.from_url("redis://localhost:6379", decode_responses=True)
588
- runtime = AgentRuntime(..., approval_store=RedisApprovalStore(client))
589
+ runtime = AgentRuntime(..., checkpoint_store=RedisCheckpointStore(client))
589
590
  ```
590
591
 
591
592
  When the agent calls `write_file` or `delete_file` a prompt appears:
@@ -597,46 +598,112 @@ When the agent calls `write_file` or `delete_file` a prompt appears:
597
598
  Tool: delete_file
598
599
  Args: {"path": "/var/log/app.log"}
599
600
  Agent: file_agent step=2
600
- Run: 3f7a1b2c-...
601
+ Run: 3f7a1b2c-...:file_agent
601
602
  ID: a1b2-c3d4
602
603
  ────────────────────────────────────────────────────────────
603
- Approve? [y/n/correction]:
604
+ y = approve once | a = allow 'delete_file' for session | n = reject | <text> = steer
605
+ Ctrl-C to pause. Resume: python my_script.py --resume 3f7a1b2c-...:file_agent
606
+ ────────────────────────────────────────────────────────────
607
+ Approve? [y/n/a/correction]:
604
608
  ```
605
609
 
606
610
  **Prompt semantics:**
607
611
 
608
612
  | Input | Effect |
609
613
  |---|---|
610
- | `y` / `yes` | Tool runs |
614
+ | `y` / `yes` | Tool runs once |
611
615
  | `n` / `no` | Tool skipped; agent sees a rejection observation |
616
+ | `a` / `allow` | Tool runs **and** is added to the session allow-list; no further prompts for this tool (or, for shell-like tools, for this command prefix) |
612
617
  | any other text | Correction: tool skipped, text injected into `WorkingMemory` as a user message; LLM self-corrects on the next step |
613
618
 
619
+ For shell-like tools (`shell`, `bash`, `run`, `exec`), `a` allows the **first
620
+ word** of the command — e.g. typing `a` when approving `shell git commit ...`
621
+ allows all `git` commands for the session but still prompts for `shell rm ...`.
622
+
614
623
  **Wall-time budget** is suspended while waiting for input — human think-time
615
624
  does not count against `max_wall_time_seconds`.
616
625
 
626
+ ### Step-level checkpointing
627
+
628
+ Enable periodic crash-resume independent of HITL:
629
+
630
+ ```python
631
+ AgentConfig(
632
+ agent_id="long_runner",
633
+ ...
634
+ checkpoint_every=3, # checkpoint before every 3rd step (0 = disabled)
635
+ )
636
+ ```
637
+
638
+ The same `CheckpointStore` is used for both HITL and step checkpoints. Resume
639
+ works with `runtime.resume(key)` regardless of how the checkpoint was created.
640
+
641
+ ### Checkpoint namespacing
642
+
643
+ Each agent writes to its own key so orchestrated runs never overwrite each other:
644
+
645
+ | Path | Checkpoint key | Stored at |
646
+ |---|---|---|
647
+ | Single-agent (`run_agent`, `run_routed`) | `<run_id>:<agent_id>` | `~/.agent-harness/checkpoints/<run_id>:<agent_id>.json` |
648
+ | Orchestrated (`run`, `run_stream`) | `<run_id>` (orchestrator) + `<run_id>:<agent_id>` (each agent) | one file per agent, one file for the orchestrator |
649
+
650
+ The orchestrator checkpoint stores the goal, the full plan, completed task
651
+ results, and the replan count. It is updated after each parallel batch
652
+ completes and deleted on clean `DONE`.
653
+
617
654
  ### Crash / Ctrl-C resume
618
655
 
619
- The run checkpoint (step number + full `WorkingMemory`) is written to Redis
620
- before every approval prompt. The banner prints the exact command to resume:
656
+ The checkpoint (step number + full `WorkingMemory`) is written before every
657
+ HITL prompt and, when `checkpoint_every > 0`, at the start of every `checkpoint_every`-th step.
658
+
659
+ **What the banner prints:**
621
660
 
661
+ - **Single-agent run**: `--resume <run_id>:<agent_id>` — restores just that agent.
662
+ - **Orchestrated run**: `--resume <run_id>` — restores the full orchestration.
663
+
664
+ ```
665
+ Run interrupted — checkpoint saved.
666
+ Resume: python my_script.py --resume 3f7a1b2c-...
622
667
  ```
623
- Ctrl-C to pause. Resume: python my_script.py --resume 3f7a1b2c-...
668
+
669
+ **Auto-resume — no script changes required.** When `checkpoint_store` is
670
+ configured, `dispatch_stream` and `run_stream` detect `--resume <key>` in
671
+ `sys.argv` automatically. Your existing script resumes transparently:
672
+
673
+ ```bash
674
+ python my_script.py --resume 3f7a1b2c-...
624
675
  ```
625
676
 
626
- Add one line to your script to handle it:
677
+ The runtime detects the flag, loads the checkpoint, and streams events
678
+ identically to a fresh run. Scripts need zero resume-specific code.
679
+
680
+ For **explicit control** — streaming resume or blocking resume:
627
681
 
628
682
  ```python
629
- from harness.hitl import maybe_resume
683
+ # streaming (same event sequence as the original run)
684
+ async for event in runtime.resume_stream("3f7a1b2c-..."):
685
+ ...
630
686
 
631
- result = await maybe_resume(runtime) or await runtime.run_agent("agent", "task")
687
+ # blocking
688
+ result = await runtime.resume("3f7a1b2c-...:file_agent") # single-agent
689
+ result = await runtime.resume("3f7a1b2c-...") # orchestrated
632
690
  ```
633
691
 
634
- `maybe_resume` checks `sys.argv` for `--resume <run_id>`. If present it
635
- restores the checkpoint from Redis, re-displays the approval banner, and
636
- continues the run. If absent it returns `None` so the normal path runs.
637
- Re-running the same script with `--resume` is all the human needs to do.
692
+ Both `resume_stream` and `resume` auto-detect the checkpoint type (agent vs
693
+ orchestrator) from the stored data and call the right path.
694
+
695
+ If you need the resume key from `sys.argv` directly:
696
+
697
+ ```python
698
+ from harness.checkpoint import maybe_resume_key
699
+
700
+ key = maybe_resume_key() # returns None if --resume is absent
701
+ ```
638
702
 
639
- Checkpoints expire after 24 hours (configurable via `RedisApprovalStore(ttl_seconds=...)`).
703
+ **Orchestrated resume** skips completed tasks (injects their stored results
704
+ directly into the synthesis step) and re-runs only the tasks that had not yet
705
+ finished. If an individual agent's HITL checkpoint is still on disk, that agent
706
+ is resumed at its saved step rather than re-run from scratch.
640
707
 
641
708
  ### Correction steering and replanning
642
709
 
@@ -646,9 +713,10 @@ When the human types a correction instead of y/n:
646
713
  `WorkingMemory`. The LLM sees it on the next think step and self-corrects
647
714
  without replanning. Suitable for redirecting tool choice or adjusting
648
715
  parameters.
649
- - **Multi-agent orchestrated run**: if the correction implies a different agent
650
- or task structure, call `resume_agent` with updated context instead, which
651
- re-enters the orchestrator.
716
+ - **Orchestrated run**: the correction steers only the current agent. Because
717
+ the orchestrator checkpoint records task results as they complete, a full
718
+ `runtime.resume(run_id)` after the agent finishes will continue the remaining
719
+ tasks with correct upstream context.
652
720
 
653
- The `annotation_store` and `approval_store` are independent — both can be wired
654
- simultaneously for RLHF data collection with HITL review.
721
+ The `annotation_store` and `checkpoint_store` are independent — both can be
722
+ wired simultaneously for RLHF data collection with HITL review.
@@ -35,6 +35,7 @@ from dataclasses import dataclass
35
35
  from datetime import datetime, timezone
36
36
  from typing import Any, Final
37
37
 
38
+ from harness.checkpoint import _ResumeHint
38
39
  from harness.events import BusEvent, EventType
39
40
  from harness.utils import fire
40
41
  from memory.manager import MemoryManager
@@ -61,6 +62,7 @@ class AgentConfig:
61
62
  confidence_from_llm: bool = True # if False, confidence=1.0 on success
62
63
  working_memory_max_tokens: int = 8000 # WorkingMemory eviction threshold; tune per agent
63
64
  hitl_tools: list[str] = None # tools requiring human approval; None = no HITL
65
+ checkpoint_every: int = 0 # write a resumable checkpoint every N steps; 0 = disabled
64
66
 
65
67
  def __post_init__(self):
66
68
  if self.hitl_tools is None:
@@ -129,7 +131,7 @@ class BaseAgent:
129
131
  tracer,
130
132
  guard,
131
133
  llm,
132
- approval_store: Any | None = None, # RedisApprovalStore enables HITL + resume
134
+ checkpoint_store: Any | None = None, # FileCheckpointStore / RedisCheckpointStore
133
135
  ) -> None:
134
136
  self.config = config
135
137
  self.role = config.role # exposed for orchestrator planner prompt
@@ -138,10 +140,14 @@ class BaseAgent:
138
140
  self._tracer = tracer
139
141
  self._guard = guard
140
142
  self._llm = llm
141
- self._approval_store = approval_store
143
+ self._checkpoint_store = checkpoint_store
142
144
  self._working_memory: WorkingMemory | None = None
143
145
  self._task: str = ""
144
146
  self._last_think_error: str | None = None
147
+ self._ckp_id: str = "" # f"{run_id}:{agent_id}" — unique per agent per run
148
+ self._resume_key: str = (
149
+ "" # key printed in --resume banner; set by orchestrator to outer run_id
150
+ )
145
151
 
146
152
  # ── Streaming entry point (canonical) ─────────────────────────────────────
147
153
 
@@ -151,6 +157,9 @@ class BaseAgent:
151
157
  run_id: str | None = None,
152
158
  ) -> AsyncGenerator[BusEvent, None]:
153
159
  run_id = run_id or str(uuid.uuid4())
160
+ self._ckp_id = f"{run_id}:{self.config.agent_id}"
161
+ if not self._resume_key:
162
+ self._resume_key = self._ckp_id
154
163
  self._task = task
155
164
  self._working_memory = WorkingMemory(
156
165
  llm=self._llm,
@@ -161,8 +170,17 @@ class BaseAgent:
161
170
  await self._working_memory.append("system", system, pinned=True)
162
171
  await self._working_memory.append("user", task)
163
172
 
164
- async for event in self._run_stream_internal(run_id):
165
- yield event
173
+ async with _ResumeHint(
174
+ self._resume_key,
175
+ self._checkpoint_store,
176
+ f"Agent {self.config.agent_id}",
177
+ check_key=self._ckp_id,
178
+ ) as hint:
179
+ async for event in self._run_stream_internal(run_id):
180
+ if event.type == EventType.TASK_DONE:
181
+ await self._clear_checkpoint(run_id)
182
+ hint.done = True
183
+ yield event
166
184
 
167
185
  async def _resume_stream(
168
186
  self,
@@ -177,13 +195,25 @@ class BaseAgent:
177
195
  The approval prompt is shown again; once the human responds the
178
196
  tool runs (or the correction is injected) before the loop continues.
179
197
  """
198
+ self._ckp_id = f"{run_id}:{self.config.agent_id}"
199
+ if not self._resume_key:
200
+ self._resume_key = self._ckp_id
180
201
  if pending:
181
202
  async for event in self._replay_pending_step(run_id, pending):
182
203
  yield event
183
204
  start_step = pending["step"] + 1
184
205
 
185
- async for event in self._run_stream_internal(run_id, start_step=start_step):
186
- yield event
206
+ async with _ResumeHint(
207
+ self._resume_key,
208
+ self._checkpoint_store,
209
+ f"Agent {self.config.agent_id}",
210
+ check_key=self._ckp_id,
211
+ ) as hint:
212
+ async for event in self._run_stream_internal(run_id, start_step=start_step):
213
+ if event.type == EventType.TASK_DONE:
214
+ await self._clear_checkpoint(run_id)
215
+ hint.done = True
216
+ yield event
187
217
 
188
218
  async def _run_stream_internal(
189
219
  self,
@@ -246,11 +276,31 @@ class BaseAgent:
246
276
 
247
277
  # ── ReAct Loop (stream) ───────────────────────────────────────────────────
248
278
 
279
+ async def _write_step_checkpoint(self, run_id: str, step: int) -> None:
280
+ if self._checkpoint_store is None:
281
+ return
282
+ await self._checkpoint_store.write(
283
+ self._ckp_id,
284
+ {
285
+ "run_id": run_id,
286
+ "agent_id": self.config.agent_id,
287
+ "task": self._task,
288
+ "step": step,
289
+ "memory": self._working_memory.to_dict(),
290
+ },
291
+ )
292
+
249
293
  async def _react_stream(
250
294
  self, run_id: str, start_step: int = 0
251
295
  ) -> AsyncGenerator[BusEvent, None]:
252
296
  for step in range(start_step, self.config.max_steps):
253
297
  self._guard.check()
298
+ if (
299
+ self._checkpoint_store is not None
300
+ and self.config.checkpoint_every > 0
301
+ and step % self.config.checkpoint_every == 0
302
+ ):
303
+ await self._write_step_checkpoint(run_id, step)
254
304
 
255
305
  # Think — yields TOKEN events when the LLM client supports streaming.
256
306
  response = None
@@ -337,7 +387,9 @@ class BaseAgent:
337
387
  if approval is None or approval.approved:
338
388
  approved.append(act)
339
389
  elif approval.correction:
340
- await self._inject_human_guidance(response, approval.correction, run_id)
390
+ await self._inject_human_guidance(
391
+ response, approval.correction, run_id, step
392
+ )
341
393
  correction_injected = True
342
394
  break
343
395
  # else: rejected — drop from batch silently
@@ -366,7 +418,7 @@ class BaseAgent:
366
418
  for act in parallel_actions
367
419
  ]
368
420
  )
369
- await self._clear_checkpoint(run_id)
421
+ await self._commit_checkpoint(run_id, step)
370
422
 
371
423
  combined: list[dict] = []
372
424
  for i, (act, obs) in enumerate(zip(parallel_actions, observations, strict=False)):
@@ -611,14 +663,17 @@ class BaseAgent:
611
663
  Returns ApprovalResponse if the tool is gated, None if not.
612
664
  Writes a crash-resumable checkpoint to the store before blocking on stdin.
613
665
  """
614
- if not (self._approval_store and tool_name in self.config.hitl_tools):
666
+ if not (self._checkpoint_store and tool_name in self.config.hitl_tools):
615
667
  return None
616
668
 
617
- from harness.hitl import ApprovalRequest, request_approval
669
+ from harness.hitl import ApprovalRequest, is_session_allowed, request_approval
670
+
671
+ if is_session_allowed(tool_name, tool_args):
672
+ return None # fast-path: human already allowed this tool/prefix for session
618
673
 
619
674
  approval_id = str(uuid.uuid4())
620
- await self._approval_store.write_checkpoint(
621
- run_id,
675
+ await self._checkpoint_store.write(
676
+ self._ckp_id,
622
677
  {
623
678
  "run_id": run_id,
624
679
  "agent_id": self.config.agent_id,
@@ -637,14 +692,13 @@ class BaseAgent:
637
692
  return await request_approval(
638
693
  ApprovalRequest(
639
694
  approval_id=approval_id,
640
- run_id=run_id,
695
+ run_id=self._resume_key, # standalone: ckp_id; orchestrated: outer run_id
641
696
  agent_id=self.config.agent_id,
642
697
  tool=tool_name,
643
698
  args=tool_args,
644
699
  step=step,
645
700
  timestamp=datetime.now(timezone.utc).isoformat(),
646
701
  ),
647
- self._approval_store,
648
702
  self._guard,
649
703
  )
650
704
 
@@ -666,24 +720,49 @@ class BaseAgent:
666
720
  approval = await self._gate_tool(run_id, step, tool_name, tool_args, response)
667
721
  if approval is not None:
668
722
  if approval.correction:
669
- await self._inject_human_guidance(response, approval.correction, run_id)
723
+ await self._inject_human_guidance(response, approval.correction, run_id, step)
670
724
  return _HITL_CORRECTION
671
725
  if not approval.approved:
672
- await self._clear_checkpoint(run_id)
726
+ await self._commit_checkpoint(run_id, step)
673
727
  return f"Tool rejected by human: {approval.correction or 'no reason given'}"
674
728
  obs = await self._execute_tool(tool_name, tool_args)
675
- await self._clear_checkpoint(run_id)
729
+ if approval is not None:
730
+ # HITL was involved — overwrite pending checkpoint with clean state.
731
+ # Non-HITL tools leave the step checkpoint intact for the next iteration.
732
+ await self._commit_checkpoint(run_id, step)
676
733
  return obs
677
734
 
678
- async def _inject_human_guidance(self, response: dict, correction: str, run_id: str) -> None:
679
- """Append human correction to WorkingMemory and clear the checkpoint."""
735
+ async def _inject_human_guidance(
736
+ self, response: dict, correction: str, run_id: str, step: int
737
+ ) -> None:
738
+ """Append human correction to WorkingMemory and commit a clean checkpoint."""
680
739
  await self._working_memory.append("assistant", json.dumps(response))
681
740
  await self._working_memory.append("user", f"Human guidance: {correction}")
682
- await self._clear_checkpoint(run_id)
741
+ await self._commit_checkpoint(run_id, step)
742
+
743
+ async def _commit_checkpoint(self, run_id: str, step: int) -> None:
744
+ """Overwrite checkpoint with current state (no pending field).
745
+
746
+ Called after HITL resolves or a tool completes so the stored state
747
+ always reflects reality — no stale 'pending' approval marker, and
748
+ the step position is preserved for crash-resume.
749
+ """
750
+ if self._checkpoint_store is None:
751
+ return
752
+ await self._checkpoint_store.write(
753
+ self._ckp_id,
754
+ {
755
+ "run_id": run_id,
756
+ "agent_id": self.config.agent_id,
757
+ "task": self._task,
758
+ "step": step,
759
+ "memory": self._working_memory.to_dict(),
760
+ },
761
+ )
683
762
 
684
763
  async def _clear_checkpoint(self, run_id: str) -> None:
685
- if self._approval_store:
686
- await self._approval_store.clear_checkpoint(run_id)
764
+ if self._checkpoint_store:
765
+ await self._checkpoint_store.delete(self._ckp_id)
687
766
 
688
767
  async def _replay_pending_step(
689
768
  self,
@@ -701,19 +780,18 @@ class BaseAgent:
701
780
  approval = await request_approval(
702
781
  ApprovalRequest(
703
782
  approval_id=pending["approval_id"],
704
- run_id=run_id,
783
+ run_id=self._resume_key, # standalone: ckp_id; orchestrated: outer run_id
705
784
  agent_id=self.config.agent_id,
706
785
  tool=tool_name,
707
786
  args=tool_args,
708
787
  step=step,
709
788
  timestamp=datetime.now(timezone.utc).isoformat(),
710
789
  ),
711
- self._approval_store,
712
790
  self._guard,
713
791
  )
714
792
 
715
793
  if approval.correction:
716
- await self._inject_human_guidance(llm_response, approval.correction, run_id)
794
+ await self._inject_human_guidance(llm_response, approval.correction, run_id, step)
717
795
  return
718
796
 
719
797
  observation = (
@@ -740,7 +818,7 @@ class BaseAgent:
740
818
  else observation
741
819
  )
742
820
  await self._working_memory.append("user", f"Observation: {obs_text}")
743
- await self._clear_checkpoint(run_id)
821
+ await self._commit_checkpoint(run_id, step)
744
822
 
745
823
 
746
824
  # ── Response normalization (module-level for testability) ────────────────────
@@ -0,0 +1,165 @@
1
+ """
2
+ harness/checkpoint.py — Pluggable checkpoint store for run state persistence.
3
+
4
+ Used by both the HITL approval gate (crash-resume) and periodic step
5
+ checkpointing (checkpoint_every on AgentConfig).
6
+
7
+ A checkpoint is a plain dict written under a run_id key. The schema is
8
+ defined by the caller — agents write {run_id, agent_id, task, step, memory}
9
+ with an optional {pending: ...} field added by the HITL gate.
10
+
11
+ Two backends ship out of the box:
12
+
13
+ FileCheckpointStore — zero deps, one JSON file per checkpoint key
14
+ RedisCheckpointStore — for distributed / multi-process setups
15
+
16
+ Both share the same three-method interface so callers are backend-agnostic.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import sys
24
+ import time
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+
29
def maybe_resume_key() -> str | None:
    """
    Extract the resume key from sys.argv, or return None.

    Both spellings are accepted:

        --resume <key>
        --resume=<key>

    Called automatically by AgentRuntime.dispatch_stream / run_stream so that
    scripts resume transparently without any resume-specific code. Also
    available for scripts that need the key explicitly.

    Returns:
        The key following the first resume flag, or None when no resume flag
        is present on the command line.

    Raises:
        SystemExit: exit status 1 (after printing a usage line to stderr) when
            the flag is present but no key follows it.
    """
    args = sys.argv[1:]
    for idx, arg in enumerate(args):
        if arg == "--resume":
            # Two-token form: the key is the next argument.
            if idx + 1 < len(args):
                return args[idx + 1]
            break
        if arg.startswith("--resume="):
            # Single-token form: everything after the first '=' is the key.
            key = arg.partition("=")[2]
            if key:
                return key
            break
    else:
        # Loop finished without seeing any resume flag.
        return None

    # A resume flag was found but carried no key.
    print("Usage: --resume <ckp_id>", file=sys.stderr)
    sys.exit(1)
+
46
+
47
class FileCheckpointStore:
    """
    Zero-dependency checkpoint store backed by JSON files.

    Default directory: ~/.agent-harness/checkpoints/
    Override with the CHECKPOINT_DIR env var or by passing checkpoint_dir.

    Each checkpoint key maps to ``<dir>/<key>.json``. Writes are atomic
    (temp file + rename) so a crash in the middle of a write never leaves a
    truncated checkpoint behind — the previous checkpoint stays readable.
    """

    def __init__(self, checkpoint_dir: str | Path | None = None) -> None:
        """Resolve the checkpoint directory and create it if it is missing."""
        self._dir = Path(
            checkpoint_dir
            or os.environ.get("CHECKPOINT_DIR", Path.home() / ".agent-harness" / "checkpoints")
        )
        self._dir.mkdir(parents=True, exist_ok=True)

    def _path(self, run_id: str) -> Path:
        """Return the JSON file path for the given checkpoint key."""
        return self._dir / f"{run_id}.json"

    async def write(self, run_id: str, data: dict) -> None:
        """Persist `data` under `run_id`, replacing any existing checkpoint."""
        path = self._path(run_id)
        # Serialize first so a serialization error cannot touch the disk.
        payload = json.dumps(data, default=str, indent=2)
        # The temp file lives next to the target so os.replace stays on one
        # filesystem; the pid suffix keeps concurrent processes from sharing it.
        tmp = path.parent / f"{path.name}.{os.getpid()}.tmp"
        try:
            tmp.write_text(payload, encoding="utf-8")
            os.replace(tmp, path)
        finally:
            # No-op after a successful replace; cleans up after a failure.
            tmp.unlink(missing_ok=True)

    async def read(self, run_id: str) -> dict | None:
        """Return the checkpoint stored under `run_id`, or None if absent."""
        try:
            text = self._path(run_id).read_text(encoding="utf-8")
        except FileNotFoundError:
            return None
        return json.loads(text)

    async def delete(self, run_id: str) -> None:
        """Remove the checkpoint for `run_id`; a missing file is not an error."""
        self._path(run_id).unlink(missing_ok=True)

    @classmethod
    def purge_old(cls, days: int = 7, checkpoint_dir: str | Path | None = None) -> int:
        """Delete checkpoint files older than `days`. Returns count removed."""
        store = cls(checkpoint_dir)
        cutoff = time.time() - days * 86_400
        removed = 0
        for p in store._dir.glob("*.json"):
            try:
                if p.stat().st_mtime < cutoff:
                    p.unlink()
                    removed += 1
            except FileNotFoundError:
                # Another process deleted the file between glob and stat/unlink.
                continue
        return removed
90
+
91
+
92
class RedisCheckpointStore:
    """
    Redis-backed checkpoint store.

    Every checkpoint is stored as a JSON string under the key ``ckp:<run_id>``
    and is written with an expiry of ttl_seconds (default 24 h).

    Usage:
        import redis.asyncio as redis
        client = redis.Redis(host="localhost", decode_responses=True)
        store = RedisCheckpointStore(client)
    """

    _KEY = "ckp:{}"

    def __init__(self, client: Any, ttl_seconds: int = 86_400) -> None:
        """Keep the async Redis client and the expiry applied on every write."""
        self._r = client
        self._ttl = ttl_seconds

    async def write(self, run_id: str, data: dict) -> None:
        """Serialize `data` to JSON and store it under the namespaced key."""
        redis_key = self._KEY.format(run_id)
        payload = json.dumps(data, default=str)
        await self._r.set(redis_key, payload, ex=self._ttl)

    async def read(self, run_id: str) -> dict | None:
        """Return the decoded checkpoint, or None when the key holds nothing."""
        redis_key = self._KEY.format(run_id)
        raw = await self._r.get(redis_key)
        if not raw:
            return None
        return json.loads(raw)

    async def delete(self, run_id: str) -> None:
        """Remove the checkpoint stored under the namespaced key."""
        redis_key = self._KEY.format(run_id)
        await self._r.delete(redis_key)
119
+
120
+
121
class _ResumeHint:
    """
    Async context manager that prints a --resume hint to stderr on interruption,
    but only when a checkpoint actually exists in the store.

    Two keys:
        check_key  — where the checkpoint lives in the store (verified on exit).
                     Defaults to resume_key when not supplied.
        resume_key — printed in the hint; what the human passes to --resume.

    For orchestrated agents these differ: check_key is the namespaced agent key
    (f"{run_id}:{agent_id}") while resume_key is the bare orchestrator run_id.

    Set ``hint.done = True`` before leaving the managed block to suppress the
    message on clean success.

    The hint is best-effort: exceptions from the managed block are never
    suppressed, and a failing store lookup on exit never replaces them.
    """

    def __init__(
        self,
        resume_key: str,
        checkpoint_store: Any,
        label: str = "Run",
        *,
        check_key: str | None = None,
    ) -> None:
        """
        Args:
            resume_key: key shown in the hint; an empty string disables the hint.
            checkpoint_store: object with an async ``read(key)``; None disables
                the hint.
            label: prefix of the printed message (e.g. "Agent file_agent").
            check_key: store key probed on exit; defaults to resume_key.
        """
        self._resume_key = resume_key
        self._check_key = check_key if check_key is not None else resume_key
        self._store = checkpoint_store
        self._label = label
        self.done: bool = False

    async def __aenter__(self) -> _ResumeHint:
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Print the resume hint if the run did not finish; never suppress."""
        if self.done or self._store is None or not self._resume_key:
            return False
        try:
            checkpoint = await self._store.read(self._check_key)
        except Exception:
            # The hint is purely informational. If the store is unreachable
            # while we are unwinding, keep the original exception (if any)
            # propagating instead of replacing it with the store's error.
            return False
        if checkpoint is not None:
            script = sys.argv[0] if sys.argv else "your_script.py"
            print(
                f"\n  {self._label} interrupted — checkpoint saved."
                f"\n  Resume: python {script} --resume {self._resume_key}\n",
                file=sys.stderr,
            )
        return False