react-agent-harness 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agents/base.py ADDED
@@ -0,0 +1,788 @@
1
+ """
2
+ BaseAgent — generic ReAct loop agent. Streaming-primary.
3
+
4
+ Every agent is an instance of BaseAgent configured via AgentConfig.
5
+ No subclassing needed for new domains — just register a new AgentConfig
6
+ with different role, system_prompt, and allowed_tools.
7
+
8
+ Execution model:
9
+ - run_stream(task) is the canonical method — yields BusEvents for each
10
+ THOUGHT, TOKEN (when the LLM client streams), ACTION, OBSERVATION,
11
+ and finally TASK_DONE with the result payload.
12
+ - run(task) is a thin drain: collects the stream and returns the final dict.
13
+ Use it when you don't need real-time events.
14
+
15
+ Memory integration:
16
+ - build_context() injected into system prompt at run start
17
+ - write_working_fact() called after each tool observation
18
+ - run-end write handled by Orchestrator, not BaseAgent
19
+
20
+ Token management:
21
+ - WorkingMemory handles eviction via LLM summarization
22
+ - max budget is configured per-agent via AgentConfig.working_memory_max_tokens
23
+ - count_tokens defaults to chars/4; pass a custom counter to WorkingMemory
24
+ if you need exact (e.g. tiktoken) counts.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import asyncio
30
+ import json
31
+ import logging
32
+ import uuid
33
+ from collections.abc import AsyncGenerator
34
+ from dataclasses import dataclass
35
+ from datetime import datetime, timezone
36
+ from typing import Any, Final
37
+
38
+ from harness.events import BusEvent, EventType
39
+ from harness.utils import fire
40
+ from memory.manager import MemoryManager
41
+ from memory.working import WorkingMemory
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
# Sentinel returned by _run_tool_gated when a human injects a correction.
# It is a unique object() compared by identity (`is`), so it cannot be
# confused with a real tool observation.
# Caller must `continue` the ReAct loop — WM is already updated.
_HITL_CORRECTION: Final = object()
48
+
49
+
50
+ # ── Agent Config ──────────────────────────────────────────────────────────────
51
+
52
+
53
@dataclass
class AgentConfig:
    """
    Declarative configuration for one agent.

    Only ``hitl_tools`` is normalised after construction; every other field
    is stored exactly as given.
    """

    agent_id: str
    role: str  # plain English — used by planner for agent selection
    system_prompt: str
    allowed_tools: list[str]  # tool names from ToolRegistry
    max_steps: int = 10
    memory_context_enabled: bool = True
    confidence_from_llm: bool = True  # if False, confidence=1.0 on success
    working_memory_max_tokens: int = 8000  # WorkingMemory eviction threshold; tune per agent
    # Tools requiring human approval. None (the default) means "no HITL" and is
    # replaced by an empty list in __post_init__, so readers never observe None.
    # The annotation admits None explicitly instead of relying on an implicit
    # Optional, which type checkers reject.
    hitl_tools: list[str] | None = None

    def __post_init__(self) -> None:
        """Give each instance its own empty ``hitl_tools`` list when none was supplied."""
        if self.hitl_tools is None:
            self.hitl_tools = []
68
+
69
+
70
+ # ── ReAct Response Schema ─────────────────────────────────────────────────────
71
+
72
+ # Injected into every agent's system prompt so LLM knows the expected format.
73
+ REACT_FORMAT = """
74
+ At each step, respond with a JSON object in one of three forms:
75
+
76
+ To use a single tool:
77
+ {
78
+ "thought": "<reasoning about what to do next>",
79
+ "action": "<tool_name>",
80
+ "args": { "<arg>": "<value>", ... }
81
+ }
82
+
83
+ To use multiple independent tools at once (they run in parallel — use this when \
84
+ the calls don't depend on each other):
85
+ {
86
+ "thought": "<reasoning>",
87
+ "actions": [
88
+ {"tool": "<tool_name>", "args": { "<arg>": "<value>", ... }},
89
+ {"tool": "<tool_name_2>", "args": { "<arg>": "<value>", ... }}
90
+ ]
91
+ }
92
+
93
+ To finish:
94
+ {
95
+ "thought": "<final reasoning>",
96
+ "action": "finish",
97
+ "answer": "<comprehensive answer to the task>",
98
+ "confidence": <0.0-1.0>
99
+ }
100
+
101
+ Available tools: __TOOL_LIST__
102
+ Return JSON only — no markdown, no preamble.
103
+ """
104
+
105
+
106
+ # ── Base Agent ────────────────────────────────────────────────────────────────
107
+
108
+
109
+ class BaseAgent:
110
+ """
111
+ Generic ReAct agent. Configured entirely via AgentConfig + ToolRegistry.
112
+
113
+ To create a new specialist agent:
114
+ config = AgentConfig(
115
+ agent_id="my_agent",
116
+ role="does X using tools Y and Z",
117
+ system_prompt="You are an expert at X...",
118
+ allowed_tools=["tool_y", "tool_z"],
119
+ )
120
+ registry.register(config)
121
+ No subclassing needed.
122
+ """
123
+
124
def __init__(
    self,
    config: AgentConfig,
    tools: dict[str, Any],  # name → Tool instance
    memory: MemoryManager,
    tracer,
    guard,
    llm,
    approval_store: Any | None = None,  # RedisApprovalStore — enables HITL + resume
) -> None:
    """
    Store the agent's configuration and collaborators.

    Only attribute assignment happens here; WorkingMemory is created later,
    when a run starts.
    """
    # Static configuration.
    self.config = config
    self.role = config.role  # exposed for orchestrator planner prompt

    # Injected collaborators.
    self._llm = llm
    self._guard = guard
    self._tracer = tracer
    self._memory = memory
    self._tools = tools
    self._approval_store = approval_store

    # Per-run state, populated once a run is underway.
    self._task: str = ""
    self._working_memory: WorkingMemory | None = None
    self._last_think_error: str | None = None
145
+
146
+ # ── Streaming entry point (canonical) ─────────────────────────────────────
147
+
148
async def run_stream(
    self,
    task: str,
    run_id: str | None = None,
) -> AsyncGenerator[BusEvent, None]:
    """
    Start a fresh run for ``task`` and stream its events.

    A new WorkingMemory is created on every call and seeded with the pinned
    system prompt followed by the task as the first user message. A random
    UUID is generated when ``run_id`` is falsy.
    """
    if not run_id:
        run_id = str(uuid.uuid4())
    self._task = task

    # Fresh conversation buffer per run, sized from the agent's config.
    working_memory = WorkingMemory(
        llm=self._llm,
        max_tokens=self.config.working_memory_max_tokens,
    )
    self._working_memory = working_memory

    system_prompt = await self._build_system_prompt(task)
    await working_memory.append("system", system_prompt, pinned=True)
    await working_memory.append("user", task)

    async for bus_event in self._run_stream_internal(run_id):
        yield bus_event
166
+
167
async def _resume_stream(
    self,
    run_id: str,
    start_step: int,
    pending: dict | None = None,
) -> AsyncGenerator[BusEvent, None]:
    """
    Continue the ReAct loop from a checkpoint.

    When ``pending`` is given, the previous run stopped while waiting for a
    human decision. That step is replayed first (the approval prompt is
    shown again and the tool runs or the correction is injected), and the
    loop then resumes at the step after it, overriding ``start_step``.
    """
    next_step = start_step
    if pending:
        async for replayed in self._replay_pending_step(run_id, pending):
            yield replayed
        next_step = pending["step"] + 1

    async for bus_event in self._run_stream_internal(run_id, start_step=next_step):
        yield bus_event
187
+
188
async def _run_stream_internal(
    self,
    run_id: str,
    start_step: int = 0,
) -> AsyncGenerator[BusEvent, None]:
    """
    Run the ReAct loop behind a crash boundary.

    Any exception escaping ``_react_stream`` is logged and converted into a
    single ERROR event. Whether the loop finished, failed, or was abandoned,
    the trajectory is traced on the way out if a WorkingMemory exists.
    """
    try:
        async for bus_event in self._react_stream(run_id, start_step=start_step):
            yield bus_event
    except Exception as exc:
        logger.exception("Agent %s stream crashed", self.config.agent_id)
        crash_event = BusEvent(
            type=EventType.ERROR,
            agent_id=self.config.agent_id,
            error=str(exc),
        )
        yield crash_event
    finally:
        working_memory = self._working_memory
        if working_memory is not None:
            trajectory = {
                "run_id": run_id,
                "messages": working_memory.get_messages(),
                "summarization_count": working_memory.summarization_count,
            }
            self._tracer.log("trajectory", self.config.agent_id, trajectory)
214
+
215
+ # ── Blocking entry point (thin drain) ─────────────────────────────────────
216
+
217
async def run(self, task: str, run_id: str | None = None) -> dict:
    """
    Drain ``run_stream`` and return the final result dict.

    The payload of TASK_DONE is returned on success. On ERROR an error
    result is built instead, using the event's ``steps`` payload when
    present and otherwise a count derived from the ACTION events seen.
    """
    outcome: dict = {}
    steps_seen = 0  # derived from ACTION events so ERROR can report meaningful steps
    async for bus_event in self.run_stream(task=task, run_id=run_id):
        if bus_event.type == EventType.TASK_DONE:
            outcome = bus_event.payload
            continue
        if bus_event.type == EventType.ACTION:
            # Payload steps are 0-based; convert to a count.
            steps_seen = bus_event.payload.get("step", steps_seen) + 1
            continue
        if bus_event.type == EventType.ERROR:
            if bus_event.payload:
                steps = bus_event.payload.get("steps", steps_seen)
            else:
                steps = steps_seen
            outcome = self._error_result(bus_event.error, steps=steps)
    return outcome
229
+
230
+ # ── System Prompt ─────────────────────────────────────────────────────────
231
+
232
async def _build_system_prompt(self, task: str) -> str:
    """
    Assemble the system prompt for a run.

    Sections, joined by blank lines: the configured system prompt, the
    rendered memory context (only when enabled and non-empty), and the
    ReAct format instructions with the tool names filled in.
    """
    sections = [self.config.system_prompt]

    if self.config.memory_context_enabled:
        recalled = await self._memory.build_context(
            goal=task,
            agent_id=self.config.agent_id,
        )
        if not recalled.is_empty():
            sections.append(recalled.render())

    tool_names = ", ".join(self._tools.keys())
    if not tool_names:
        tool_names = "none"
    sections.append(REACT_FORMAT.replace("__TOOL_LIST__", tool_names))
    return "\n\n".join(sections)
246
+
247
+ # ── ReAct Loop (stream) ───────────────────────────────────────────────────
248
+
249
async def _react_stream(
    self, run_id: str, start_step: int = 0
) -> AsyncGenerator[BusEvent, None]:
    """
    Drive the think → act → observe loop from ``start_step`` up to
    ``config.max_steps``.

    Each iteration calls ``guard.check()``, asks the LLM for the next
    response, and then either finishes, runs the batch of tools listed under
    ``actions``, or runs the single tool named by ``action``. Both tool paths
    append the assistant response and the resulting observation(s) to
    WorkingMemory before the next iteration.

    Yields:
        TOKEN and THOUGHT events from ``_think_stream``, ACTION and
        OBSERVATION events for tool calls, and — unless an exception
        escapes — one terminal event: TASK_DONE on "finish", or ERROR when
        the think step produced no response or the step budget ran out.
    """
    for step in range(start_step, self.config.max_steps):
        self._guard.check()

        # Think — yields TOKEN events when the LLM client supports streaming.
        response = None
        async for thought_event in self._think_stream():
            if thought_event.type == EventType.TOKEN:
                yield thought_event
            elif thought_event.type == EventType.THOUGHT:
                response = thought_event.payload.get("response")
                yield thought_event

        # Think step failed: report the reason recorded by _think_stream
        # (or a generic message when none was recorded) and stop the run.
        if response is None:
            reason = self._last_think_error or "LLM returned unparseable response"
            self._tracer.log(
                "task_result",
                self.config.agent_id,
                {"answer": "", "confidence": 0.0, "steps": step, "error": reason},
            )
            yield BusEvent(
                type=EventType.ERROR,
                agent_id=self.config.agent_id,
                error=reason,
            )
            return

        self._tracer.log(
            "thought",
            self.config.agent_id,
            {
                "step": step,
                "thought": response.get("thought", ""),
                "action": response.get("action"),
            },
        )

        # Finish?
        if response.get("action") == "finish":
            await self._working_memory.append("assistant", json.dumps(response))
            result = {
                "agent_id": self.config.agent_id,
                "answer": response.get("answer", ""),
                # confidence_from_llm=False pins confidence to 1.0; a missing
                # LLM value also defaults to 1.0.
                "confidence": (
                    response.get("confidence", 1.0) if self.config.confidence_from_llm else 1.0
                ),
                "steps": step + 1,  # step is 0-based; report a count
                "metadata": {
                    "summarizations": self._working_memory.summarization_count,
                },
            }
            logger.info(
                "Agent %s completed: steps=%d confidence=%.2f summarizations=%d",
                self.config.agent_id,
                result["steps"],
                result["confidence"],
                self._working_memory.summarization_count,
            )
            self._tracer.log(
                "task_result",
                self.config.agent_id,
                {
                    "answer": result["answer"],
                    "confidence": result["confidence"],
                    "steps": result["steps"],
                    "error": "",
                },
            )
            yield BusEvent(
                type=EventType.TASK_DONE,
                agent_id=self.config.agent_id,
                payload=result,
            )
            return

        # Act — parallel or single
        parallel_actions = response.get("actions")
        if parallel_actions and isinstance(parallel_actions, list):
            # Gate each gated tool sequentially before fanning out.
            # Correction from any one tool aborts the whole batch.
            approved: list[dict] = []
            correction_injected = False
            for act in parallel_actions:
                approval = await self._gate_tool(
                    run_id, step, act.get("tool", ""), act.get("args", {}), response
                )
                # None means the tool is not gated at all.
                if approval is None or approval.approved:
                    approved.append(act)
                elif approval.correction:
                    await self._inject_human_guidance(response, approval.correction, run_id)
                    correction_injected = True
                    break
                # else: rejected — drop from batch silently

            # Nothing was executed; the next think step sees the human
            # guidance that was appended to WorkingMemory.
            if correction_injected:
                continue

            parallel_actions = approved

            # Emit ACTION events first so callers see what's being launched.
            for act in parallel_actions:
                yield BusEvent(
                    type=EventType.ACTION,
                    agent_id=self.config.agent_id,
                    payload={
                        "step": step,
                        "tool": act.get("tool", ""),
                        "args": act.get("args", {}),
                    },
                )

            # Fan out all approved tool calls concurrently.
            # _execute_tool returns error strings instead of raising, so one
            # failing tool does not abort the gather.
            observations = await asyncio.gather(
                *[
                    self._execute_tool(act.get("tool", ""), act.get("args", {}))
                    for act in parallel_actions
                ]
            )
            # Batch finished: drop any checkpoint written by _gate_tool.
            await self._clear_checkpoint(run_id)

            combined: list[dict] = []
            for i, (act, obs) in enumerate(zip(parallel_actions, observations, strict=False)):
                tool_name = act.get("tool", "")
                tool_args = act.get("args", {})
                # Short preview: a placeholder for images, otherwise the
                # first 500 characters of str(obs).
                obs_display = "[image]" if _is_image_block(obs) else str(obs)[:500]
                self._tracer.log(
                    "action",
                    self.config.agent_id,
                    {
                        "step": step,
                        "tool": tool_name,
                        "args": tool_args,
                        "observation": obs_display,
                    },
                )
                yield BusEvent(
                    type=EventType.OBSERVATION,
                    agent_id=self.config.agent_id,
                    payload={"step": step, "tool": tool_name, "observation": obs_display},
                )
                # NOTE: on this path the LLM also receives the truncated
                # preview, not the full observation.
                combined.append({"tool": tool_name, "result": obs_display})
                # Only non-empty, non-string, non-image results become working
                # facts. The write is handed to fire() rather than awaited —
                # presumably a background task; confirm in harness.utils.
                if obs and not isinstance(obs, str) and not _is_image_block(obs):
                    fire(
                        self._memory.write_working_fact(
                            run_id=run_id,
                            agent_id=self.config.agent_id,
                            key=f"step_{step}_{i}_{tool_name}",
                            value=obs,
                        )
                    )

            await self._working_memory.append("assistant", json.dumps(response))
            # Inject image observations as content blocks; text observations as a string.
            image_blocks = [
                (act.get("tool", ""), obs)
                for act, obs in zip(parallel_actions, observations, strict=False)
                if _is_image_block(obs)
            ]
            if image_blocks:
                content: list = [
                    {
                        "type": "text",
                        "text": f"Observations:\n{json.dumps(combined, default=str)}",
                    }
                ]
                for tool_name_img, img_block in image_blocks:
                    content.append({"type": "text", "text": f"\nImage from {tool_name_img}:"})
                    content.append(img_block)
                await self._working_memory.append("user", content)
            else:
                await self._working_memory.append(
                    "user",
                    f"Observations:\n{json.dumps(combined, default=str)}",
                )
        else:
            # Single action path.
            tool_name = response.get("action", "")
            tool_args = response.get("args", {})
            # ACTION is emitted before the HITL gate runs, so it is visible
            # even if the call is later rejected or corrected.
            yield BusEvent(
                type=EventType.ACTION,
                agent_id=self.config.agent_id,
                payload={"step": step, "tool": tool_name, "args": tool_args},
            )

            observation = await self._run_tool_gated(
                run_id, step, tool_name, tool_args, response
            )
            # Human correction: WorkingMemory already holds the guidance, so
            # skip the observation bookkeeping and think again.
            if observation is _HITL_CORRECTION:
                continue

            # Preview for events and traces only; WorkingMemory gets the
            # full observation below.
            obs_display = "[image]" if _is_image_block(observation) else str(observation)[:500]
            self._tracer.log(
                "action",
                self.config.agent_id,
                {
                    "step": step,
                    "tool": tool_name,
                    "args": tool_args,
                    "observation": obs_display,
                },
            )
            yield BusEvent(
                type=EventType.OBSERVATION,
                agent_id=self.config.agent_id,
                payload={
                    "step": step,
                    "tool": tool_name,
                    "observation": obs_display,
                },
            )

            # Same working-fact rule as the parallel path: skip empty results,
            # plain strings (which includes error messages) and images.
            if (
                observation
                and not isinstance(observation, str)
                and not _is_image_block(observation)
            ):
                fire(
                    self._memory.write_working_fact(
                        run_id=run_id,
                        agent_id=self.config.agent_id,
                        key=f"step_{step}_{tool_name}",
                        value=observation,
                    )
                )

            await self._working_memory.append("assistant", json.dumps(response))
            if _is_image_block(observation):
                await self._working_memory.append(
                    "user",
                    [
                        {"type": "text", "text": f"Observation ({tool_name}):"},
                        observation,
                    ],
                )
            else:
                # Strings pass through untouched; anything else is serialised
                # to JSON, with str() as the fallback for unknown types.
                obs_text = (
                    json.dumps(observation, default=str)
                    if not isinstance(observation, str)
                    else observation
                )
                await self._working_memory.append("user", f"Observation: {obs_text}")

    # Max steps exhausted.
    self._tracer.log(
        "task_result",
        self.config.agent_id,
        {
            "answer": "",
            "confidence": 0.0,
            "steps": self.config.max_steps,
            "error": f"Max steps ({self.config.max_steps}) reached",
        },
    )
    yield BusEvent(
        type=EventType.ERROR,
        agent_id=self.config.agent_id,
        error=f"Max steps ({self.config.max_steps}) reached",
        payload={"steps": self.config.max_steps},
    )
510
+
511
+ # ── Think ─────────────────────────────────────────────────────────────────
512
+
513
async def _think_stream(self) -> AsyncGenerator[BusEvent, None]:
    """
    Ask the LLM for the next step and emit it as a THOUGHT event.

    When the client exposes ``stream_complete``, every chunk is forwarded as
    a TOKEN event and the accumulated text is parsed afterwards; otherwise a
    single ``complete`` call is made. Failures never propagate: the THOUGHT
    payload then carries ``response=None`` and the reason is kept in
    ``_last_think_error`` (which is reset to None after a successful parse).
    """
    history = self._working_memory.get_messages()

    try:
        if not hasattr(self._llm, "stream_complete"):
            # Non-streaming client: one blocking completion.
            completion = await self._llm.complete(
                system=None,
                messages=history,
                response_format={"type": "json_object"},
            )
            parsed = _normalize_response(completion)
            if parsed is None:
                logger.warning(
                    "Agent %s got unparseable response: %r",
                    self.config.agent_id,
                    completion,
                )
                self._last_think_error = f"Unparseable response: {str(completion)[:300]}"
        else:
            # Streaming client: relay chunks as they arrive, parse at the end.
            streamed = ""
            async for chunk in self._llm.stream_complete(
                system=None,
                messages=history,
            ):
                streamed += chunk
                yield BusEvent(
                    type=EventType.TOKEN,
                    agent_id=self.config.agent_id,
                    token=chunk,
                )
            parsed = _parse_action_json(streamed)
            if parsed is None:
                snippet = streamed[:300]
                logger.warning(
                    "Agent %s stream got unparseable response: %r",
                    self.config.agent_id,
                    snippet,
                )
                self._last_think_error = f"Unparseable stream response: {snippet}"
    except Exception as exc:
        logger.error("Agent %s think failed: %s", self.config.agent_id, exc)
        parsed = None
        self._last_think_error = str(exc)
    else:
        if parsed is not None:
            self._last_think_error = None

    if parsed:
        thought_text = parsed.get("thought", "")
        chosen_action = parsed.get("action")
    else:
        thought_text = ""
        chosen_action = None

    yield BusEvent(
        type=EventType.THOUGHT,
        agent_id=self.config.agent_id,
        payload={
            "response": parsed,
            "thought": thought_text,
            "action": chosen_action,
        },
    )
574
+
575
+ # ── Tool Execution ────────────────────────────────────────────────────────
576
+
577
async def _execute_tool(self, name: str, args: dict) -> Any:
    """
    Run the tool registered under ``name`` with ``args`` as keyword arguments.

    Tool problems are returned, not raised: an unknown name or an exception
    during execution produces an error string as the observation.
    """
    if name in self._tools:
        try:
            return await self._tools[name].execute(**args)
        except Exception as exc:
            logger.error("Tool %s failed: %s", name, exc)
            return f"Tool error ({name}): {exc}"

    available = list(self._tools.keys())
    return f"Error: tool '{name}' not available. Available tools: {available}"
587
+
588
+ # ── Helpers ───────────────────────────────────────────────────────────────
589
+
590
def _error_result(self, reason: str, steps: int) -> dict:
    """Build the result dict for a failed run: empty answer, zero confidence, ``reason`` as error."""
    failure: dict = {
        "agent_id": self.config.agent_id,
        "answer": "",
        "confidence": 0.0,
    }
    failure.update(steps=steps, error=reason, metadata={})
    return failure
599
+
600
async def _gate_tool(
    self,
    run_id: str,
    step: int,
    tool_name: str,
    tool_args: dict,
    llm_response: dict,
):
    """
    Ask a human to approve one tool call, if that tool is gated.

    A tool is gated only when an approval store is configured and the tool
    is listed in ``config.hitl_tools``; otherwise None is returned at once.
    For gated tools a checkpoint describing the pending call is written to
    the store first, and the result of ``request_approval`` is returned.
    """
    if not self._approval_store or tool_name not in self.config.hitl_tools:
        return None

    from harness.hitl import ApprovalRequest, request_approval

    approval_id = str(uuid.uuid4())

    # Persist enough state to resume this exact step before waiting for the human.
    pending_call = {
        "approval_id": approval_id,
        "tool": tool_name,
        "args": tool_args,
        "step": step,
        "llm_response": llm_response,
    }
    checkpoint = {
        "run_id": run_id,
        "agent_id": self.config.agent_id,
        "task": self._task,
        "step": step,
        "memory": self._working_memory.to_dict(),
        "pending": pending_call,
    }
    await self._approval_store.write_checkpoint(run_id, checkpoint)

    request = ApprovalRequest(
        approval_id=approval_id,
        run_id=run_id,
        agent_id=self.config.agent_id,
        tool=tool_name,
        args=tool_args,
        step=step,
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
    return await request_approval(request, self._approval_store, self._guard)
650
+
651
async def _run_tool_gated(
    self,
    run_id: str,
    step: int,
    tool_name: str,
    tool_args: dict,
    response: dict,
) -> Any:
    """
    Pass one tool call through the approval gate, then execute it.

    Outcomes:
      - human typed a correction: guidance is appended to WorkingMemory and
        the _HITL_CORRECTION sentinel is returned (caller must `continue`);
      - human rejected: the checkpoint is cleared and a rejection message is
        returned as the observation;
      - approved or not gated: the tool runs, the checkpoint is cleared and
        the tool's observation is returned.
    """
    decision = await self._gate_tool(run_id, step, tool_name, tool_args, response)
    gated = decision is not None

    if gated and decision.correction:
        await self._inject_human_guidance(response, decision.correction, run_id)
        return _HITL_CORRECTION

    if gated and not decision.approved:
        await self._clear_checkpoint(run_id)
        return f"Tool rejected by human: {decision.correction or 'no reason given'}"

    observation = await self._execute_tool(tool_name, tool_args)
    await self._clear_checkpoint(run_id)
    return observation
677
+
678
async def _inject_human_guidance(self, response: dict, correction: str, run_id: str) -> None:
    """Record the LLM's proposal and the human's correction in WorkingMemory, then clear the checkpoint."""
    turns = (
        ("assistant", json.dumps(response)),
        ("user", f"Human guidance: {correction}"),
    )
    for role, content in turns:
        await self._working_memory.append(role, content)
    await self._clear_checkpoint(run_id)
683
+
684
async def _clear_checkpoint(self, run_id: str) -> None:
    """Delete the stored checkpoint for ``run_id``; does nothing without an approval store."""
    store = self._approval_store
    if not store:
        return
    await store.clear_checkpoint(run_id)
687
+
688
async def _replay_pending_step(
    self,
    run_id: str,
    pending: dict,
) -> AsyncGenerator[BusEvent, None]:
    """
    Finish a step that was interrupted while waiting for human approval.

    The approval request is issued again with the original approval id.
    A correction is injected as guidance and nothing is yielded; otherwise
    the tool is executed (or a rejection message substituted), one
    OBSERVATION event is yielded, WorkingMemory is updated and the
    checkpoint is cleared.
    """
    from harness.hitl import ApprovalRequest, request_approval

    tool_name, tool_args, step, llm_response = (
        pending[key] for key in ("tool", "args", "step", "llm_response")
    )

    request = ApprovalRequest(
        approval_id=pending["approval_id"],
        run_id=run_id,
        agent_id=self.config.agent_id,
        tool=tool_name,
        args=tool_args,
        step=step,
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
    decision = await request_approval(request, self._approval_store, self._guard)

    if decision.correction:
        await self._inject_human_guidance(llm_response, decision.correction, run_id)
        return

    if decision.approved:
        observation = await self._execute_tool(tool_name, tool_args)
    else:
        observation = f"Tool rejected by human: {decision.correction or 'no reason given'}"

    # Events carry a short preview; WorkingMemory receives the full observation.
    is_image = _is_image_block(observation)
    preview = "[image]" if is_image else str(observation)[:500]
    yield BusEvent(
        type=EventType.OBSERVATION,
        agent_id=self.config.agent_id,
        payload={"step": step, "tool": tool_name, "observation": preview},
    )

    await self._working_memory.append("assistant", json.dumps(llm_response))
    if is_image:
        await self._working_memory.append(
            "user",
            [{"type": "text", "text": f"Observation ({tool_name}):"}, observation],
        )
    elif isinstance(observation, str):
        await self._working_memory.append("user", f"Observation: {observation}")
    else:
        serialized = json.dumps(observation, default=str)
        await self._working_memory.append("user", f"Observation: {serialized}")
    await self._clear_checkpoint(run_id)
744
+
745
+
746
+ # ── Response normalization (module-level for testability) ────────────────────
747
+
748
+
749
def _normalize_response(response: Any) -> dict | None:
    """
    Coerce whatever the LLM client returned into an action dict.

    A dict that already has an ``action`` or ``actions`` key is returned
    unchanged. Otherwise the text to parse is taken from the dict's ``text``
    entry, from the string itself, or from ``str(response)``, stripped, and
    handed to ``_parse_action_json``.
    """
    if isinstance(response, dict):
        if "action" in response or "actions" in response:
            return response
        raw_text = response["text"] if "text" in response else str(response)
    elif isinstance(response, str):
        raw_text = response
    else:
        raw_text = str(response)
    return _parse_action_json(raw_text.strip())
759
+
760
+
761
def _is_image_block(obs: Any) -> bool:
    """Return True when ``obs`` is a dict whose ``type`` is ``"image_url"`` or ``"image"``."""
    if not isinstance(obs, dict):
        return False
    return obs.get("type") in ("image_url", "image")
764
+
765
+
766
def _parse_action_json(text: str) -> dict | None:
    """Extract the agent's action object from raw LLM text.

    Scans forward through every '{' so that a malformed preamble doesn't
    block the valid action object that follows it.

    Selection rules:
      - The first object that has an ``action`` or ``actions`` key wins.
      - An object that parses but has neither key is remembered as a
        fallback and skipped in full, so its nested values are never
        mistaken for a response.
      - If no object carries an action key, the first object that parsed is
        returned, as before.

    Raw control characters inside strings (e.g. a thought containing an
    unescaped newline) are accepted, so such a response is parsed as a whole
    instead of being rejected or reduced to one of its nested objects.

    Returns None when ``text`` is blank or contains no parseable object.
    """
    text = text.strip()
    if not text:
        return None

    # strict=False allows literal control characters inside JSON strings.
    decoder = json.JSONDecoder(strict=False)
    fallback: dict | None = None
    pos = 0
    while (start := text.find("{", pos)) >= 0:
        try:
            obj, end = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            # Not an object start we can use; try the next '{'.
            pos = start + 1
            continue

        if not isinstance(obj, dict):
            pos = start + 1
            continue
        if "action" in obj or "actions" in obj:
            return obj
        if fallback is None:
            fallback = obj
        # Jump past the whole object rather than descending into it.
        pos = end

    return fallback