execution-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. execution_agent/__init__.py +8 -0
  2. execution_agent/__main__.py +5 -0
  3. execution_agent/agent.py +955 -0
  4. execution_agent/commands_interface.json +7 -0
  5. execution_agent/config.py +21 -0
  6. execution_agent/context.py +1565 -0
  7. execution_agent/docker_helpers_static.py +593 -0
  8. execution_agent/env.py +61 -0
  9. execution_agent/exceptions.py +17 -0
  10. execution_agent/exit_artifacts.py +350 -0
  11. execution_agent/main.py +1234 -0
  12. execution_agent/prompt_files/c_guidelines +481 -0
  13. execution_agent/prompt_files/command_stuck +7 -0
  14. execution_agent/prompt_files/cpp_guidelines +481 -0
  15. execution_agent/prompt_files/cycle_instruction +51 -0
  16. execution_agent/prompt_files/java_guidelines +37 -0
  17. execution_agent/prompt_files/javascript_guidelines +69 -0
  18. execution_agent/prompt_files/latest_containter_technology +7 -0
  19. execution_agent/prompt_files/python_guidelines +48 -0
  20. execution_agent/prompt_files/remove_progress_bars +1 -0
  21. execution_agent/prompt_files/rust_guidelines +53 -0
  22. execution_agent/prompt_files/search_workflows_summary +121 -0
  23. execution_agent/prompt_files/steps_list.json +32 -0
  24. execution_agent/prompt_files/summarize_cycle +13 -0
  25. execution_agent/prompt_files/tools_list +99 -0
  26. execution_agent/prompt_logging.py +311 -0
  27. execution_agent/repetition.py +39 -0
  28. execution_agent/shared_utils.py +507 -0
  29. execution_agent/state_persistence.py +286 -0
  30. execution_agent/tools.py +1611 -0
  31. execution_agent/trace_to_bash.py +281 -0
  32. execution_agent-0.1.0.dist-info/METADATA +231 -0
  33. execution_agent-0.1.0.dist-info/RECORD +37 -0
  34. execution_agent-0.1.0.dist-info/WHEEL +5 -0
  35. execution_agent-0.1.0.dist-info/entry_points.txt +2 -0
  36. execution_agent-0.1.0.dist-info/licenses/LICENSE.md +46 -0
  37. execution_agent-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1234 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import atexit
6
+ import json
7
+ import logging
8
+ import os
9
+ import signal
10
+ import sys
11
+ from dataclasses import dataclass
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ import secrets
15
+ from typing import Any, Optional, Callable, Tuple
16
+
17
+
18
+ # Global reference for cleanup on shutdown
19
+ _active_agent: Optional[Any] = None
20
+ _shutdown_in_progress = False
21
+
22
+
23
+ def _cleanup_on_shutdown(signum=None, frame=None):
24
+ """Clean up Docker containers and resources on shutdown."""
25
+ global _shutdown_in_progress, _active_agent
26
+
27
+ if _shutdown_in_progress:
28
+ return
29
+ _shutdown_in_progress = True
30
+
31
+ if signum:
32
+ sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else str(signum)
33
+ print(f"\nReceived {sig_name}, cleaning up...", file=sys.stderr)
34
+
35
+ if _active_agent is not None:
36
+ try:
37
+ from execution_agent.docker_helpers_static import cleanup_container
38
+ container = getattr(_active_agent, 'container', None)
39
+ docker_tag = getattr(_active_agent, 'docker_tag', None)
40
+ if container or docker_tag:
41
+ print("Cleaning up Docker container...", file=sys.stderr)
42
+ cleanup_container(container, docker_tag)
43
+ print("Cleanup complete.", file=sys.stderr)
44
+ except Exception as e:
45
+ print(f"Warning: Cleanup failed: {e}", file=sys.stderr)
46
+
47
+ if signum:
48
+ sys.exit(128 + signum)
49
+
50
+
51
# Register signal handlers for graceful shutdown
signal.signal(signal.SIGINT, _cleanup_on_shutdown)
signal.signal(signal.SIGTERM, _cleanup_on_shutdown)
# Also run the cleanup hook on normal interpreter exit; safe because the
# handler is idempotent (guarded by _shutdown_in_progress).
atexit.register(_cleanup_on_shutdown)

# Silence mini-swe-agent's startup banner before it is imported below.
# NOTE(review): os is already imported at the top of the file; the _os alias
# looks redundant but is kept as-is.
import os as _os
_os.environ.setdefault("MSWEA_SILENT_STARTUP", "1")
58
+
59
+ from execution_agent.prompt_logging import install_cycle_prompt_logging
60
+ from minisweagent.models.litellm_model import LitellmModel
61
+
62
+ from execution_agent.agent import ExecutionAgent
63
+ from execution_agent.env import ExecutionEnvironment
64
+ from execution_agent.tools import (
65
+ ToolRegistry,
66
+ linux_terminal,
67
+ read_file,
68
+ write_to_file,
69
+ search_docker_image,
70
+ goals_accomplished,
71
+ )
72
+ from execution_agent.context import ContextBuilder
73
+
74
+
75
+ # -------------------------
76
+ # Run logging (text + jsonl + transcript)
77
+ # -------------------------
78
+
79
+ def _now_iso() -> str:
80
+ return datetime.now().isoformat(timespec="seconds")
81
+
82
+
83
+ def _safe_json(obj: Any) -> str:
84
+ try:
85
+ return json.dumps(obj, ensure_ascii=False, sort_keys=True)
86
+ except Exception:
87
+ return str(obj)
88
+
89
+
90
class JsonlLogHandler(logging.Handler):
    """
    Structured JSONL log handler for all log records.

    Appends one JSON object per record to *path*. Logging failures are
    swallowed so they can never abort the run.
    """

    def __init__(self, path: Path, level: int = logging.INFO) -> None:
        super().__init__(level=level)
        self.path = path
        self._fh = open(path, "a", encoding="utf-8")

    def emit(self, record: logging.LogRecord) -> None:
        try:
            payload = {
                "ts": _now_iso(),
                "level": record.levelname,
                "logger": record.name,
                "message": record.getMessage(),
            }
            # Capture selected "extra" fields when the caller provided them.
            for key in ("event", "tool_name", "tool_args", "image_tag"):
                try:
                    payload[key] = getattr(record, key)
                except AttributeError:
                    continue
            self._fh.write(_safe_json(payload) + "\n")
            self._fh.flush()
        except Exception:
            # Never crash the run because of logging
            pass

    def close(self) -> None:
        try:
            self._fh.close()
        except Exception:
            pass
        super().close()
124
+
125
+
126
@dataclass
class TranscriptWriters:
    """Append-only writers for the run's message transcript.

    Maintains three artifacts:
      * text_path  -- human-readable transcript
      * jsonl_path -- one JSON object per message
      * json_path  -- full message list, written once at finalization
    """

    text_path: Path
    jsonl_path: Path
    json_path: Path

    def __post_init__(self) -> None:
        # Append mode so a resumed run extends any prior transcript.
        self._text_fh = open(self.text_path, "a", encoding="utf-8")
        self._jsonl_fh = open(self.jsonl_path, "a", encoding="utf-8")

    def write_message(self, index: int, msg: dict[str, Any]) -> None:
        """Append one chat message to both the text and JSONL transcripts."""
        ts = _now_iso()
        role = str(msg.get("role") or "").upper()
        tag = str(msg.get("tag") or "").strip()
        content = str(msg.get("content") or "")

        # Text transcript: header line, then the message body.
        header = f"[{ts}] #{index:04d} {role}"
        if tag:
            header += f" (tag={tag})"
        self._text_fh.write(header + "\n")
        self._text_fh.write(content.rstrip("\n") + "\n\n")
        self._text_fh.flush()

        # JSONL transcript: one record per line.
        record = {
            "ts": ts,
            "index": index,
            "role": msg.get("role"),
            "tag": msg.get("tag"),
            "content": msg.get("content"),
        }
        self._jsonl_fh.write(json.dumps(record, ensure_ascii=False) + "\n")
        self._jsonl_fh.flush()

    def finalize_full_json(self, messages: list[dict[str, Any]]) -> None:
        """Write the complete message list as a pretty-printed JSON file."""
        try:
            self.json_path.write_text(json.dumps(messages, ensure_ascii=False, indent=2), encoding="utf-8")
        except Exception:
            pass

    def close(self) -> None:
        """Close both file handles, ignoring any errors."""
        for fh in (self._text_fh, self._jsonl_fh):
            try:
                fh.close()
            except Exception:
                pass
175
+
176
+
177
+ # -------------------------
178
+ # NEW: per-cycle LLM prompt logging (exact chat prompt payload)
179
+ # -------------------------
180
+
181
+ def _format_messages_as_text(messages: Any) -> str:
182
+ """
183
+ Human-readable, but content-preserving: shows each message role/tag and the exact content.
184
+ """
185
+ if not isinstance(messages, list):
186
+ return str(messages)
187
+
188
+ chunks: list[str] = []
189
+ for i, m in enumerate(messages):
190
+ if isinstance(m, dict):
191
+ role = str(m.get("role") or "").strip()
192
+ tag = str(m.get("tag") or "").strip()
193
+ header = f"----- message[{i}] role={role or 'UNKNOWN'}"
194
+ if tag:
195
+ header += f" tag={tag}"
196
+ header += " -----"
197
+ chunks.append(header)
198
+ # Preserve exact content
199
+ content = m.get("content")
200
+ if content is None:
201
+ chunks.append("")
202
+ else:
203
+ chunks.append(str(content))
204
+ else:
205
+ chunks.append(f"----- message[{i}] (non-dict) -----")
206
+ chunks.append(str(m))
207
+ chunks.append("") # trailing newline
208
+ return "\n".join(chunks)
209
+
210
+
211
class CyclePromptLoggerModelProxy:
    """
    Wraps the model and writes the *exact* chat prompt sent to the LLM per cycle.

    Output:
        <run_dir>/cycles_chats/cycle_<N>/prompt_<K>.json
        <run_dir>/cycles_chats/cycle_<N>/prompt_<K>.txt

    Notes:
      - Uses agent._current_cycle_idx set by the run_one_cycle wrapper.
      - Detects "messages" in kwargs or first positional arg if it looks like a chat messages list.
      - Non-invasive: if anything goes wrong, it silently falls back to the underlying model call.
    """

    def __init__(self, base_model: Any, agent: Any, run_dir: Path, log: logging.Logger) -> None:
        # base_model: the wrapped LLM client; all attribute access is delegated to it.
        self._base_model = base_model
        self._agent = agent
        self._run_dir = run_dir
        self._log = log

    def __getattr__(self, name: str) -> Any:
        # Delegate attribute lookup to the wrapped model. Callable attributes
        # are wrapped so the prompt can be captured before the real call runs.
        attr = getattr(self._base_model, name)

        if not callable(attr):
            return attr

        def _wrapped(*args: Any, **kwargs: Any):
            # Best-effort logging first, then the genuine model call.
            self._maybe_log_prompt(args, kwargs)
            return attr(*args, **kwargs)

        return _wrapped

    def _maybe_log_prompt(self, args: tuple[Any, ...], kwargs: dict[str, Any]) -> None:
        """Write the chat messages (if found in the call) to per-cycle files.

        Never raises: any failure is swallowed so logging cannot break the run.
        """
        try:
            messages = None

            # Common convention: messages passed as kwarg
            if "messages" in kwargs:
                messages = kwargs.get("messages")

            # Otherwise, if first arg looks like chat messages
            if messages is None and len(args) >= 1 and isinstance(args[0], list):
                if len(args[0]) == 0 or isinstance(args[0][0], dict):
                    messages = args[0]

            if messages is None:
                return

            cycle_idx = getattr(self._agent, "_current_cycle_idx", None)
            if not isinstance(cycle_idx, int) or cycle_idx <= 0:
                # If no cycle context, still log under cycle_0
                cycle_idx = 0

            cycles_root = self._run_dir / "cycles_chats" / f"cycle_{cycle_idx}"
            cycles_root.mkdir(parents=True, exist_ok=True)

            # Sequence number for multiple LLM calls within the same cycle,
            # tracked as a dict {cycle_idx: count} stashed on the agent.
            seq_key = "_cycle_prompt_seq"
            seq_map = getattr(self._agent, seq_key, None)
            if not isinstance(seq_map, dict):
                seq_map = {}
                setattr(self._agent, seq_key, seq_map)

            k = int(seq_map.get(cycle_idx, 0)) + 1
            seq_map[cycle_idx] = k

            json_path = cycles_root / f"prompt_{k}.json"
            txt_path = cycles_root / f"prompt_{k}.txt"

            # Write exact JSON payload (as provided to the model)
            try:
                json_path.write_text(json.dumps(messages, ensure_ascii=False, indent=2), encoding="utf-8")
            except Exception:
                # Fall back to safe string
                json_path.write_text(_safe_json(messages), encoding="utf-8")

            # Write text view (content-preserving)
            try:
                txt_path.write_text(_format_messages_as_text(messages), encoding="utf-8")
            except Exception:
                pass

            # Also reflect in the run log (path only; payload is in files)
            try:
                self._log.info(
                    "LLM prompt logged: %s",
                    str(txt_path),
                    extra={"event": "llm_prompt_logged"},
                )
            except Exception:
                pass

        except Exception:
            # Never break the run due to logging
            return
305
+
306
+
307
def _attach_cycle_and_transcript_logging(agent: ExecutionAgent, run_dir: Path, log: logging.Logger) -> None:
    """
    Non-invasive:
      - Wrap run_one_cycle() to log cycle boundaries and flush new agent.messages to transcript files.
      - Works even if agent does not expose add_message().
    """
    transcript = TranscriptWriters(
        text_path=run_dir / "messages_transcript.txt",
        jsonl_path=run_dir / "messages_transcript.jsonl",
        json_path=run_dir / "messages.json",
    )

    # Stash the writers and a high-water mark on the agent itself so later
    # flushes can find them without extra plumbing.
    setattr(agent, "_transcript_writers", transcript)
    setattr(agent, "_messages_logged_upto", 0)

    def _flush_new_messages() -> None:
        # Persist any agent.messages entries not yet written; best-effort only.
        try:
            msgs = list(getattr(agent, "messages", []) or [])
            upto = int(getattr(agent, "_messages_logged_upto", 0) or 0)
            if upto < 0:
                upto = 0
            for i in range(upto, len(msgs)):
                m = msgs[i]
                if isinstance(m, dict):
                    transcript.write_message(i, m)
            setattr(agent, "_messages_logged_upto", len(msgs))
        except Exception:
            pass

    # Initial flush (system+initial context if any already present)
    _flush_new_messages()

    if not hasattr(agent, "run_one_cycle"):
        log.warning("ExecutionAgent has no run_one_cycle(); cycle-level logging will rely on tool logs only.")
        return

    orig = agent.run_one_cycle

    def _flush_log_handlers():
        """Flush all handlers to ensure logs are written immediately."""
        for handler in log.handlers:
            try:
                handler.flush()
            except Exception:
                pass

    def wrapped_run_one_cycle(*args: Any, **kwargs: Any):
        # 1-based index of the cycle about to run.
        cycle_idx = getattr(agent, "cycle_count", 0) + 1

        # Publish current cycle index for the model proxy logger.
        try:
            setattr(agent, "_current_cycle_idx", int(cycle_idx))
        except Exception:
            pass

        log.info("──────────────────────────────────────────────────────────────────────────")
        log.info("CYCLE %02d START", cycle_idx, extra={"event": "cycle_start"})
        _flush_log_handlers()

        # Flush any messages queued before the cycle starts
        _flush_new_messages()

        res = orig(*args, **kwargs)

        # Flush messages produced by the cycle
        _flush_new_messages()

        # After-cycle summary if the agent returns a dict-ish cycle record
        try:
            if isinstance(res, dict):
                tool_call = res.get("tool_call") or {}
                cmd = tool_call.get("command") if isinstance(tool_call, dict) else None
                if isinstance(cmd, dict) and cmd.get("name"):
                    log.info(
                        "CYCLE %02d TOOL: %s args=%s",
                        cycle_idx,
                        cmd.get("name"),
                        _safe_json(cmd.get("args")),
                        extra={"event": "cycle_tool", "tool_name": cmd.get("name"), "tool_args": cmd.get("args")},
                    )
                if "result" in res:
                    # Truncated preview so huge tool outputs don't flood the log.
                    preview = str(res.get("result"))
                    if len(preview) > 1200:
                        preview = preview[:1199].rstrip() + "…"
                    log.info("CYCLE %02d RESULT (preview): %s", cycle_idx, preview, extra={"event": "cycle_result"})
        except Exception:
            pass

        log.info("CYCLE %02d END", cycle_idx, extra={"event": "cycle_end"})
        _flush_log_handlers()

        # Clear current cycle marker (best-effort)
        try:
            setattr(agent, "_current_cycle_idx", None)
        except Exception:
            pass

        return res

    def wrapped_run_one_cycle_with_error_handling(*args: Any, **kwargs: Any):
        """Wrapper that ensures exceptions are logged before propagating."""
        try:
            return wrapped_run_one_cycle(*args, **kwargs)
        except Exception as e:
            log.error(f"Exception in cycle execution: {type(e).__name__}: {e}", exc_info=True)
            _flush_log_handlers()
            raise

    agent.run_one_cycle = wrapped_run_one_cycle_with_error_handling  # type: ignore[attr-defined]
416
+
417
+
418
def _configure_logging(run_dir: Path, level: int = logging.INFO) -> logging.Logger:
    """Set up the "execution_agent" logger with console, text-file and JSONL handlers.

    Creates *run_dir* if needed and writes ``run.log`` (text) and ``run.jsonl``
    (structured) inside it. Safe to call more than once: existing handlers are
    removed before new ones are attached, so repeated configuration does not
    duplicate output.

    Args:
        run_dir: Directory that receives the log files.
        level: Minimum level for the logger and every handler.

    Returns:
        The configured "execution_agent" logger.
    """
    logger = logging.getLogger("execution_agent")
    logger.setLevel(level)
    logger.propagate = False

    # Drop handlers left over from any previous configuration.
    for h in list(logger.handlers):
        logger.removeHandler(h)

    run_dir.mkdir(parents=True, exist_ok=True)

    class _Formatter(logging.Formatter):
        # Compact "HH:MM:SS | LEVEL | logger | message" line format.
        def format(self, record: logging.LogRecord) -> str:
            ts = self.formatTime(record, datefmt="%H:%M:%S")
            lvl = record.levelname.ljust(5)
            return f"{ts} | {lvl} | {record.name} | {record.getMessage()}"

    # Console handler
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(_Formatter())
    logger.addHandler(console)

    # Text file handler
    file_handler = logging.FileHandler(run_dir / "run.log", encoding="utf-8")
    file_handler.setLevel(level)
    file_handler.setFormatter(_Formatter())
    logger.addHandler(file_handler)

    # JSONL handler (structured)
    jsonl_handler = JsonlLogHandler(run_dir / "run.jsonl", level=level)
    logger.addHandler(jsonl_handler)

    # Ensure sub-loggers emit at the requested level.
    for sub in ("execution_agent.tools", "execution_agent.docker", "litellm_model"):
        sub_logger = logging.getLogger(sub)
        sub_logger.setLevel(level)
        sub_logger.propagate = True

    # Route "litellm_model" records through the same handlers.
    # BUGFIX: the original re-added the handlers on every call without clearing
    # the ones from previous calls, so each reconfiguration duplicated every
    # litellm log line; it also left propagation on, which could emit records a
    # second time through the root logger. Clear stale handlers and disable
    # propagation before attaching.
    litellm_logger = logging.getLogger("litellm_model")
    litellm_logger.setLevel(level)
    for h in list(litellm_logger.handlers):
        litellm_logger.removeHandler(h)
    litellm_logger.propagate = False
    for h in logger.handlers:
        litellm_logger.addHandler(h)

    return logger
464
+
465
+
466
+ # -------------------------
467
+ # CLI
468
+ # -------------------------
469
+
470
def load_text(path: str) -> str:
    """Read a UTF-8 text file, silently ignoring undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
473
+
474
+
475
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for the runner."""
    parser = argparse.ArgumentParser(description="Run ExecutionAgent (mini-swe-agent based, Option A env).")
    parser.add_argument("--experiment-file", required=True, help="Path to project_meta_data.json")
    parser.add_argument("--task-file", default=None, help="Optional file containing the top-level task/instructions.")
    parser.add_argument("--task", default=None, help="Optional task string. Overrides --task-file if set.")
    # Model choices default from the environment so CI can override them.
    parser.add_argument("--model", default=os.getenv("OPENAI_MODEL", "gpt-5-nano"))
    parser.add_argument("--knowledge-model", default=os.getenv("KNOWLEDGE_MODEL", "gpt-5-mini"),
                        help="Model for web search analysis and unified summary (default: gpt-5-mini). "
                             "Should be an up-to-date model with good general knowledge.")
    parser.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"))
    parser.add_argument("--workspace-root", default="execution_agent_workspace")
    parser.add_argument("--prompt-files", default=None,
                        help="Directory with prompt template files (default: bundled with package)")
    parser.add_argument("--log-level", default="INFO", help="DEBUG|INFO|WARNING|ERROR")
    # Optional explicit run log dir.
    parser.add_argument("--run-log-dir", default=None, help="Optional directory to write run logs/transcripts into.")
    # Retry budget after budget exhaustion.
    parser.add_argument("--max-retries", type=int, default=2,
                        help="Maximum retries after budget exhaustion (default: 2). Total attempts = 1 + max_retries.")
    return parser.parse_args()
497
+
498
+
499
def build_default_task(meta: dict) -> str:
    """Return the default top-level task prompt used when none is supplied."""
    objective = (
        "Your objective is to set up, build, install, and run the project's test suite inside a container. "
        "You must produce a Dockerfile (written via write_to_file) that clones the repo and prepares the environment, "
        "then run installation/build/test commands via linux_terminal until tests can be executed, and write "
        "TEST_RESULTS.txt with outcomes. Only declare goals_accomplished once Dockerfile exists and results are recorded.\n\n"
    )
    success_bar = (
        "IMPORTANT: The task is considered successful if ~80% or more of the tests pass. "
        "Having a few failing tests or errors is acceptable and expected. "
        "Once you have a substantial majority of tests passing (~80%+), declare goals accomplished. "
        "Do NOT waste cycles trying to fix the last few failing tests."
    )
    return objective + success_bar
510
+
511
+
512
+ def _extract_dockerfile_and_script(llm_response: str) -> Tuple[Optional[str], Optional[str]]:
513
+ """
514
+ Extract Dockerfile and bash script from LLM response.
515
+
516
+ Expected format in response:
517
+ ```dockerfile
518
+ <dockerfile content>
519
+ ```
520
+
521
+ ```bash
522
+ <bash script content>
523
+ ```
524
+
525
+ Returns:
526
+ Tuple of (dockerfile_content, bash_script_content), either can be None if not found
527
+ """
528
+ import re
529
+
530
+ dockerfile = None
531
+ bash_script = None
532
+
533
+ # Extract Dockerfile - look for ```dockerfile or ```Dockerfile
534
+ dockerfile_match = re.search(
535
+ r'```[Dd]ockerfile\s*\n(.*?)```',
536
+ llm_response,
537
+ re.DOTALL
538
+ )
539
+ if dockerfile_match:
540
+ dockerfile = dockerfile_match.group(1).strip()
541
+
542
+ # Extract bash script - look for ```bash or ```sh
543
+ bash_match = re.search(
544
+ r'```(?:bash|sh)\s*\n(.*?)```',
545
+ llm_response,
546
+ re.DOTALL
547
+ )
548
+ if bash_match:
549
+ bash_script = bash_match.group(1).strip()
550
+
551
+ return dockerfile, bash_script
552
+
553
+
554
def _run_forced_exit_cycle(
    *,
    knowledge_model,
    agent,
    project_path: str,
    project_url: str,
    workspace_root: str,
    run_dir: Path,
    log: logging.Logger,
) -> bool:
    """
    Forced exit cycle: Ask knowledge model to produce a final Dockerfile + bash script
    based on all available context, then attempt to execute them.

    Args:
        knowledge_model: The knowledge model (e.g., gpt-5-mini) to use
        agent: The agent instance with history and context
        project_path: Name/path of the project
        project_url: Git URL of the project
        workspace_root: Root workspace directory
        run_dir: Directory for logs and output files
        log: Logger instance

    Returns:
        True if the forced exit cycle succeeded (tests ran), False otherwise
    """
    log.info("=" * 80)
    log.info("🚨 FORCED EXIT CYCLE - Attempting final solution with knowledge model")
    log.info("=" * 80)

    # Gather all context for the knowledge model
    # 1. Previous attempt lessons
    previous_lessons = []
    for i, attempt in enumerate(getattr(agent, 'previous_attempts', []), 1):
        if isinstance(attempt, dict):
            previous_lessons.append(f"Attempt {i}: {json.dumps(attempt, indent=2)}")

    # 2. Command history from current/last attempt
    command_history = []
    for cmd_summary in getattr(agent, 'commands_and_summary', [])[-50:]:  # Last 50 commands
        if isinstance(cmd_summary, dict):
            cmd = cmd_summary.get('command', '')
            result = cmd_summary.get('result', '')[:500]  # Truncate long results
            command_history.append(f"$ {cmd}\n{result}")
        elif isinstance(cmd_summary, str):
            command_history.append(cmd_summary)

    # 3. Workflow/CI hints from repo context
    workflow_hints = ""
    repo_context = getattr(agent, 'repo_context', None)
    if repo_context:
        for path, content in getattr(repo_context, 'workflow_contents', [])[:3]:
            workflow_hints += f"\n--- {path} ---\n{content[:3000]}\n"

        unified_summary = getattr(repo_context, 'unified_summary', '')
        if unified_summary:
            workflow_hints += f"\n--- Unified Summary ---\n{unified_summary[:5000]}\n"

    # Build the prompt for the knowledge model
    forced_exit_prompt = f"""You are an expert at setting up software projects for testing. An automated agent has failed multiple times to install and test the project "{project_path}" from source. You need to produce a FINAL SOLUTION.

PROJECT INFORMATION:
- Project: {project_path}
- Repository URL: {project_url}
- Language: {getattr(repo_context, 'language', 'unknown') if repo_context else 'unknown'}

PREVIOUS ATTEMPT LESSONS:
{chr(10).join(previous_lessons) if previous_lessons else "No previous attempt summaries available."}

RECENT COMMAND HISTORY (what was tried):
{chr(10).join(command_history[-30:]) if command_history else "No command history available."}

CI/CD AND BUILD HINTS FROM REPOSITORY:
{workflow_hints if workflow_hints else "No CI/CD hints available."}

YOUR TASK:
Based on ALL the information above, produce:
1. A complete Dockerfile that will set up the environment for building and testing this project
2. A bash script that will run inside the container to install dependencies, build the project, and run tests

IMPORTANT REQUIREMENTS:
- The Dockerfile should be based on Ubuntu (ubuntu:22.04 or ubuntu:24.04)
- The Dockerfile must install git and clone the repository
- The bash script should be executable inside the container
- Include all necessary system dependencies
- Handle common issues that were encountered in previous attempts
- The bash script should end with running the test suite
- If tests fail, that's acceptable - we just need to run them

OUTPUT FORMAT:
You MUST provide your response in EXACTLY this format:

```dockerfile
<your complete Dockerfile here>
```

```bash
<your complete bash script here>
```

Provide ONLY these two code blocks. The Dockerfile and bash script must be complete and ready to use."""

    # Try up to 3 times to get a valid response
    max_llm_retries = 3
    dockerfile_content = None
    bash_script_content = None

    for llm_attempt in range(1, max_llm_retries + 1):
        log.info(f"🤖 Querying knowledge model (attempt {llm_attempt}/{max_llm_retries})...")
        try:
            response = knowledge_model.query([{"role": "user", "content": forced_exit_prompt}])
            llm_response = response.get("content", "")

            # Extract Dockerfile and bash script
            dockerfile_content, bash_script_content = _extract_dockerfile_and_script(llm_response)

            if dockerfile_content and bash_script_content:
                log.info("✅ Successfully extracted Dockerfile and bash script from LLM response")
                break
            else:
                missing = []
                if not dockerfile_content:
                    missing.append("Dockerfile")
                if not bash_script_content:
                    missing.append("bash script")
                log.warning(f"⚠️ LLM response missing: {', '.join(missing)}")
                if llm_attempt < max_llm_retries:
                    log.info("Retrying with clarification...")
                    # Append the correction so the next query carries the feedback.
                    forced_exit_prompt += "\n\nIMPORTANT: Your previous response was missing required components. Please provide BOTH a ```dockerfile block AND a ```bash block."

        except Exception as e:
            log.error(f"❌ LLM query failed: {e}")
            if llm_attempt >= max_llm_retries:
                log.error("All LLM query attempts failed")
                return False

    if not dockerfile_content or not bash_script_content:
        log.error("❌ Failed to extract valid Dockerfile and bash script after all retries")
        return False

    # Save the extracted files
    forced_exit_dir = run_dir / "forced_exit_cycle"
    forced_exit_dir.mkdir(parents=True, exist_ok=True)

    dockerfile_path = forced_exit_dir / "Dockerfile"
    script_path = forced_exit_dir / "run_tests.sh"

    dockerfile_path.write_text(dockerfile_content, encoding="utf-8")
    script_path.write_text(bash_script_content, encoding="utf-8")
    log.info(f"📄 Saved Dockerfile to: {dockerfile_path}")
    log.info(f"📄 Saved bash script to: {script_path}")

    # Log the contents for debugging
    log.info("--- Dockerfile content ---")
    for line in dockerfile_content.split('\n')[:30]:
        log.info(f"  {line}")
    if dockerfile_content.count('\n') > 30:
        log.info(f"  ... ({dockerfile_content.count(chr(10)) - 30} more lines)")

    log.info("--- Bash script content ---")
    for line in bash_script_content.split('\n')[:30]:
        log.info(f"  {line}")
    if bash_script_content.count('\n') > 30:
        log.info(f"  ... ({bash_script_content.count(chr(10)) - 30} more lines)")

    # Now attempt to build and run
    log.info("🔨 Building Docker image from forced exit Dockerfile...")

    try:
        # NOTE(review): cleanup_container is imported here but never called in
        # this function (only "docker rmi" below) — confirm whether it is needed.
        from execution_agent.docker_helpers_static import cleanup_container
        import subprocess
        import secrets

        # Generate a unique tag for this forced exit attempt
        docker_tag = f"forced_exit_{project_path}_{secrets.token_hex(4)}"

        # Build the Docker image
        build_result = subprocess.run(
            ["docker", "build", "-t", docker_tag, "-f", str(dockerfile_path), str(forced_exit_dir)],
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout for build
            cwd=str(forced_exit_dir),
        )

        if build_result.returncode != 0:
            log.error(f"❌ Docker build failed:\n{build_result.stderr}")
            # Save build output for debugging
            (forced_exit_dir / "docker_build.log").write_text(
                f"STDOUT:\n{build_result.stdout}\n\nSTDERR:\n{build_result.stderr}",
                encoding="utf-8"
            )
            return False

        log.info("✅ Docker image built successfully")

        # Run the container with the bash script
        log.info("🚀 Running test script inside container...")

        # Copy the script into the container and execute it
        run_result = subprocess.run(
            [
                "docker", "run", "--rm",
                "-v", f"{script_path}:/run_tests.sh:ro",
                docker_tag,
                "bash", "/run_tests.sh"
            ],
            capture_output=True,
            text=True,
            timeout=1800,  # 30 minute timeout for tests
        )

        # Save the output
        test_output = f"STDOUT:\n{run_result.stdout}\n\nSTDERR:\n{run_result.stderr}\n\nReturn code: {run_result.returncode}"
        (forced_exit_dir / "test_output.log").write_text(test_output, encoding="utf-8")

        log.info(f"📋 Test execution completed with return code: {run_result.returncode}")
        log.info("--- Test output (last 50 lines) ---")
        output_lines = (run_result.stdout + run_result.stderr).split('\n')
        for line in output_lines[-50:]:
            log.info(f"  {line}")

        # Cleanup the Docker image
        try:
            subprocess.run(["docker", "rmi", docker_tag], capture_output=True, timeout=60)
        except Exception:
            pass

        # Consider it a success if the script ran (even if tests failed)
        # The goal is to at least execute the test suite
        if run_result.returncode == 0:
            log.info("✅ Forced exit cycle: Tests completed successfully!")
            return True
        else:
            log.warning(f"⚠️ Forced exit cycle: Tests ran but exited with code {run_result.returncode}")
            # Still consider it partially successful - tests ran
            return True

    except subprocess.TimeoutExpired as e:
        log.error(f"❌ Timeout during forced exit cycle: {e}")
        return False
    except Exception as e:
        log.error(f"❌ Error during forced exit cycle: {e}", exc_info=True)
        return False
798
+
799
+
800
def main() -> int:
    """CLI entry point for the execution agent.

    Orchestrates the full run lifecycle:
      1. Parse args and resolve the prompt-files directory.
      2. Load experiment metadata and set up the run directory + logging.
      3. Build repository context and register the agent's tools.
      4. Run the agent with budget-based retries; on final failure, attempt
         a "forced exit cycle" driven by the knowledge model.
      5. Persist transcripts, a replay bash script, and tool metrics.

    Returns:
        0 if the agent (or the forced exit cycle) reported success, 1 otherwise.

    Raises:
        SystemExit: if no API key was provided via args/environment.
    """
    args = parse_args()

    # Resolve prompt-files from installed package if not explicitly given
    if args.prompt_files is None:
        from execution_agent import prompt_files_dir
        args.prompt_files = str(prompt_files_dir)

    # Load meta early so we can compute run_dir before configuring logging
    meta = json.loads(Path(args.experiment_file).read_text(encoding="utf-8"))
    project_path = meta["project_path"]
    project_url = meta["project_url"]
    language = meta.get("language", "unknown")

    # Run directory for logs/transcripts. When not given explicitly, derive a
    # filesystem-safe name from the project path plus a timestamp so repeated
    # runs of the same project do not collide.
    if args.run_log_dir:
        run_dir = Path(args.run_log_dir)
    else:
        safe_proj = str(project_path).replace(os.sep, "__").replace("/", "__")
        run_dir = Path(args.workspace_root) / "_run_logs" / safe_proj / datetime.now().strftime("%Y%m%d_%H%M%S")

    # Configure logging (console + file + jsonl). Unknown --log-level values
    # silently fall back to INFO via getattr's default.
    level = getattr(logging, str(args.log_level).upper(), logging.INFO)
    LOG = _configure_logging(run_dir, level=level)

    # Task selection: explicit --task wins, then --task-file, then a default
    # task built from the experiment metadata.
    if args.task:
        task = args.task
    elif args.task_file:
        task = Path(args.task_file).read_text(encoding="utf-8", errors="ignore")
    else:
        task = build_default_task(meta)

    # API key: required; exported so downstream litellm calls can pick it up.
    if not args.api_key:
        raise SystemExit("Missing OPENAI_API_KEY (or pass --api-key).")
    os.environ["OPENAI_API_KEY"] = args.api_key

    # =========================================================================
    # PREPARATION PHASE - Collecting context and building main prompt
    # =========================================================================
    LOG.info("=" * 80)
    LOG.info("PREPARATION PHASE - Collecting context and building main prompt")
    LOG.info("=" * 80)

    LOG.info("Project: %s", project_path)
    LOG.info("Repo: %s", project_url)
    LOG.info("Model: %s", args.model)
    LOG.info("Knowledge Model: %s", args.knowledge_model)
    LOG.info("Run dir: %s", str(run_dir))

    # Load prompt snippets
    LOG.info("Loading prompt templates...")
    pf = Path(args.prompt_files)
    cycle_instruction = load_text(str(pf / "cycle_instruction"))
    summarize_cycle = load_text(str(pf / "summarize_cycle"))
    search_workflows_summary = load_text(str(pf / "search_workflows_summary"))
    remove_progress_bars_prompt = load_text(str(pf / "remove_progress_bars"))

    LOG.info("Initializing models...")
    model = LitellmModel(model_name=args.model, model_kwargs={})

    # Create a separate knowledge model for web search analysis and unified summary
    # This model should be up-to-date and knowledgeable about current technologies
    knowledge_model = LitellmModel(model_name=args.knowledge_model, model_kwargs={})

    # Schema maps each tool name to its required argument names; the registry
    # validates calls against it.
    LOG.info("Registering tools...")
    commands_schema = {
        "linux_terminal": ["command"],
        "read_file": ["file_path"],
        "write_to_file": ["filename", "text"],
        "search_docker_image": ["search_term"],
        "goals_accomplished": ["reason"],
    }

    tool_registry = ToolRegistry(commands_schema)
    tool_registry.register("linux_terminal", linux_terminal)
    tool_registry.register("read_file", read_file)
    tool_registry.register("write_to_file", write_to_file)
    tool_registry.register("search_docker_image", search_docker_image)
    tool_registry.register("goals_accomplished", goals_accomplished)

    def local_shell_interact(cmd: str):
        """Run *cmd* in a local shell inside the project directory.

        Returns a (combined stdout+stderr, cwd) tuple. Note: shell=True with an
        agent-supplied string is intentional here (the agent's whole job is to
        run commands), but it does mean *cmd* is executed unsanitized.
        """
        import subprocess

        cwd = Path(args.workspace_root) / project_path
        p = subprocess.run(cmd, shell=True, cwd=str(cwd), capture_output=True, text=True)
        out = (p.stdout or "") + ("\n" + p.stderr if p.stderr else "")
        return out, str(cwd)

    env = ExecutionEnvironment(
        workspace_path=args.workspace_root,
        project_path=project_path,
        shell_interact_fn=local_shell_interact,
    )

    LOG.info("Building repository context (cloning repo, finding workflows, requirements, README)...")
    ctx_builder = ContextBuilder(workspace_root=args.workspace_root)
    repo_context = ctx_builder.build_repo_context(
        model=model,
        knowledge_model=knowledge_model,
        project_path=project_path,
        project_url=project_url,
        language=language,
        search_workflows_summary_prompt=search_workflows_summary,
    )
    LOG.info("Repository context built successfully")

    # The tools documentation prompt is optional; fall back to empty string.
    tools_doc_path = pf / "tools_list"
    tools_doc_string = load_text(str(tools_doc_path)) if tools_doc_path.exists() else ""

    # Load language-specific guidelines if available
    language_guidelines = ""
    if language:
        lang_lower = language.lower().strip()
        # Map common language names to guideline file names
        lang_map = {
            "python": "python_guidelines",
            "py": "python_guidelines",
            "java": "java_guidelines",
            "javascript": "javascript_guidelines",
            "js": "javascript_guidelines",
            "typescript": "javascript_guidelines",
            "ts": "javascript_guidelines",
            "c": "c_guidelines",
            "c++": "cpp_guidelines",
            "cpp": "cpp_guidelines",
            "rust": "rust_guidelines",
            "rs": "rust_guidelines",
        }
        # Unknown languages fall back to "<lang>_guidelines"; the existence
        # check below makes that a harmless no-op when the file is missing.
        guideline_name = lang_map.get(lang_lower, f"{lang_lower}_guidelines")
        guideline_path = pf / guideline_name
        if guideline_path.exists():
            language_guidelines = load_text(str(guideline_path))
            LOG.info(f"Loaded language guidelines for: {language} from {guideline_name}")
        else:
            LOG.info(f"No language guidelines found for: {language} (tried {guideline_name})")

    agent = ExecutionAgent(
        model=model,
        env=env,
        tool_registry=tool_registry,
        cycle_instruction=cycle_instruction,
        summarize_cycle=summarize_cycle,
        remove_progress_bars_prompt=remove_progress_bars_prompt,
        search_workflows_summary_prompt=search_workflows_summary,
        # meta is always a dict here (json.loads above), so the isinstance
        # guard is belt-and-braces; budget defaults to 40 cycles.
        step_limit=int(meta.get("budget", 40)) if isinstance(meta, dict) else 40,
    )

    # Attach runtime metadata/state
    agent.workspace_path = args.workspace_root
    agent.project_path = project_path
    agent.project_url = project_url
    agent.hyperparams = meta
    agent.repo_context = repo_context
    agent.tools_doc_string = tools_doc_string
    agent.language_guidelines = language_guidelines
    agent.written_files = []
    agent.commands_and_summary = []

    # State used by the upgraded tools
    agent.command_stuck = False
    agent.current_logfile = None
    agent.stuck_commands = []
    agent.docker_tag = ""

    # Register agent for graceful shutdown cleanup
    # (presumably consumed by a module-level signal/atexit handler — the
    # handler itself is not visible in this section)
    global _active_agent
    _active_agent = agent

    # Set up state persistence for recovery
    from execution_agent.state_persistence import create_state_persistence
    state_persistence = create_state_persistence(run_dir)
    agent._state_persistence = state_persistence

    # Check for existing state to resume from
    if state_persistence.has_saved_state():
        saved_state = state_persistence.load_state()
        if saved_state and saved_state.cycle_count > 0:
            LOG.info(f"Found saved state from cycle {saved_state.cycle_count}")
            # For now, just log - actual restoration would need more work
            # to handle container reconnection etc.
            LOG.info("To implement: automatic state restoration")

    # NEW: cycle + message transcript logging
    _attach_cycle_and_transcript_logging(agent, run_dir, LOG)

    # NEW: wrap the model so we persist the exact chat prompt sent to the LLM per cycle
    try:
        agent.model = CyclePromptLoggerModelProxy(agent.model, agent, run_dir, LOG)
    except Exception:
        # Prompt logging is best-effort; the run proceeds unwrapped on failure.
        pass

    # Optional: also attempt to enable any built-in prompt logging helper (best-effort, non-fatal)
    try:
        # Some repos expose install_cycle_prompt_logging; if present and compatible, enable it as well.
        install_cycle_prompt_logging(agent=agent, run_dir=run_dir, logger=LOG)  # type: ignore[call-arg]
    except TypeError:
        # Signature mismatch: retry with the older positional form.
        try:
            install_cycle_prompt_logging(agent, run_dir)  # type: ignore[misc]
        except Exception:
            pass
    except Exception:
        pass

    LOG.info("=" * 80)
    LOG.info("PREPARATION PHASE COMPLETE")
    LOG.info("=" * 80)

    # Import BudgetExhausted exception and cleanup function
    from execution_agent.exceptions import BudgetExhausted
    from execution_agent.docker_helpers_static import cleanup_container

    # =========================================================================
    # MAIN PHASE - Agent execution cycles
    # =========================================================================
    LOG.info("=" * 80)
    LOG.info("MAIN PHASE - Starting agent execution cycles")
    LOG.info("=" * 80)

    LOG.info("Starting agent run with retry support...")
    max_attempts = 1 + args.max_retries
    LOG.info(f"Configuration: max_attempts={max_attempts} (1 initial + {args.max_retries} retries)")

    final_success = False

    for attempt in range(1, max_attempts + 1):
        LOG.info("=" * 80)
        LOG.info(f"ATTEMPT {attempt} of {max_attempts}")
        LOG.info("=" * 80)

        try:
            agent.run(task=task)
            # Success!
            LOG.info(f"โœ… Goals accomplished on attempt {attempt}")
            final_success = True

            # Generate exit artifacts for successful (non-forced) exit
            try:
                from execution_agent.exit_artifacts import generate_exit_artifacts
                LOG.info("๐Ÿ“ฆ Generating exit artifacts for successful run...")
                artifacts_generated = generate_exit_artifacts(agent, run_dir, LOG)
                if artifacts_generated:
                    LOG.info("โœ… Exit artifacts generated successfully")
                else:
                    LOG.warning("โš ๏ธ Could not generate exit artifacts (no Dockerfile found)")
            except Exception as artifact_error:
                # Artifact generation failures never mask a successful run.
                LOG.warning(f"โš ๏ธ Failed to generate exit artifacts: {artifact_error}")

            break

        except BudgetExhausted as e:
            LOG.warning(f"โš ๏ธ Attempt {attempt} exhausted budget: {e}")

            # Generate attempt summary for this attempt (needed for forced exit cycle too)
            LOG.info("๐Ÿ“Š Generating summary of failed attempt...")
            # NOTE(review): agent.previous_attempts is not initialized in this
            # function — assumed to be set up by ExecutionAgent.__init__; confirm.
            try:
                summary = agent.generate_attempt_summary()
                LOG.info(f"Attempt {attempt} summary:")
                LOG.info(json.dumps(summary, indent=2))
                agent.previous_attempts.append(summary)
            except Exception as summary_error:
                # Fall back to a minimal stub summary so later attempts still
                # see that this attempt happened.
                LOG.error(f"Failed to generate attempt summary: {summary_error}")
                agent.previous_attempts.append({
                    "problems": "Summary generation failed",
                    "actions": f"Executed {len(agent.commands_and_summary)} commands",
                    "lessons": "Unable to extract detailed lessons",
                    "suggestions": "Try a different approach; review logs manually"
                })

            # If this was the last attempt, try forced exit cycle
            if attempt >= max_attempts:
                LOG.error(f"โŒ All {max_attempts} retry attempts exhausted")

                # Try forced exit cycle with knowledge model
                LOG.info("๐Ÿšจ Attempting forced exit cycle with knowledge model...")
                try:
                    forced_exit_success = _run_forced_exit_cycle(
                        knowledge_model=knowledge_model,
                        agent=agent,
                        project_path=project_path,
                        project_url=project_url,
                        workspace_root=args.workspace_root,
                        run_dir=run_dir,
                        log=LOG,
                    )
                    if forced_exit_success:
                        LOG.info("โœ… Forced exit cycle succeeded!")
                        final_success = True
                except Exception as forced_exit_error:
                    LOG.error(f"โŒ Forced exit cycle failed: {forced_exit_error}", exc_info=True)

                break

            # Cleanup Docker resources (summary was already generated above)
            LOG.info("๐Ÿงน Cleaning up Docker resources...")
            cleanup_container(agent.container, agent.docker_tag)

            # Reset agent state for next attempt
            LOG.info("๐Ÿ”„ Resetting agent state for next attempt...")

            # Save state that must be preserved.
            # The model may have been wrapped by CyclePromptLoggerModelProxy;
            # unwrap via _base_model so the next attempt re-wraps a clean model
            # instead of stacking proxies.
            saved_attempts = list(agent.previous_attempts)
            saved_model = agent.model._base_model if hasattr(agent.model, '_base_model') else agent.model
            saved_env = agent.env
            saved_tool_registry = agent.tool_registry
            saved_workspace_path = agent.workspace_path
            saved_project_path = agent.project_path
            saved_project_url = agent.project_url
            saved_hyperparams = agent.hyperparams
            saved_repo_context = agent.repo_context
            saved_tools_doc_string = agent.tools_doc_string
            saved_language_guidelines = agent.language_guidelines
            saved_cycle_instruction = agent.cycle_instruction
            saved_summarize_cycle = agent.summarize_cycle
            saved_remove_progress_bars_prompt = agent.remove_progress_bars_prompt
            saved_search_workflows_summary_prompt = agent.search_workflows_summary_prompt
            saved_step_limit = agent.step_limit

            # Reset volatile state
            agent.commands_and_summary = []
            agent.written_files = []
            agent.messages = []
            agent.cycle_count = 0
            agent.last_action = None
            agent.last_result = None
            agent.last_thoughts = None
            agent.last_format_error = None
            agent._last_failed_response = None
            agent.command_stuck = False
            agent.current_logfile = None
            agent.stuck_commands = []
            agent.container = None
            agent.docker_tag = ""

            # Restore preserved state
            agent.previous_attempts = saved_attempts
            agent.model = saved_model
            agent.env = saved_env
            agent.tool_registry = saved_tool_registry
            agent.workspace_path = saved_workspace_path
            agent.project_path = saved_project_path
            agent.project_url = saved_project_url
            agent.hyperparams = saved_hyperparams
            agent.repo_context = saved_repo_context
            agent.tools_doc_string = saved_tools_doc_string
            agent.language_guidelines = saved_language_guidelines
            agent.cycle_instruction = saved_cycle_instruction
            agent.summarize_cycle = saved_summarize_cycle
            agent.remove_progress_bars_prompt = saved_remove_progress_bars_prompt
            agent.search_workflows_summary_prompt = saved_search_workflows_summary_prompt
            agent.step_limit = saved_step_limit

            # Reset environment container reference if it exists
            if hasattr(agent.env, 'container'):
                agent.env.container = None

            # Reinitialize logging wrappers for new attempt
            LOG.info("๐Ÿ“ Reinitializing logging wrappers...")
            _attach_cycle_and_transcript_logging(agent, run_dir, LOG)
            try:
                agent.model = CyclePromptLoggerModelProxy(agent.model, agent, run_dir, LOG)
            except Exception as wrap_err:
                LOG.warning(f"Failed to reinitialize model wrapper: {wrap_err}")

            LOG.info(f"โœ“ State reset complete. Preserved {len(saved_attempts)} previous attempt summaries.")
            LOG.info(f"Starting attempt {attempt + 1}...")

        except Exception as e:
            # Any non-budget error aborts the retry loop entirely.
            LOG.error(f"โŒ Unexpected error during attempt {attempt}: {e}", exc_info=True)
            break

        finally:
            # Persist messages for this attempt (runs on success, budget
            # exhaustion, and unexpected errors alike).
            try:
                tw = getattr(agent, "_transcript_writers", None)
                if tw is not None:
                    msgs = list(getattr(agent, "messages", []) or [])
                    tw.finalize_full_json(msgs)
            except Exception:
                pass

    # Final cleanup
    try:
        tw = getattr(agent, "_transcript_writers", None)
        if tw is not None:
            tw.close()
    except Exception:
        pass

    # Generate bash script from trace
    try:
        from execution_agent.trace_to_bash import save_bash_script_from_agent

        bash_script_path = run_dir / "replay_trace.sh"
        LOG.info("Generating bash script from execution trace...")
        save_bash_script_from_agent(agent, bash_script_path)
        LOG.info(f"โœ“ Bash script saved to: {bash_script_path}")
    except Exception as e:
        LOG.warning(f"Failed to generate bash script from trace: {e}")

    # Save tool execution metrics
    try:
        from execution_agent.shared_utils import get_metrics_collector
        metrics = get_metrics_collector().get_all_metrics()
        if metrics:
            metrics_path = run_dir / "tool_metrics.json"
            with open(metrics_path, "w") as f:
                json.dump(metrics, f, indent=2)
            LOG.info(f"โœ“ Tool metrics saved to: {metrics_path}")

            # Log summary of tool metrics
            for tool_name, tool_metrics in metrics.items():
                LOG.info(
                    f" {tool_name}: {tool_metrics['total_calls']} calls, "
                    f"{tool_metrics['success_rate_percent']:.1f}% success, "
                    f"avg {tool_metrics['avg_duration_seconds']:.2f}s"
                )
    except Exception as e:
        LOG.warning(f"Failed to save tool metrics: {e}")

    if final_success:
        LOG.info("=" * 80)
        LOG.info("๐ŸŽ‰ Agent run completed successfully!")
        LOG.info("=" * 80)
        return 0
    else:
        LOG.error("=" * 80)
        LOG.error("๐Ÿ’ฅ Agent run failed to accomplish goals")
        LOG.error("=" * 80)
        return 1
1231
+
1232
+
1233
# Script entry point: propagate main()'s integer return value as the
# process exit code via SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())