amd-gaia 0.15.0__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/METADATA +222 -223
  2. amd_gaia-0.15.2.dist-info/RECORD +182 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/WHEEL +1 -1
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/entry_points.txt +1 -0
  5. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/licenses/LICENSE.md +20 -20
  6. gaia/__init__.py +29 -29
  7. gaia/agents/__init__.py +19 -19
  8. gaia/agents/base/__init__.py +9 -9
  9. gaia/agents/base/agent.py +2132 -2177
  10. gaia/agents/base/api_agent.py +119 -120
  11. gaia/agents/base/console.py +1967 -1841
  12. gaia/agents/base/errors.py +237 -237
  13. gaia/agents/base/mcp_agent.py +86 -86
  14. gaia/agents/base/tools.py +88 -83
  15. gaia/agents/blender/__init__.py +7 -0
  16. gaia/agents/blender/agent.py +553 -556
  17. gaia/agents/blender/agent_simple.py +133 -135
  18. gaia/agents/blender/app.py +211 -211
  19. gaia/agents/blender/app_simple.py +41 -41
  20. gaia/agents/blender/core/__init__.py +16 -16
  21. gaia/agents/blender/core/materials.py +506 -506
  22. gaia/agents/blender/core/objects.py +316 -316
  23. gaia/agents/blender/core/rendering.py +225 -225
  24. gaia/agents/blender/core/scene.py +220 -220
  25. gaia/agents/blender/core/view.py +146 -146
  26. gaia/agents/chat/__init__.py +9 -9
  27. gaia/agents/chat/agent.py +809 -835
  28. gaia/agents/chat/app.py +1065 -1058
  29. gaia/agents/chat/session.py +508 -508
  30. gaia/agents/chat/tools/__init__.py +15 -15
  31. gaia/agents/chat/tools/file_tools.py +96 -96
  32. gaia/agents/chat/tools/rag_tools.py +1744 -1729
  33. gaia/agents/chat/tools/shell_tools.py +437 -436
  34. gaia/agents/code/__init__.py +7 -7
  35. gaia/agents/code/agent.py +549 -549
  36. gaia/agents/code/cli.py +377 -0
  37. gaia/agents/code/models.py +135 -135
  38. gaia/agents/code/orchestration/__init__.py +24 -24
  39. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  40. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  41. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  42. gaia/agents/code/orchestration/factories/base.py +63 -63
  43. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  44. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  45. gaia/agents/code/orchestration/orchestrator.py +841 -841
  46. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  47. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  48. gaia/agents/code/orchestration/steps/base.py +188 -188
  49. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  50. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  51. gaia/agents/code/orchestration/steps/python.py +307 -307
  52. gaia/agents/code/orchestration/template_catalog.py +469 -469
  53. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  54. gaia/agents/code/orchestration/workflows/base.py +80 -80
  55. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  56. gaia/agents/code/orchestration/workflows/python.py +94 -94
  57. gaia/agents/code/prompts/__init__.py +11 -11
  58. gaia/agents/code/prompts/base_prompt.py +77 -77
  59. gaia/agents/code/prompts/code_patterns.py +2034 -2036
  60. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  61. gaia/agents/code/prompts/python_prompt.py +109 -109
  62. gaia/agents/code/schema_inference.py +365 -365
  63. gaia/agents/code/system_prompt.py +41 -41
  64. gaia/agents/code/tools/__init__.py +42 -42
  65. gaia/agents/code/tools/cli_tools.py +1138 -1138
  66. gaia/agents/code/tools/code_formatting.py +319 -319
  67. gaia/agents/code/tools/code_tools.py +769 -769
  68. gaia/agents/code/tools/error_fixing.py +1347 -1347
  69. gaia/agents/code/tools/external_tools.py +180 -180
  70. gaia/agents/code/tools/file_io.py +845 -845
  71. gaia/agents/code/tools/prisma_tools.py +190 -190
  72. gaia/agents/code/tools/project_management.py +1016 -1016
  73. gaia/agents/code/tools/testing.py +321 -321
  74. gaia/agents/code/tools/typescript_tools.py +122 -122
  75. gaia/agents/code/tools/validation_parsing.py +461 -461
  76. gaia/agents/code/tools/validation_tools.py +806 -806
  77. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  78. gaia/agents/code/validators/__init__.py +16 -16
  79. gaia/agents/code/validators/antipattern_checker.py +241 -241
  80. gaia/agents/code/validators/ast_analyzer.py +197 -197
  81. gaia/agents/code/validators/requirements_validator.py +145 -145
  82. gaia/agents/code/validators/syntax_validator.py +171 -171
  83. gaia/agents/docker/__init__.py +7 -7
  84. gaia/agents/docker/agent.py +643 -642
  85. gaia/agents/emr/__init__.py +8 -8
  86. gaia/agents/emr/agent.py +1504 -1506
  87. gaia/agents/emr/cli.py +1322 -1322
  88. gaia/agents/emr/constants.py +475 -475
  89. gaia/agents/emr/dashboard/__init__.py +4 -4
  90. gaia/agents/emr/dashboard/server.py +1972 -1974
  91. gaia/agents/jira/__init__.py +11 -11
  92. gaia/agents/jira/agent.py +894 -894
  93. gaia/agents/jira/jql_templates.py +299 -299
  94. gaia/agents/routing/__init__.py +7 -7
  95. gaia/agents/routing/agent.py +567 -570
  96. gaia/agents/routing/system_prompt.py +75 -75
  97. gaia/agents/summarize/__init__.py +11 -0
  98. gaia/agents/summarize/agent.py +885 -0
  99. gaia/agents/summarize/prompts.py +129 -0
  100. gaia/api/__init__.py +23 -23
  101. gaia/api/agent_registry.py +238 -238
  102. gaia/api/app.py +305 -305
  103. gaia/api/openai_server.py +575 -575
  104. gaia/api/schemas.py +186 -186
  105. gaia/api/sse_handler.py +373 -373
  106. gaia/apps/__init__.py +4 -4
  107. gaia/apps/llm/__init__.py +6 -6
  108. gaia/apps/llm/app.py +184 -169
  109. gaia/apps/summarize/app.py +116 -633
  110. gaia/apps/summarize/html_viewer.py +133 -133
  111. gaia/apps/summarize/pdf_formatter.py +284 -284
  112. gaia/audio/__init__.py +2 -2
  113. gaia/audio/audio_client.py +439 -439
  114. gaia/audio/audio_recorder.py +269 -269
  115. gaia/audio/kokoro_tts.py +599 -599
  116. gaia/audio/whisper_asr.py +432 -432
  117. gaia/chat/__init__.py +16 -16
  118. gaia/chat/app.py +428 -430
  119. gaia/chat/prompts.py +522 -522
  120. gaia/chat/sdk.py +1228 -1225
  121. gaia/cli.py +5659 -5632
  122. gaia/database/__init__.py +10 -10
  123. gaia/database/agent.py +176 -176
  124. gaia/database/mixin.py +290 -290
  125. gaia/database/testing.py +64 -64
  126. gaia/eval/batch_experiment.py +2332 -2332
  127. gaia/eval/claude.py +542 -542
  128. gaia/eval/config.py +37 -37
  129. gaia/eval/email_generator.py +512 -512
  130. gaia/eval/eval.py +3179 -3179
  131. gaia/eval/groundtruth.py +1130 -1130
  132. gaia/eval/transcript_generator.py +582 -582
  133. gaia/eval/webapp/README.md +167 -167
  134. gaia/eval/webapp/package-lock.json +875 -875
  135. gaia/eval/webapp/package.json +20 -20
  136. gaia/eval/webapp/public/app.js +3402 -3402
  137. gaia/eval/webapp/public/index.html +87 -87
  138. gaia/eval/webapp/public/styles.css +3661 -3661
  139. gaia/eval/webapp/server.js +415 -415
  140. gaia/eval/webapp/test-setup.js +72 -72
  141. gaia/installer/__init__.py +23 -0
  142. gaia/installer/init_command.py +1275 -0
  143. gaia/installer/lemonade_installer.py +619 -0
  144. gaia/llm/__init__.py +10 -2
  145. gaia/llm/base_client.py +60 -0
  146. gaia/llm/exceptions.py +12 -0
  147. gaia/llm/factory.py +70 -0
  148. gaia/llm/lemonade_client.py +3421 -3221
  149. gaia/llm/lemonade_manager.py +294 -294
  150. gaia/llm/providers/__init__.py +9 -0
  151. gaia/llm/providers/claude.py +108 -0
  152. gaia/llm/providers/lemonade.py +118 -0
  153. gaia/llm/providers/openai_provider.py +79 -0
  154. gaia/llm/vlm_client.py +382 -382
  155. gaia/logger.py +189 -189
  156. gaia/mcp/agent_mcp_server.py +245 -245
  157. gaia/mcp/blender_mcp_client.py +138 -138
  158. gaia/mcp/blender_mcp_server.py +648 -648
  159. gaia/mcp/context7_cache.py +332 -332
  160. gaia/mcp/external_services.py +518 -518
  161. gaia/mcp/mcp_bridge.py +811 -550
  162. gaia/mcp/servers/__init__.py +6 -6
  163. gaia/mcp/servers/docker_mcp.py +83 -83
  164. gaia/perf_analysis.py +361 -0
  165. gaia/rag/__init__.py +10 -10
  166. gaia/rag/app.py +293 -293
  167. gaia/rag/demo.py +304 -304
  168. gaia/rag/pdf_utils.py +235 -235
  169. gaia/rag/sdk.py +2194 -2194
  170. gaia/security.py +183 -163
  171. gaia/talk/app.py +287 -289
  172. gaia/talk/sdk.py +538 -538
  173. gaia/testing/__init__.py +87 -87
  174. gaia/testing/assertions.py +330 -330
  175. gaia/testing/fixtures.py +333 -333
  176. gaia/testing/mocks.py +493 -493
  177. gaia/util.py +46 -46
  178. gaia/utils/__init__.py +33 -33
  179. gaia/utils/file_watcher.py +675 -675
  180. gaia/utils/parsing.py +223 -223
  181. gaia/version.py +100 -100
  182. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  183. gaia/agents/code/app.py +0 -266
  184. gaia/llm/llm_client.py +0 -723
  185. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/top_level.txt +0 -0
@@ -1,841 +1,841 @@
1
- # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
2
- # SPDX-License-Identifier: MIT
3
- """
4
- Orchestrator for LLM-driven workflow execution.
5
-
6
- The Orchestrator controls workflow execution using Checklist Mode:
7
- - LLM generates a checklist of template invocations based on user request
8
- - Executor runs templates deterministically with error recovery
9
- - Provides semantic understanding (e.g., adds checkboxes for todos)
10
-
11
- Features:
12
- - LLM-driven checklist generation
13
- - Deterministic template execution
14
- - Error recovery with three-tier strategy
15
- - Progress reporting
16
- """
17
-
18
- import json
19
- import logging
20
- import os
21
- import re
22
- import subprocess
23
- from dataclasses import dataclass, field
24
- from pathlib import Path
25
- from typing import Any, Callable, Dict, List, Optional, Protocol
26
-
27
- from gaia.agents.base.console import AgentConsole
28
-
29
- from .steps.base import ToolExecutor, UserContext
30
- from .steps.error_handler import ErrorHandler
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
-
35
- class ProjectDirectoryError(Exception):
36
- """Raised when the project directory cannot be prepared safely."""
37
-
38
-
39
- def _estimate_token_count(text: str) -> int:
40
- """Lightweight token estimate assuming ~4 characters per token."""
41
- avg_chars_per_token = 4
42
- byte_length = len(text.encode("utf-8"))
43
- return max(1, (byte_length + avg_chars_per_token - 1) // avg_chars_per_token)
44
-
45
-
46
- class ChatSDK(Protocol):
47
- """Protocol for chat SDK interface used by checklist generator."""
48
-
49
- def send(self, message: str, timeout: int = 600, no_history: bool = False) -> Any:
50
- """Send a message and get response."""
51
- ...
52
-
53
-
54
- @dataclass
55
- class ExecutionResult:
56
- """Result of a complete workflow execution."""
57
-
58
- success: bool
59
- phases_completed: List[str] = field(default_factory=list)
60
- phases_failed: List[str] = field(default_factory=list)
61
- total_steps: int = 0
62
- steps_succeeded: int = 0
63
- steps_failed: int = 0
64
- steps_skipped: int = 0
65
- errors: List[str] = field(default_factory=list)
66
- outputs: Dict[str, Any] = field(default_factory=dict)
67
-
68
- @property
69
- def summary(self) -> str:
70
- """Get a human-readable summary."""
71
- status = "SUCCESS" if self.success else "FAILED"
72
- return (
73
- f"{status}: {self.steps_succeeded}/{self.total_steps} steps completed, "
74
- f"{self.steps_failed} failed, {self.steps_skipped} skipped"
75
- )
76
-
77
-
78
- CHECKPOINT_REVIEW_PROMPT = """You are the checkpoint reviewer for the GAIA web development agent.
79
-
80
- You receive:
81
- - The original user request
82
- - A summary of the latest checklist execution (including errors/warnings)
83
- - Logs from the validation and testing tools (run_typescript_check, validate_styles, run_tests, etc.)
84
- - Any previously requested fixes that are still outstanding
85
-
86
- Decide if the application is ready to ship or if additional fixes are required.
87
-
88
- Rules:
89
- 1. If ANY validation or test log failed, status must be \"needs_fix\" with concrete guidance.
90
- 2. Only return \"complete\" when the app works end-to-end and validations passed.
91
- 3. When fixes are needed, suggest actionable steps that can be executed through `fix_code` (LLM-assisted repair of problematic files).
92
-
93
- Respond with concise JSON only:
94
- {
95
- \"status\": \"complete\" | \"needs_fix\",
96
- \"reasoning\": \"short justification\",
97
- \"issues\": [\"list of concrete bugs or failures\"],
98
- \"fix_instructions\": [\"ordered actions the next checklist should perform\"]
99
- }
100
- """
101
-
102
- MAX_CHAT_HISTORY_TOKENS = 15000
103
-
104
-
105
- @dataclass
106
- class CheckpointAssessment:
107
- """LLM-produced verdict about the current checkpoint."""
108
-
109
- status: str
110
- reasoning: str
111
- issues: List[str] = field(default_factory=list)
112
- fix_instructions: List[str] = field(default_factory=list)
113
-
114
- @property
115
- def needs_fix(self) -> bool:
116
- """Return True when the reviewer requires another checklist."""
117
- return self.status.lower() != "complete"
118
-
119
- def to_dict(self) -> Dict[str, Any]:
120
- """Serialize the assessment."""
121
- return {
122
- "status": self.status,
123
- "reasoning": self.reasoning,
124
- "issues": self.issues,
125
- "fix_instructions": self.fix_instructions,
126
- }
127
-
128
-
129
- class Orchestrator:
130
- """Controls LLM-driven workflow execution with error recovery.
131
-
132
- The orchestrator uses Checklist Mode exclusively:
133
- - LLM analyzes user request and generates a checklist of templates
134
- - Executor runs templates deterministically
135
- - Provides semantic understanding (e.g., adds checkboxes for todos)
136
- """
137
-
138
- def __init__(
139
- self,
140
- tool_executor: ToolExecutor,
141
- llm_client: ChatSDK,
142
- llm_fixer: Optional[Callable[[str, str], Optional[str]]] = None,
143
- progress_callback: Optional[Callable[[str, str, int, int], None]] = None,
144
- console: Optional[AgentConsole] = None,
145
- max_checklist_loops: int = 10,
146
- ):
147
- """Initialize orchestrator.
148
-
149
- Args:
150
- tool_executor: Function to execute tools (name, args) -> result
151
- llm_client: Chat SDK for checklist generation (required)
152
- llm_fixer: Optional LLM-based code fixer for escalation
153
- progress_callback: Optional callback(phase, step, current, total)
154
- console: Optional console for displaying output
155
- max_checklist_loops: Max number of checklist iterations before giving up
156
- """
157
- if llm_client is None:
158
- raise ValueError("llm_client is required for Orchestrator")
159
-
160
- self.tool_executor = tool_executor
161
- self.llm_client = llm_client
162
- self.error_handler = ErrorHandler(
163
- command_executor=self._run_command,
164
- llm_fixer=llm_fixer,
165
- )
166
- self.progress_callback = progress_callback
167
- self.console = console
168
- self.max_checklist_loops = max(1, max_checklist_loops)
169
-
170
- # Initialize checklist components
171
- from .checklist_executor import ChecklistExecutor
172
- from .checklist_generator import ChecklistGenerator
173
-
174
- self.checklist_generator = ChecklistGenerator(llm_client)
175
- self.checklist_executor = ChecklistExecutor(
176
- tool_executor,
177
- llm_client=llm_client, # Pass LLM for per-item code generation
178
- error_handler=self.error_handler,
179
- progress_callback=self._checklist_progress_callback,
180
- console=console, # Pass console
181
- )
182
- logger.debug(
183
- "Orchestrator initialized - LLM will plan execution AND generate code per item"
184
- )
185
-
186
- def execute(
187
- self, context: UserContext, step_through: bool = False
188
- ) -> ExecutionResult:
189
- """Execute the workflow using iterative LLM-generated checklists."""
190
- logger.debug("Executing workflow (LLM-driven checklist loop)")
191
-
192
- from .project_analyzer import ProjectAnalyzer
193
-
194
- analyzer = ProjectAnalyzer()
195
- aggregated_validation_logs: List[Any] = []
196
- fix_feedback: List[str] = []
197
- iteration_outputs: List[Dict[str, Any]] = []
198
- combined_errors: List[str] = []
199
- previous_execution_errors: List[str] = []
200
- previous_validation_logs: List[Any] = []
201
-
202
- total_steps = 0
203
- steps_succeeded = 0
204
- steps_failed = 0
205
- success = False
206
-
207
- try:
208
- context.project_dir = self._prepare_project_directory(context)
209
- except ProjectDirectoryError as exc:
210
- error_message = str(exc)
211
- logger.error(error_message)
212
- if self.console:
213
- self.console.print_error(error_message)
214
- return ExecutionResult(
215
- success=False,
216
- phases_completed=[],
217
- phases_failed=["project_directory"],
218
- total_steps=1,
219
- steps_succeeded=0,
220
- steps_failed=1,
221
- steps_skipped=0,
222
- errors=[error_message],
223
- outputs={
224
- "iterations": [],
225
- "validation_logs": [],
226
- "fix_feedback": [],
227
- "project_dir": context.project_dir,
228
- },
229
- )
230
-
231
- for iteration in range(1, self.max_checklist_loops + 1):
232
- logger.debug("Starting checklist iteration %d", iteration)
233
-
234
- if iteration > 1:
235
- summary_result = self._maybe_summarize_conversation_history()
236
- if summary_result and self.console:
237
- self.console.print_info(
238
- "Conversation history summarized to stay within token limits."
239
- )
240
-
241
- project_state = analyzer.analyze(context.project_dir)
242
-
243
- # Surface accumulated signals to the next checklist prompt
244
- context.validation_reports = [
245
- log.to_dict() for log in aggregated_validation_logs
246
- ]
247
- context.fix_feedback = fix_feedback.copy()
248
-
249
- logger.info(
250
- "Generating checklist iteration %d of %d",
251
- iteration,
252
- self.max_checklist_loops,
253
- )
254
- if self.console:
255
- self.console.print_info(
256
- f"Generating checklist iteration {iteration} of {self.max_checklist_loops}"
257
- )
258
- if iteration == 1:
259
- checklist = self.checklist_generator.generate_initial_checklist(
260
- context, project_state
261
- )
262
- else:
263
- checklist = self.checklist_generator.generate_debug_checklist(
264
- context=context,
265
- project_state=project_state,
266
- prior_errors=previous_execution_errors,
267
- validation_logs=previous_validation_logs,
268
- )
269
-
270
- if not checklist.is_valid:
271
- logger.error(
272
- "Invalid checklist (iteration %d): %s",
273
- iteration,
274
- checklist.validation_errors,
275
- )
276
- try:
277
- checklist_dump = json.dumps(checklist.to_dict(), indent=2)
278
- except Exception: # pylint: disable=broad-exception-caught
279
- checklist_dump = str(checklist)
280
- logger.error("Invalid checklist payload: %s", checklist_dump)
281
- if self.console:
282
- self.console.pretty_print_json(
283
- checklist.to_dict(), title="Invalid Checklist"
284
- )
285
- combined_errors.extend(checklist.validation_errors)
286
- assessment = CheckpointAssessment(
287
- status="needs_fix",
288
- reasoning="Checklist validation failed",
289
- issues=checklist.validation_errors.copy(),
290
- fix_instructions=checklist.validation_errors.copy(),
291
- )
292
- iteration_outputs.append(
293
- {
294
- "iteration": iteration,
295
- "checklist": checklist.to_dict(),
296
- "execution": None,
297
- "assessment": assessment.to_dict(),
298
- }
299
- )
300
- break
301
-
302
- logger.debug(
303
- "Generated checklist with %d items: %s",
304
- len(checklist.items),
305
- checklist.reasoning,
306
- )
307
-
308
- checklist_result = self.checklist_executor.execute(
309
- checklist, context, step_through=step_through
310
- )
311
-
312
- total_steps += len(checklist_result.item_results)
313
- steps_succeeded += checklist_result.items_succeeded
314
- steps_failed += checklist_result.items_failed
315
- combined_errors.extend(checklist_result.errors)
316
-
317
- aggregated_validation_logs.extend(checklist_result.validation_logs)
318
- previous_execution_errors = checklist_result.errors.copy()
319
- previous_validation_logs = checklist_result.validation_logs.copy()
320
-
321
- logger.info("Assessing application state after iteration %d", iteration)
322
- if self.console:
323
- self.console.print_info(
324
- f"Assessing application state after iteration {iteration}"
325
- )
326
- assessment = self._assess_checkpoint(
327
- context=context,
328
- checklist=checklist,
329
- execution_result=checklist_result,
330
- validation_history=aggregated_validation_logs,
331
- )
332
- if assessment.needs_fix:
333
- logger.info(
334
- "Application not ready after iteration %d, planning another checklist: %s",
335
- iteration,
336
- assessment.reasoning or "no reasoning provided",
337
- )
338
- if self.console:
339
- self.console.print_info(
340
- "Application not ready; preparing another checklist."
341
- )
342
- else:
343
- logger.info(
344
- "Application marked complete after iteration %d: %s",
345
- iteration,
346
- assessment.reasoning or "no reasoning provided",
347
- )
348
- if self.console:
349
- self.console.print_success("Application marked complete.")
350
-
351
- iteration_outputs.append(
352
- {
353
- "iteration": iteration,
354
- "checklist": checklist.to_dict(),
355
- "execution": {
356
- "summary": checklist_result.summary,
357
- "success": checklist_result.success,
358
- "files": checklist_result.total_files,
359
- "errors": checklist_result.errors,
360
- "warnings": checklist_result.warnings,
361
- "item_results": [
362
- r.to_dict() for r in checklist_result.item_results
363
- ],
364
- "validation_logs": [
365
- log.to_dict() for log in checklist_result.validation_logs
366
- ],
367
- },
368
- "assessment": assessment.to_dict(),
369
- }
370
- )
371
-
372
- if not assessment.needs_fix:
373
- success = (
374
- checklist_result.success and assessment.status.lower() == "complete"
375
- )
376
- break
377
-
378
- instructions = assessment.fix_instructions or assessment.issues
379
- if not instructions and assessment.reasoning:
380
- instructions = [assessment.reasoning]
381
- if instructions:
382
- fix_feedback.extend(instructions)
383
-
384
- else:
385
- combined_errors.append(
386
- f"Reached maximum checklist iterations ({self.max_checklist_loops}) without passing validation"
387
- )
388
-
389
- latest_execution = None
390
- latest_checklist = None
391
- if iteration_outputs:
392
- latest_entry = iteration_outputs[-1]
393
- latest_execution = latest_entry.get("execution")
394
- latest_checklist = latest_entry.get("checklist")
395
-
396
- outputs = {
397
- "iterations": iteration_outputs,
398
- "validation_logs": [log.to_dict() for log in aggregated_validation_logs],
399
- "fix_feedback": fix_feedback,
400
- "project_dir": context.project_dir,
401
- }
402
-
403
- if latest_execution:
404
- outputs["files"] = latest_execution.get("files", [])
405
- outputs["detailed_results"] = latest_execution.get("item_results", [])
406
- if latest_checklist:
407
- outputs["checklist"] = latest_checklist
408
-
409
- return ExecutionResult(
410
- success=success,
411
- phases_completed=["checklist"] if success else [],
412
- phases_failed=[] if success else ["checklist"],
413
- total_steps=total_steps,
414
- steps_succeeded=steps_succeeded,
415
- steps_failed=steps_failed,
416
- steps_skipped=0,
417
- errors=combined_errors,
418
- outputs=outputs,
419
- )
420
-
421
- def _run_command(self, command: str, cwd: Optional[str] = None) -> tuple[int, str]:
422
- """Run a shell command.
423
-
424
- Args:
425
- command: Command to run
426
- cwd: Working directory
427
-
428
- Returns:
429
- Tuple of (exit_code, output)
430
- """
431
- try:
432
- result = subprocess.run(
433
- command,
434
- shell=True,
435
- cwd=cwd,
436
- capture_output=True,
437
- text=True,
438
- timeout=1200,
439
- check=False, # We handle return codes ourselves
440
- )
441
- output = result.stdout + result.stderr
442
- return result.returncode, output
443
- except subprocess.TimeoutExpired:
444
- return 1, "Command timed out"
445
- except Exception as e:
446
- return 1, str(e)
447
-
448
- def _checklist_progress_callback(
449
- self, description: str, current: int, total: int
450
- ) -> None:
451
- """Progress callback adapter for checklist execution.
452
-
453
- Converts checklist progress format to the standard progress format.
454
-
455
- Args:
456
- description: Current item description
457
- current: Current item number
458
- total: Total items
459
- """
460
- if self.progress_callback:
461
- self.progress_callback("checklist", description, current, total)
462
-
463
- def _assess_checkpoint(
464
- self,
465
- context: UserContext,
466
- checklist: Any,
467
- execution_result: Any,
468
- validation_history: List[Any],
469
- ) -> CheckpointAssessment:
470
- """Ask the LLM whether the workflow is complete or needs another checklist."""
471
- prompt = self._build_checkpoint_prompt(
472
- context=context,
473
- checklist=checklist,
474
- execution_result=execution_result,
475
- validation_history=validation_history,
476
- )
477
-
478
- try:
479
- response = self.llm_client.send(prompt, timeout=1200)
480
- data = self._parse_checkpoint_response(response)
481
- return CheckpointAssessment(
482
- status=data.get("status", "needs_fix"),
483
- reasoning=data.get("reasoning", ""),
484
- issues=data.get("issues", []),
485
- fix_instructions=data.get("fix_instructions", []),
486
- )
487
- except Exception as exc: # pylint: disable=broad-exception-caught
488
- logger.exception("Checkpoint assessment failed")
489
- return CheckpointAssessment(
490
- status="needs_fix",
491
- reasoning="Failed to interpret checkpoint reviewer output",
492
- issues=[f"Checkpoint reviewer error: {exc}"],
493
- fix_instructions=[
494
- "Inspect validation logs, then fix the root cause using fix_code."
495
- ],
496
- )
497
-
498
- def _build_checkpoint_prompt(
499
- self,
500
- context: UserContext,
501
- checklist: Any,
502
- execution_result: Any,
503
- validation_history: List[Any],
504
- ) -> str:
505
- """Build the prompt for the checkpoint reviewer."""
506
- validation_summary = self._format_validation_history(
507
- validation_history, getattr(execution_result, "validation_logs", None)
508
- )
509
-
510
- outstanding = (
511
- "\n".join(f"- {item}" for item in context.fix_feedback)
512
- if context.fix_feedback
513
- else "None"
514
- )
515
-
516
- errors = execution_result.errors or ["None"]
517
- warnings = execution_result.warnings or []
518
-
519
- sections = [
520
- CHECKPOINT_REVIEW_PROMPT.strip(),
521
- "",
522
- "## User Request",
523
- context.user_request,
524
- "",
525
- "## Latest Checklist Plan",
526
- f"Reasoning: {checklist.reasoning}",
527
- "",
528
- "## Execution Summary",
529
- execution_result.summary,
530
- "",
531
- "## Execution Errors",
532
- "\n".join(f"- {err}" for err in errors),
533
- "",
534
- "## Execution Warnings",
535
- "\n".join(f"- {warn}" for warn in warnings) if warnings else "None",
536
- "",
537
- "## Validation & Test Logs",
538
- validation_summary,
539
- "",
540
- "## Outstanding Fix Requests",
541
- outstanding,
542
- ]
543
-
544
- return "\n".join(sections)
545
-
546
- def _maybe_summarize_conversation_history(self) -> Optional[str]:
547
- """Trigger ChatSDK conversation summarization when available."""
548
- chat_sdk = getattr(self, "llm_client", None)
549
- if not chat_sdk or not hasattr(chat_sdk, "summarize_conversation_history"):
550
- return None
551
-
552
- try:
553
- summary = chat_sdk.summarize_conversation_history(
554
- max_history_tokens=MAX_CHAT_HISTORY_TOKENS
555
- )
556
- if summary:
557
- logger.info(
558
- "Conversation history summarized to ~%d tokens",
559
- _estimate_token_count(summary),
560
- )
561
- return summary
562
- except Exception as exc: # pylint: disable=broad-exception-caught
563
- logger.exception("Failed to summarize conversation history: %s", exc)
564
- return None
565
-
566
- def _prepare_project_directory(self, context: UserContext) -> str:
567
- """
568
- Ensure the project directory is ready for creation workflows.
569
-
570
- If the provided path exists and is non-empty without an existing project,
571
- pick a unique subdirectory via the LLM to avoid create-next-app failures.
572
- """
573
- base_path = Path(context.project_dir).expanduser()
574
- if base_path.exists() and not base_path.is_dir():
575
- raise ProjectDirectoryError(
576
- f"Provided path is not a directory: {base_path}"
577
- )
578
-
579
- if not base_path.exists():
580
- base_path.mkdir(parents=True, exist_ok=True)
581
- logger.info("Created project directory: %s", base_path)
582
- return str(base_path)
583
-
584
- existing_entries = [p.name for p in base_path.iterdir()]
585
- if not existing_entries:
586
- return str(base_path)
587
-
588
- if self.console:
589
- self.console.print_warning(
590
- f"Target directory {base_path} is not empty; selecting a new subdirectory."
591
- )
592
-
593
- suggested = self._choose_subdirectory_name(
594
- base_path, existing_entries, context.user_request
595
- )
596
- if not suggested:
597
- raise ProjectDirectoryError(
598
- f"Unable to find an available project name under {base_path}. "
599
- "Provide one explicitly with --path."
600
- )
601
-
602
- new_dir = base_path / suggested
603
- new_dir.mkdir(parents=False, exist_ok=False)
604
- logger.info("Using nested project directory: %s", new_dir)
605
- # Align process cwd with the newly created project directory.
606
- try:
607
- os.chdir(new_dir)
608
- except OSError as exc:
609
- logger.warning("Failed to chdir to %s: %s", new_dir, exc)
610
- if self.console:
611
- self.console.print_info(f"Using project directory: {new_dir}")
612
- return str(new_dir)
613
-
614
- def _choose_subdirectory_name(
615
- self, base_path: Path, existing_entries: List[str], user_request: str
616
- ) -> Optional[str]:
617
- """Ask the LLM for a unique subdirectory name, retrying on conflicts."""
618
- existing_lower = {name.lower() for name in existing_entries}
619
- prompt = self._build_directory_prompt(
620
- base_path, existing_entries, user_request, None
621
- )
622
- last_reason = None
623
-
624
- system_prompt = "You suggest concise folder names for new projects."
625
-
626
- for attempt in range(1, 4):
627
- try:
628
- response = self._send_prompt_without_history(
629
- prompt, timeout=120, system_prompt=system_prompt
630
- )
631
- except Exception as exc: # pylint: disable=broad-exception-caught
632
- last_reason = f"LLM error on attempt {attempt}: {exc}"
633
- logger.warning(last_reason)
634
- prompt = self._build_directory_prompt(
635
- base_path, existing_entries, user_request, last_reason
636
- )
637
- continue
638
-
639
- raw_response = self._extract_response_text(response)
640
- candidate = self._sanitize_directory_name(raw_response)
641
- if not candidate:
642
- last_reason = "LLM returned an empty or invalid directory name."
643
- elif candidate.lower() in existing_lower:
644
- last_reason = f"Name '{candidate}' already exists in {base_path}."
645
- elif "/" in candidate or "\\" in candidate or ".." in candidate:
646
- last_reason = "Directory name contained path separators or traversal."
647
- elif len(candidate) > 64:
648
- last_reason = "Directory name exceeded 64 characters."
649
- else:
650
- candidate_path = base_path / candidate
651
- if candidate_path.exists():
652
- last_reason = f"Directory '{candidate}' already exists."
653
- else:
654
- return candidate
655
-
656
- logger.warning(
657
- "Directory name attempt %d rejected: %s", attempt, last_reason
658
- )
659
- prompt = self._build_directory_prompt(
660
- base_path, existing_entries, user_request, last_reason
661
- )
662
-
663
- return None
664
-
665
- @staticmethod
666
- def _sanitize_directory_name(raw: str) -> str:
667
- """Normalize LLM output to a filesystem-safe directory name."""
668
- if not raw:
669
- return ""
670
- candidate = raw.strip().strip("`'\"")
671
- candidate = candidate.splitlines()[0].strip()
672
- candidate = re.sub(r"[^A-Za-z0-9_-]+", "-", candidate)
673
- return candidate.strip("-_").lower()
674
-
675
- def _send_prompt_without_history(
676
- self, prompt: str, timeout: int = 120, system_prompt: Optional[str] = None
677
- ) -> Any:
678
- """
679
- Send a prompt without reading from or writing to chat history.
680
-
681
- Prefers the underlying LLM client's `generate` API when available,
682
- falling back to `send(..., no_history=True)` for compatibility.
683
- """
684
- # If the ChatSDK exposes the underlying LLM client, use it directly with chat messages
685
- # to avoid any stored history and ensure system prompts are applied cleanly.
686
- llm_client = getattr(self.llm_client, "llm_client", None)
687
- if llm_client and hasattr(llm_client, "generate"):
688
- model = getattr(getattr(self.llm_client, "config", None), "model", None)
689
- messages = []
690
- if system_prompt:
691
- messages.append({"role": "system", "content": system_prompt})
692
- messages.append({"role": "user", "content": prompt})
693
- return llm_client.generate(
694
- prompt=prompt,
695
- messages=messages,
696
- model=model,
697
- timeout=timeout,
698
- endpoint="chat",
699
- )
700
-
701
- # Fallback: use send with no_history to avoid persisting messages.
702
- if hasattr(self.llm_client, "send"):
703
- return self.llm_client.send(
704
- prompt, timeout=timeout, no_history=True, system_prompt=system_prompt
705
- )
706
-
707
- raise ValueError("LLM client does not support generate or send APIs")
708
-
709
- @staticmethod
710
- def _build_directory_prompt(
711
- base_path: Path,
712
- existing_entries: List[str],
713
- user_request: Optional[str],
714
- rejection_reason: Optional[str],
715
- ) -> str:
716
- """Construct the LLM prompt for picking a safe project subdirectory."""
717
- entries = sorted(existing_entries)
718
- max_list = 50
719
- if len(entries) > max_list:
720
- entries_display = "\n".join(f"- {name}" for name in entries[:max_list])
721
- entries_display += f"\n- ...and {len(entries) - max_list} more"
722
- else:
723
- entries_display = "\n".join(f"- {name}" for name in entries)
724
-
725
- prompt_sections = [
726
- "You must choose a new folder name for a project because the target path is not empty.",
727
- f"Base path: {base_path}",
728
- "Existing files and folders you MUST avoid (do not reuse any of these names):",
729
- entries_display or "- <empty>",
730
- "User request driving this project:",
731
- user_request or "<no request provided>",
732
- "Rules:",
733
- "- Return a single folder name only. Do NOT echo the instructions. No paths, quotes, JSON, or extra text.",
734
- "- Use lowercase kebab-case or snake_case; ASCII letters, numbers, hyphens, and underscores only.",
735
- "- Do not use any existing names above. Avoid dots, spaces, or slashes.",
736
- "- Keep it under 40 characters.",
737
- ]
738
-
739
- if rejection_reason:
740
- prompt_sections.append(
741
- f"Previous suggestion was rejected: {rejection_reason}. Try a different unique name."
742
- )
743
-
744
- return "\n".join(prompt_sections)
745
-
746
    def _format_validation_history(
        self, validation_history: List[Any], latest_plan_logs: Optional[List[Any]]
    ) -> str:
        """Format validation logs, splitting latest plan from historical ones.

        Args:
            validation_history: All validation log entries accumulated so far;
                entries may be dicts or objects exposing ``to_dict()``.
            latest_plan_logs: Logs produced by the most recent checklist run.

        Returns:
            Markdown-style text with a "Latest Plan Results" section followed
            by up to five entries of "Previous Plan History".
        """

        if not validation_history:
            return "No validation or test commands have been executed yet."

        latest_logs = latest_plan_logs or []
        latest_count = len(latest_logs)
        # NOTE(review): assumes latest_plan_logs is exactly the tail of
        # validation_history, so slicing off the last latest_count entries
        # yields the historical remainder — confirm at the call site.
        historical_logs = (
            validation_history[:-latest_count] if latest_count else validation_history
        )

        def normalize(entry: Any) -> Dict[str, Any]:
            # Accept either log objects (via to_dict) or plain dicts;
            # anything else renders as an empty record.
            if hasattr(entry, "to_dict"):
                return entry.to_dict()
            if isinstance(entry, dict):
                return entry
            return {}

        def render(entries: List[Any], limit: Optional[int] = None) -> List[str]:
            # Render entries as bullet lines; keep only the newest `limit`.
            if not entries:
                return ["None"]

            selected = entries if limit is None else entries[-limit:]
            lines: List[str] = []
            for entry in selected:
                data = normalize(entry)
                template = data.get("template", "unknown")
                description = data.get("description", "")
                # Missing "success" defaults to True, i.e. PASS.
                success = data.get("success", True)
                status = "PASS" if success else "FAIL"
                error = data.get("error")
                output = data.get("output", {})

                lines.append(f"- [{status}] {template}: {description}")
                if error:
                    lines.append(f"  Error: {error}")

                # Pick the most informative output field, preferring streams
                # over free-form fields; fall back to a JSON dump capped at
                # 400 characters to keep the reviewer prompt bounded.
                snippet = ""
                if isinstance(output, dict):
                    for key in ("stdout", "stderr", "message", "log", "details"):
                        if output.get(key):
                            snippet = str(output[key])
                            break
                    if not snippet and output:
                        snippet = json.dumps(output)[:400]
                elif output:
                    snippet = str(output)[:400]

                snippet = snippet.strip()
                if snippet:
                    lines.append(f"  Output: {snippet[:400]}")
            return lines

        sections: List[str] = []
        sections.append("### Latest Plan Results")
        sections.extend(render(list(latest_logs)))
        sections.append("")
        sections.append("### Previous Plan History")
        sections.extend(render(list(historical_logs), limit=5))

        return "\n".join(sections).strip()
810
-
811
- def _parse_checkpoint_response(self, response: Any) -> Dict[str, Any]:
812
- """Parse JSON output from the checkpoint reviewer."""
813
- text = self._extract_response_text(response)
814
- json_str = self._extract_json(text)
815
- return json.loads(json_str)
816
-
817
- @staticmethod
818
- def _extract_response_text(response: Any) -> str:
819
- """Normalize SDK response objects to raw text."""
820
- if isinstance(response, str):
821
- return response
822
- if hasattr(response, "text"):
823
- return response.text
824
- if hasattr(response, "content"):
825
- return response.content
826
- if isinstance(response, dict):
827
- return response.get("text", response.get("content", str(response)))
828
- return str(response)
829
-
830
- @staticmethod
831
- def _extract_json(text: str) -> str:
832
- """Extract JSON blob from arbitrary text (markdown-safe)."""
833
- code_block = re.search(r"```(?:json)?\\s*\\n?(.*?)\\n?```", text, re.DOTALL)
834
- if code_block:
835
- return code_block.group(1).strip()
836
-
837
- json_match = re.search(r"\\{.*\\}", text, re.DOTALL)
838
- if json_match:
839
- return json_match.group(0)
840
-
841
- return text.strip()
1
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
2
+ # SPDX-License-Identifier: MIT
3
+ """
4
+ Orchestrator for LLM-driven workflow execution.
5
+
6
+ The Orchestrator controls workflow execution using Checklist Mode:
7
+ - LLM generates a checklist of template invocations based on user request
8
+ - Executor runs templates deterministically with error recovery
9
+ - Provides semantic understanding (e.g., adds checkboxes for todos)
10
+
11
+ Features:
12
+ - LLM-driven checklist generation
13
+ - Deterministic template execution
14
+ - Error recovery with three-tier strategy
15
+ - Progress reporting
16
+ """
17
+
18
+ import json
19
+ import logging
20
+ import os
21
+ import re
22
+ import subprocess
23
+ from dataclasses import dataclass, field
24
+ from pathlib import Path
25
+ from typing import Any, Callable, Dict, List, Optional, Protocol
26
+
27
+ from gaia.agents.base.console import AgentConsole
28
+
29
+ from .steps.base import ToolExecutor, UserContext
30
+ from .steps.error_handler import ErrorHandler
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class ProjectDirectoryError(Exception):
    """Signals that the target project directory could not be prepared safely."""
37
+
38
+
39
+ def _estimate_token_count(text: str) -> int:
40
+ """Lightweight token estimate assuming ~4 characters per token."""
41
+ avg_chars_per_token = 4
42
+ byte_length = len(text.encode("utf-8"))
43
+ return max(1, (byte_length + avg_chars_per_token - 1) // avg_chars_per_token)
44
+
45
+
46
class ChatSDK(Protocol):
    """Protocol for chat SDK interface used by checklist generator.

    Structural (duck-typed) interface: any client exposing a compatible
    ``send`` method satisfies it; no inheritance is required.
    """

    # Only the subset of the SDK surface needed by this module is declared.
    def send(self, message: str, timeout: int = 600, no_history: bool = False) -> Any:
        """Send a message and get response.

        Args:
            message: Prompt text to deliver to the model.
            timeout: Seconds to wait for the reply (default 600).
            no_history: When True, presumably skips persisting the exchange
                in conversation history — confirm with the implementation.
        """
        ...
52
+
53
+
54
@dataclass
class ExecutionResult:
    """Aggregate outcome of one complete workflow run."""

    # Overall verdict for the run.
    success: bool
    # Phase names that finished / failed.
    phases_completed: List[str] = field(default_factory=list)
    phases_failed: List[str] = field(default_factory=list)
    # Step-level counters across all iterations.
    total_steps: int = 0
    steps_succeeded: int = 0
    steps_failed: int = 0
    steps_skipped: int = 0
    # Collected error messages and arbitrary structured outputs.
    errors: List[str] = field(default_factory=list)
    outputs: Dict[str, Any] = field(default_factory=dict)

    @property
    def summary(self) -> str:
        """One-line human-readable report of the run."""
        label = "SUCCESS" if self.success else "FAILED"
        counts = (
            f"{self.steps_succeeded}/{self.total_steps} steps completed, "
            f"{self.steps_failed} failed, {self.steps_skipped} skipped"
        )
        return f"{label}: {counts}"
76
+
77
+
78
# Instructions injected at the top of every checkpoint-review prompt (see
# _build_checkpoint_prompt). The reviewer must answer with the JSON schema
# shown at the bottom; _parse_checkpoint_response() json.loads() that reply.
CHECKPOINT_REVIEW_PROMPT = """You are the checkpoint reviewer for the GAIA web development agent.

You receive:
- The original user request
- A summary of the latest checklist execution (including errors/warnings)
- Logs from the validation and testing tools (run_typescript_check, validate_styles, run_tests, etc.)
- Any previously requested fixes that are still outstanding

Decide if the application is ready to ship or if additional fixes are required.

Rules:
1. If ANY validation or test log failed, status must be \"needs_fix\" with concrete guidance.
2. Only return \"complete\" when the app works end-to-end and validations passed.
3. When fixes are needed, suggest actionable steps that can be executed through `fix_code` (LLM-assisted repair of problematic files).

Respond with concise JSON only:
{
  \"status\": \"complete\" | \"needs_fix\",
  \"reasoning\": \"short justification\",
  \"issues\": [\"list of concrete bugs or failures\"],
  \"fix_instructions\": [\"ordered actions the next checklist should perform\"]
}
"""

# Token budget handed to ChatSDK.summarize_conversation_history() between
# checklist iterations (see _maybe_summarize_conversation_history).
MAX_CHAT_HISTORY_TOKENS = 15000
103
+
104
+
105
@dataclass
class CheckpointAssessment:
    """LLM-produced verdict about the current checkpoint."""

    status: str
    reasoning: str
    issues: List[str] = field(default_factory=list)
    fix_instructions: List[str] = field(default_factory=list)

    @property
    def needs_fix(self) -> bool:
        """True unless the reviewer declared the checkpoint "complete"."""
        return "complete" != self.status.lower()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the assessment to a plain dict."""
        return dict(
            status=self.status,
            reasoning=self.reasoning,
            issues=self.issues,
            fix_instructions=self.fix_instructions,
        )
127
+
128
+
129
+ class Orchestrator:
130
+ """Controls LLM-driven workflow execution with error recovery.
131
+
132
+ The orchestrator uses Checklist Mode exclusively:
133
+ - LLM analyzes user request and generates a checklist of templates
134
+ - Executor runs templates deterministically
135
+ - Provides semantic understanding (e.g., adds checkboxes for todos)
136
+ """
137
+
138
    def __init__(
        self,
        tool_executor: ToolExecutor,
        llm_client: ChatSDK,
        llm_fixer: Optional[Callable[[str, str], Optional[str]]] = None,
        progress_callback: Optional[Callable[[str, str, int, int], None]] = None,
        console: Optional[AgentConsole] = None,
        max_checklist_loops: int = 10,
    ):
        """Initialize orchestrator.

        Args:
            tool_executor: Function to execute tools (name, args) -> result
            llm_client: Chat SDK for checklist generation (required)
            llm_fixer: Optional LLM-based code fixer for escalation
            progress_callback: Optional callback(phase, step, current, total)
            console: Optional console for displaying output
            max_checklist_loops: Max number of checklist iterations before giving up

        Raises:
            ValueError: If ``llm_client`` is None.
        """
        if llm_client is None:
            raise ValueError("llm_client is required for Orchestrator")

        self.tool_executor = tool_executor
        self.llm_client = llm_client
        # Error handler escalates through shell commands and (optionally) the
        # LLM fixer supplied by the caller.
        self.error_handler = ErrorHandler(
            command_executor=self._run_command,
            llm_fixer=llm_fixer,
        )
        self.progress_callback = progress_callback
        self.console = console
        # Clamp so execute() always performs at least one plan/run iteration.
        self.max_checklist_loops = max(1, max_checklist_loops)

        # Initialize checklist components
        # Imported here rather than at module top — presumably to avoid a
        # circular import with this module; confirm before hoisting.
        from .checklist_executor import ChecklistExecutor
        from .checklist_generator import ChecklistGenerator

        self.checklist_generator = ChecklistGenerator(llm_client)
        self.checklist_executor = ChecklistExecutor(
            tool_executor,
            llm_client=llm_client,  # Pass LLM for per-item code generation
            error_handler=self.error_handler,
            progress_callback=self._checklist_progress_callback,
            console=console,  # Pass console
        )
        logger.debug(
            "Orchestrator initialized - LLM will plan execution AND generate code per item"
        )
185
+
186
    def execute(
        self, context: UserContext, step_through: bool = False
    ) -> ExecutionResult:
        """Execute the workflow using iterative LLM-generated checklists.

        Args:
            context: Mutable workflow context; ``project_dir``,
                ``validation_reports`` and ``fix_feedback`` are updated here.
            step_through: Forwarded to the checklist executor.

        Returns:
            ExecutionResult aggregating step counts, errors, and per-iteration
            outputs across all checklist iterations.
        """
        logger.debug("Executing workflow (LLM-driven checklist loop)")

        from .project_analyzer import ProjectAnalyzer

        # Accumulators that persist across checklist iterations.
        analyzer = ProjectAnalyzer()
        aggregated_validation_logs: List[Any] = []
        fix_feedback: List[str] = []
        iteration_outputs: List[Dict[str, Any]] = []
        combined_errors: List[str] = []
        previous_execution_errors: List[str] = []
        previous_validation_logs: List[Any] = []

        total_steps = 0
        steps_succeeded = 0
        steps_failed = 0
        success = False

        # Phase 0: make sure there is a usable, empty project directory.
        try:
            context.project_dir = self._prepare_project_directory(context)
        except ProjectDirectoryError as exc:
            error_message = str(exc)
            logger.error(error_message)
            if self.console:
                self.console.print_error(error_message)
            # Hard failure: report a single failed "project_directory" phase.
            return ExecutionResult(
                success=False,
                phases_completed=[],
                phases_failed=["project_directory"],
                total_steps=1,
                steps_succeeded=0,
                steps_failed=1,
                steps_skipped=0,
                errors=[error_message],
                outputs={
                    "iterations": [],
                    "validation_logs": [],
                    "fix_feedback": [],
                    "project_dir": context.project_dir,
                },
            )

        # Main loop: plan -> execute -> assess, up to max_checklist_loops times.
        for iteration in range(1, self.max_checklist_loops + 1):
            logger.debug("Starting checklist iteration %d", iteration)

            # From the second iteration on, try to compact chat history so
            # repeated planning prompts stay within the token budget.
            if iteration > 1:
                summary_result = self._maybe_summarize_conversation_history()
                if summary_result and self.console:
                    self.console.print_info(
                        "Conversation history summarized to stay within token limits."
                    )

            project_state = analyzer.analyze(context.project_dir)

            # Surface accumulated signals to the next checklist prompt
            context.validation_reports = [
                log.to_dict() for log in aggregated_validation_logs
            ]
            context.fix_feedback = fix_feedback.copy()

            logger.info(
                "Generating checklist iteration %d of %d",
                iteration,
                self.max_checklist_loops,
            )
            if self.console:
                self.console.print_info(
                    f"Generating checklist iteration {iteration} of {self.max_checklist_loops}"
                )
            # First iteration plans from scratch; later ones plan a debug pass
            # seeded with the previous iteration's errors and validation logs.
            if iteration == 1:
                checklist = self.checklist_generator.generate_initial_checklist(
                    context, project_state
                )
            else:
                checklist = self.checklist_generator.generate_debug_checklist(
                    context=context,
                    project_state=project_state,
                    prior_errors=previous_execution_errors,
                    validation_logs=previous_validation_logs,
                )

            # A structurally invalid checklist aborts the loop: record a
            # synthetic "needs_fix" assessment and bail out.
            if not checklist.is_valid:
                logger.error(
                    "Invalid checklist (iteration %d): %s",
                    iteration,
                    checklist.validation_errors,
                )
                try:
                    checklist_dump = json.dumps(checklist.to_dict(), indent=2)
                except Exception:  # pylint: disable=broad-exception-caught
                    checklist_dump = str(checklist)
                logger.error("Invalid checklist payload: %s", checklist_dump)
                if self.console:
                    self.console.pretty_print_json(
                        checklist.to_dict(), title="Invalid Checklist"
                    )
                combined_errors.extend(checklist.validation_errors)
                assessment = CheckpointAssessment(
                    status="needs_fix",
                    reasoning="Checklist validation failed",
                    issues=checklist.validation_errors.copy(),
                    fix_instructions=checklist.validation_errors.copy(),
                )
                iteration_outputs.append(
                    {
                        "iteration": iteration,
                        "checklist": checklist.to_dict(),
                        "execution": None,
                        "assessment": assessment.to_dict(),
                    }
                )
                break

            logger.debug(
                "Generated checklist with %d items: %s",
                len(checklist.items),
                checklist.reasoning,
            )

            checklist_result = self.checklist_executor.execute(
                checklist, context, step_through=step_through
            )

            # Fold this iteration's results into the run-wide counters.
            total_steps += len(checklist_result.item_results)
            steps_succeeded += checklist_result.items_succeeded
            steps_failed += checklist_result.items_failed
            combined_errors.extend(checklist_result.errors)

            # previous_* hold only the latest iteration's signals, which seed
            # the next debug checklist; aggregated logs keep the full history.
            aggregated_validation_logs.extend(checklist_result.validation_logs)
            previous_execution_errors = checklist_result.errors.copy()
            previous_validation_logs = checklist_result.validation_logs.copy()

            logger.info("Assessing application state after iteration %d", iteration)
            if self.console:
                self.console.print_info(
                    f"Assessing application state after iteration {iteration}"
                )
            assessment = self._assess_checkpoint(
                context=context,
                checklist=checklist,
                execution_result=checklist_result,
                validation_history=aggregated_validation_logs,
            )
            if assessment.needs_fix:
                logger.info(
                    "Application not ready after iteration %d, planning another checklist: %s",
                    iteration,
                    assessment.reasoning or "no reasoning provided",
                )
                if self.console:
                    self.console.print_info(
                        "Application not ready; preparing another checklist."
                    )
            else:
                logger.info(
                    "Application marked complete after iteration %d: %s",
                    iteration,
                    assessment.reasoning or "no reasoning provided",
                )
                if self.console:
                    self.console.print_success("Application marked complete.")

            iteration_outputs.append(
                {
                    "iteration": iteration,
                    "checklist": checklist.to_dict(),
                    "execution": {
                        "summary": checklist_result.summary,
                        "success": checklist_result.success,
                        "files": checklist_result.total_files,
                        "errors": checklist_result.errors,
                        "warnings": checklist_result.warnings,
                        "item_results": [
                            r.to_dict() for r in checklist_result.item_results
                        ],
                        "validation_logs": [
                            log.to_dict() for log in checklist_result.validation_logs
                        ],
                    },
                    "assessment": assessment.to_dict(),
                }
            )

            if not assessment.needs_fix:
                success = (
                    checklist_result.success and assessment.status.lower() == "complete"
                )
                break

            # Carry the reviewer's requested fixes into the next iteration.
            instructions = assessment.fix_instructions or assessment.issues
            if not instructions and assessment.reasoning:
                instructions = [assessment.reasoning]
            if instructions:
                fix_feedback.extend(instructions)

        else:
            # for/else: only reached when the loop exhausted without break.
            combined_errors.append(
                f"Reached maximum checklist iterations ({self.max_checklist_loops}) without passing validation"
            )

        # Surface the last iteration's artifacts at the top level of outputs.
        latest_execution = None
        latest_checklist = None
        if iteration_outputs:
            latest_entry = iteration_outputs[-1]
            latest_execution = latest_entry.get("execution")
            latest_checklist = latest_entry.get("checklist")

        outputs = {
            "iterations": iteration_outputs,
            "validation_logs": [log.to_dict() for log in aggregated_validation_logs],
            "fix_feedback": fix_feedback,
            "project_dir": context.project_dir,
        }

        if latest_execution:
            outputs["files"] = latest_execution.get("files", [])
            outputs["detailed_results"] = latest_execution.get("item_results", [])
        if latest_checklist:
            outputs["checklist"] = latest_checklist

        return ExecutionResult(
            success=success,
            phases_completed=["checklist"] if success else [],
            phases_failed=[] if success else ["checklist"],
            total_steps=total_steps,
            steps_succeeded=steps_succeeded,
            steps_failed=steps_failed,
            steps_skipped=0,
            errors=combined_errors,
            outputs=outputs,
        )
420
+
421
+ def _run_command(self, command: str, cwd: Optional[str] = None) -> tuple[int, str]:
422
+ """Run a shell command.
423
+
424
+ Args:
425
+ command: Command to run
426
+ cwd: Working directory
427
+
428
+ Returns:
429
+ Tuple of (exit_code, output)
430
+ """
431
+ try:
432
+ result = subprocess.run(
433
+ command,
434
+ shell=True,
435
+ cwd=cwd,
436
+ capture_output=True,
437
+ text=True,
438
+ timeout=1200,
439
+ check=False, # We handle return codes ourselves
440
+ )
441
+ output = result.stdout + result.stderr
442
+ return result.returncode, output
443
+ except subprocess.TimeoutExpired:
444
+ return 1, "Command timed out"
445
+ except Exception as e:
446
+ return 1, str(e)
447
+
448
+ def _checklist_progress_callback(
449
+ self, description: str, current: int, total: int
450
+ ) -> None:
451
+ """Progress callback adapter for checklist execution.
452
+
453
+ Converts checklist progress format to the standard progress format.
454
+
455
+ Args:
456
+ description: Current item description
457
+ current: Current item number
458
+ total: Total items
459
+ """
460
+ if self.progress_callback:
461
+ self.progress_callback("checklist", description, current, total)
462
+
463
+ def _assess_checkpoint(
464
+ self,
465
+ context: UserContext,
466
+ checklist: Any,
467
+ execution_result: Any,
468
+ validation_history: List[Any],
469
+ ) -> CheckpointAssessment:
470
+ """Ask the LLM whether the workflow is complete or needs another checklist."""
471
+ prompt = self._build_checkpoint_prompt(
472
+ context=context,
473
+ checklist=checklist,
474
+ execution_result=execution_result,
475
+ validation_history=validation_history,
476
+ )
477
+
478
+ try:
479
+ response = self.llm_client.send(prompt, timeout=1200)
480
+ data = self._parse_checkpoint_response(response)
481
+ return CheckpointAssessment(
482
+ status=data.get("status", "needs_fix"),
483
+ reasoning=data.get("reasoning", ""),
484
+ issues=data.get("issues", []),
485
+ fix_instructions=data.get("fix_instructions", []),
486
+ )
487
+ except Exception as exc: # pylint: disable=broad-exception-caught
488
+ logger.exception("Checkpoint assessment failed")
489
+ return CheckpointAssessment(
490
+ status="needs_fix",
491
+ reasoning="Failed to interpret checkpoint reviewer output",
492
+ issues=[f"Checkpoint reviewer error: {exc}"],
493
+ fix_instructions=[
494
+ "Inspect validation logs, then fix the root cause using fix_code."
495
+ ],
496
+ )
497
+
498
+ def _build_checkpoint_prompt(
499
+ self,
500
+ context: UserContext,
501
+ checklist: Any,
502
+ execution_result: Any,
503
+ validation_history: List[Any],
504
+ ) -> str:
505
+ """Build the prompt for the checkpoint reviewer."""
506
+ validation_summary = self._format_validation_history(
507
+ validation_history, getattr(execution_result, "validation_logs", None)
508
+ )
509
+
510
+ outstanding = (
511
+ "\n".join(f"- {item}" for item in context.fix_feedback)
512
+ if context.fix_feedback
513
+ else "None"
514
+ )
515
+
516
+ errors = execution_result.errors or ["None"]
517
+ warnings = execution_result.warnings or []
518
+
519
+ sections = [
520
+ CHECKPOINT_REVIEW_PROMPT.strip(),
521
+ "",
522
+ "## User Request",
523
+ context.user_request,
524
+ "",
525
+ "## Latest Checklist Plan",
526
+ f"Reasoning: {checklist.reasoning}",
527
+ "",
528
+ "## Execution Summary",
529
+ execution_result.summary,
530
+ "",
531
+ "## Execution Errors",
532
+ "\n".join(f"- {err}" for err in errors),
533
+ "",
534
+ "## Execution Warnings",
535
+ "\n".join(f"- {warn}" for warn in warnings) if warnings else "None",
536
+ "",
537
+ "## Validation & Test Logs",
538
+ validation_summary,
539
+ "",
540
+ "## Outstanding Fix Requests",
541
+ outstanding,
542
+ ]
543
+
544
+ return "\n".join(sections)
545
+
546
+ def _maybe_summarize_conversation_history(self) -> Optional[str]:
547
+ """Trigger ChatSDK conversation summarization when available."""
548
+ chat_sdk = getattr(self, "llm_client", None)
549
+ if not chat_sdk or not hasattr(chat_sdk, "summarize_conversation_history"):
550
+ return None
551
+
552
+ try:
553
+ summary = chat_sdk.summarize_conversation_history(
554
+ max_history_tokens=MAX_CHAT_HISTORY_TOKENS
555
+ )
556
+ if summary:
557
+ logger.info(
558
+ "Conversation history summarized to ~%d tokens",
559
+ _estimate_token_count(summary),
560
+ )
561
+ return summary
562
+ except Exception as exc: # pylint: disable=broad-exception-caught
563
+ logger.exception("Failed to summarize conversation history: %s", exc)
564
+ return None
565
+
566
    def _prepare_project_directory(self, context: UserContext) -> str:
        """
        Ensure the project directory is ready for creation workflows.

        If the provided path exists and is non-empty without an existing project,
        pick a unique subdirectory via the LLM to avoid create-next-app failures.

        Returns:
            The directory path (as a string) the workflow should use.

        Raises:
            ProjectDirectoryError: If the path is not a directory, or no
                available subdirectory name could be chosen.
        """
        base_path = Path(context.project_dir).expanduser()
        if base_path.exists() and not base_path.is_dir():
            raise ProjectDirectoryError(
                f"Provided path is not a directory: {base_path}"
            )

        # Fresh path: create it (including parents) and use it directly.
        if not base_path.exists():
            base_path.mkdir(parents=True, exist_ok=True)
            logger.info("Created project directory: %s", base_path)
            return str(base_path)

        # Existing but empty directory is fine as-is.
        existing_entries = [p.name for p in base_path.iterdir()]
        if not existing_entries:
            return str(base_path)

        if self.console:
            self.console.print_warning(
                f"Target directory {base_path} is not empty; selecting a new subdirectory."
            )

        suggested = self._choose_subdirectory_name(
            base_path, existing_entries, context.user_request
        )
        if not suggested:
            raise ProjectDirectoryError(
                f"Unable to find an available project name under {base_path}. "
                "Provide one explicitly with --path."
            )

        # exist_ok=False: fail loudly if the name raced into existence after
        # the availability check in _choose_subdirectory_name.
        new_dir = base_path / suggested
        new_dir.mkdir(parents=False, exist_ok=False)
        logger.info("Using nested project directory: %s", new_dir)
        # Align process cwd with the newly created project directory.
        # NOTE(review): os.chdir is a process-global side effect; a failed
        # chdir is only warned about, so callers must rely on the returned
        # path rather than the cwd.
        try:
            os.chdir(new_dir)
        except OSError as exc:
            logger.warning("Failed to chdir to %s: %s", new_dir, exc)
        if self.console:
            self.console.print_info(f"Using project directory: {new_dir}")
        return str(new_dir)
613
+
614
    def _choose_subdirectory_name(
        self, base_path: Path, existing_entries: List[str], user_request: str
    ) -> Optional[str]:
        """Ask the LLM for a unique subdirectory name, retrying on conflicts.

        Makes up to three attempts; each rejection reason is fed back into
        the next prompt. Returns None when no acceptable name was produced.
        """
        # Case-insensitive conflict check against the current entries.
        existing_lower = {name.lower() for name in existing_entries}
        prompt = self._build_directory_prompt(
            base_path, existing_entries, user_request, None
        )
        last_reason = None

        system_prompt = "You suggest concise folder names for new projects."

        for attempt in range(1, 4):
            try:
                response = self._send_prompt_without_history(
                    prompt, timeout=120, system_prompt=system_prompt
                )
            except Exception as exc:  # pylint: disable=broad-exception-caught
                # LLM call failed: record the reason and re-prompt with it.
                last_reason = f"LLM error on attempt {attempt}: {exc}"
                logger.warning(last_reason)
                prompt = self._build_directory_prompt(
                    base_path, existing_entries, user_request, last_reason
                )
                continue

            raw_response = self._extract_response_text(response)
            candidate = self._sanitize_directory_name(raw_response)
            # Validation chain: the first failing check sets the rejection
            # reason used to build the next prompt.
            if not candidate:
                last_reason = "LLM returned an empty or invalid directory name."
            elif candidate.lower() in existing_lower:
                last_reason = f"Name '{candidate}' already exists in {base_path}."
            elif "/" in candidate or "\\" in candidate or ".." in candidate:
                last_reason = "Directory name contained path separators or traversal."
            elif len(candidate) > 64:
                last_reason = "Directory name exceeded 64 characters."
            else:
                # Filesystem check catches entries created after listing and
                # platform-specific case collisions.
                candidate_path = base_path / candidate
                if candidate_path.exists():
                    last_reason = f"Directory '{candidate}' already exists."
                else:
                    return candidate

            logger.warning(
                "Directory name attempt %d rejected: %s", attempt, last_reason
            )
            prompt = self._build_directory_prompt(
                base_path, existing_entries, user_request, last_reason
            )

        return None
664
+
665
+ @staticmethod
666
+ def _sanitize_directory_name(raw: str) -> str:
667
+ """Normalize LLM output to a filesystem-safe directory name."""
668
+ if not raw:
669
+ return ""
670
+ candidate = raw.strip().strip("`'\"")
671
+ candidate = candidate.splitlines()[0].strip()
672
+ candidate = re.sub(r"[^A-Za-z0-9_-]+", "-", candidate)
673
+ return candidate.strip("-_").lower()
674
+
675
    def _send_prompt_without_history(
        self, prompt: str, timeout: int = 120, system_prompt: Optional[str] = None
    ) -> Any:
        """
        Send a prompt without reading from or writing to chat history.

        Prefers the underlying LLM client's `generate` API when available,
        falling back to `send(..., no_history=True)` for compatibility.

        Raises:
            ValueError: If the client exposes neither `generate` nor `send`.
        """
        # If the ChatSDK exposes the underlying LLM client, use it directly with chat messages
        # to avoid any stored history and ensure system prompts are applied cleanly.
        llm_client = getattr(self.llm_client, "llm_client", None)
        if llm_client and hasattr(llm_client, "generate"):
            # Model name is optional; None lets the client pick its default.
            model = getattr(getattr(self.llm_client, "config", None), "model", None)
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": prompt})
            # NOTE(review): both prompt= and messages= are passed; presumably
            # the client prefers messages when given — confirm with its API.
            return llm_client.generate(
                prompt=prompt,
                messages=messages,
                model=model,
                timeout=timeout,
                endpoint="chat",
            )

        # Fallback: use send with no_history to avoid persisting messages.
        if hasattr(self.llm_client, "send"):
            return self.llm_client.send(
                prompt, timeout=timeout, no_history=True, system_prompt=system_prompt
            )

        raise ValueError("LLM client does not support generate or send APIs")
708
+
709
+ @staticmethod
710
+ def _build_directory_prompt(
711
+ base_path: Path,
712
+ existing_entries: List[str],
713
+ user_request: Optional[str],
714
+ rejection_reason: Optional[str],
715
+ ) -> str:
716
+ """Construct the LLM prompt for picking a safe project subdirectory."""
717
+ entries = sorted(existing_entries)
718
+ max_list = 50
719
+ if len(entries) > max_list:
720
+ entries_display = "\n".join(f"- {name}" for name in entries[:max_list])
721
+ entries_display += f"\n- ...and {len(entries) - max_list} more"
722
+ else:
723
+ entries_display = "\n".join(f"- {name}" for name in entries)
724
+
725
+ prompt_sections = [
726
+ "You must choose a new folder name for a project because the target path is not empty.",
727
+ f"Base path: {base_path}",
728
+ "Existing files and folders you MUST avoid (do not reuse any of these names):",
729
+ entries_display or "- <empty>",
730
+ "User request driving this project:",
731
+ user_request or "<no request provided>",
732
+ "Rules:",
733
+ "- Return a single folder name only. Do NOT echo the instructions. No paths, quotes, JSON, or extra text.",
734
+ "- Use lowercase kebab-case or snake_case; ASCII letters, numbers, hyphens, and underscores only.",
735
+ "- Do not use any existing names above. Avoid dots, spaces, or slashes.",
736
+ "- Keep it under 40 characters.",
737
+ ]
738
+
739
+ if rejection_reason:
740
+ prompt_sections.append(
741
+ f"Previous suggestion was rejected: {rejection_reason}. Try a different unique name."
742
+ )
743
+
744
+ return "\n".join(prompt_sections)
745
+
746
+ def _format_validation_history(
747
+ self, validation_history: List[Any], latest_plan_logs: Optional[List[Any]]
748
+ ) -> str:
749
+ """Format validation logs, splitting latest plan from historical ones."""
750
+
751
+ if not validation_history:
752
+ return "No validation or test commands have been executed yet."
753
+
754
+ latest_logs = latest_plan_logs or []
755
+ latest_count = len(latest_logs)
756
+ historical_logs = (
757
+ validation_history[:-latest_count] if latest_count else validation_history
758
+ )
759
+
760
+ def normalize(entry: Any) -> Dict[str, Any]:
761
+ if hasattr(entry, "to_dict"):
762
+ return entry.to_dict()
763
+ if isinstance(entry, dict):
764
+ return entry
765
+ return {}
766
+
767
+ def render(entries: List[Any], limit: Optional[int] = None) -> List[str]:
768
+ if not entries:
769
+ return ["None"]
770
+
771
+ selected = entries if limit is None else entries[-limit:]
772
+ lines: List[str] = []
773
+ for entry in selected:
774
+ data = normalize(entry)
775
+ template = data.get("template", "unknown")
776
+ description = data.get("description", "")
777
+ success = data.get("success", True)
778
+ status = "PASS" if success else "FAIL"
779
+ error = data.get("error")
780
+ output = data.get("output", {})
781
+
782
+ lines.append(f"- [{status}] {template}: {description}")
783
+ if error:
784
+ lines.append(f" Error: {error}")
785
+
786
+ snippet = ""
787
+ if isinstance(output, dict):
788
+ for key in ("stdout", "stderr", "message", "log", "details"):
789
+ if output.get(key):
790
+ snippet = str(output[key])
791
+ break
792
+ if not snippet and output:
793
+ snippet = json.dumps(output)[:400]
794
+ elif output:
795
+ snippet = str(output)[:400]
796
+
797
+ snippet = snippet.strip()
798
+ if snippet:
799
+ lines.append(f" Output: {snippet[:400]}")
800
+ return lines
801
+
802
+ sections: List[str] = []
803
+ sections.append("### Latest Plan Results")
804
+ sections.extend(render(list(latest_logs)))
805
+ sections.append("")
806
+ sections.append("### Previous Plan History")
807
+ sections.extend(render(list(historical_logs), limit=5))
808
+
809
+ return "\n".join(sections).strip()
810
+
811
+ def _parse_checkpoint_response(self, response: Any) -> Dict[str, Any]:
812
+ """Parse JSON output from the checkpoint reviewer."""
813
+ text = self._extract_response_text(response)
814
+ json_str = self._extract_json(text)
815
+ return json.loads(json_str)
816
+
817
+ @staticmethod
818
+ def _extract_response_text(response: Any) -> str:
819
+ """Normalize SDK response objects to raw text."""
820
+ if isinstance(response, str):
821
+ return response
822
+ if hasattr(response, "text"):
823
+ return response.text
824
+ if hasattr(response, "content"):
825
+ return response.content
826
+ if isinstance(response, dict):
827
+ return response.get("text", response.get("content", str(response)))
828
+ return str(response)
829
+
830
+ @staticmethod
831
+ def _extract_json(text: str) -> str:
832
+ """Extract JSON blob from arbitrary text (markdown-safe)."""
833
+ code_block = re.search(r"```(?:json)?\\s*\\n?(.*?)\\n?```", text, re.DOTALL)
834
+ if code_block:
835
+ return code_block.group(1).strip()
836
+
837
+ json_match = re.search(r"\\{.*\\}", text, re.DOTALL)
838
+ if json_match:
839
+ return json_match.group(0)
840
+
841
+ return text.strip()