amd-gaia 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (181)
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/METADATA +223 -223
  2. amd_gaia-0.15.1.dist-info/RECORD +178 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/entry_points.txt +1 -0
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/licenses/LICENSE.md +20 -20
  5. gaia/__init__.py +29 -29
  6. gaia/agents/__init__.py +19 -19
  7. gaia/agents/base/__init__.py +9 -9
  8. gaia/agents/base/agent.py +2177 -2177
  9. gaia/agents/base/api_agent.py +120 -120
  10. gaia/agents/base/console.py +1841 -1841
  11. gaia/agents/base/errors.py +237 -237
  12. gaia/agents/base/mcp_agent.py +86 -86
  13. gaia/agents/base/tools.py +83 -83
  14. gaia/agents/blender/agent.py +556 -556
  15. gaia/agents/blender/agent_simple.py +133 -135
  16. gaia/agents/blender/app.py +211 -211
  17. gaia/agents/blender/app_simple.py +41 -41
  18. gaia/agents/blender/core/__init__.py +16 -16
  19. gaia/agents/blender/core/materials.py +506 -506
  20. gaia/agents/blender/core/objects.py +316 -316
  21. gaia/agents/blender/core/rendering.py +225 -225
  22. gaia/agents/blender/core/scene.py +220 -220
  23. gaia/agents/blender/core/view.py +146 -146
  24. gaia/agents/chat/__init__.py +9 -9
  25. gaia/agents/chat/agent.py +835 -835
  26. gaia/agents/chat/app.py +1058 -1058
  27. gaia/agents/chat/session.py +508 -508
  28. gaia/agents/chat/tools/__init__.py +15 -15
  29. gaia/agents/chat/tools/file_tools.py +96 -96
  30. gaia/agents/chat/tools/rag_tools.py +1729 -1729
  31. gaia/agents/chat/tools/shell_tools.py +436 -436
  32. gaia/agents/code/__init__.py +7 -7
  33. gaia/agents/code/agent.py +549 -549
  34. gaia/agents/code/cli.py +377 -0
  35. gaia/agents/code/models.py +135 -135
  36. gaia/agents/code/orchestration/__init__.py +24 -24
  37. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  38. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  39. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  40. gaia/agents/code/orchestration/factories/base.py +63 -63
  41. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  42. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  43. gaia/agents/code/orchestration/orchestrator.py +841 -841
  44. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  45. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  46. gaia/agents/code/orchestration/steps/base.py +188 -188
  47. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  48. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  49. gaia/agents/code/orchestration/steps/python.py +307 -307
  50. gaia/agents/code/orchestration/template_catalog.py +469 -469
  51. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  52. gaia/agents/code/orchestration/workflows/base.py +80 -80
  53. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  54. gaia/agents/code/orchestration/workflows/python.py +94 -94
  55. gaia/agents/code/prompts/__init__.py +11 -11
  56. gaia/agents/code/prompts/base_prompt.py +77 -77
  57. gaia/agents/code/prompts/code_patterns.py +2036 -2036
  58. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  59. gaia/agents/code/prompts/python_prompt.py +109 -109
  60. gaia/agents/code/schema_inference.py +365 -365
  61. gaia/agents/code/system_prompt.py +41 -41
  62. gaia/agents/code/tools/__init__.py +42 -42
  63. gaia/agents/code/tools/cli_tools.py +1138 -1138
  64. gaia/agents/code/tools/code_formatting.py +319 -319
  65. gaia/agents/code/tools/code_tools.py +769 -769
  66. gaia/agents/code/tools/error_fixing.py +1347 -1347
  67. gaia/agents/code/tools/external_tools.py +180 -180
  68. gaia/agents/code/tools/file_io.py +845 -845
  69. gaia/agents/code/tools/prisma_tools.py +190 -190
  70. gaia/agents/code/tools/project_management.py +1016 -1016
  71. gaia/agents/code/tools/testing.py +321 -321
  72. gaia/agents/code/tools/typescript_tools.py +122 -122
  73. gaia/agents/code/tools/validation_parsing.py +461 -461
  74. gaia/agents/code/tools/validation_tools.py +806 -806
  75. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  76. gaia/agents/code/validators/__init__.py +16 -16
  77. gaia/agents/code/validators/antipattern_checker.py +241 -241
  78. gaia/agents/code/validators/ast_analyzer.py +197 -197
  79. gaia/agents/code/validators/requirements_validator.py +145 -145
  80. gaia/agents/code/validators/syntax_validator.py +171 -171
  81. gaia/agents/docker/__init__.py +7 -7
  82. gaia/agents/docker/agent.py +642 -642
  83. gaia/agents/emr/__init__.py +8 -8
  84. gaia/agents/emr/agent.py +1506 -1506
  85. gaia/agents/emr/cli.py +1322 -1322
  86. gaia/agents/emr/constants.py +475 -475
  87. gaia/agents/emr/dashboard/__init__.py +4 -4
  88. gaia/agents/emr/dashboard/server.py +1974 -1974
  89. gaia/agents/jira/__init__.py +11 -11
  90. gaia/agents/jira/agent.py +894 -894
  91. gaia/agents/jira/jql_templates.py +299 -299
  92. gaia/agents/routing/__init__.py +7 -7
  93. gaia/agents/routing/agent.py +567 -570
  94. gaia/agents/routing/system_prompt.py +75 -75
  95. gaia/agents/summarize/__init__.py +11 -0
  96. gaia/agents/summarize/agent.py +885 -0
  97. gaia/agents/summarize/prompts.py +129 -0
  98. gaia/api/__init__.py +23 -23
  99. gaia/api/agent_registry.py +238 -238
  100. gaia/api/app.py +305 -305
  101. gaia/api/openai_server.py +575 -575
  102. gaia/api/schemas.py +186 -186
  103. gaia/api/sse_handler.py +373 -373
  104. gaia/apps/__init__.py +4 -4
  105. gaia/apps/llm/__init__.py +6 -6
  106. gaia/apps/llm/app.py +173 -169
  107. gaia/apps/summarize/app.py +116 -633
  108. gaia/apps/summarize/html_viewer.py +133 -133
  109. gaia/apps/summarize/pdf_formatter.py +284 -284
  110. gaia/audio/__init__.py +2 -2
  111. gaia/audio/audio_client.py +439 -439
  112. gaia/audio/audio_recorder.py +269 -269
  113. gaia/audio/kokoro_tts.py +599 -599
  114. gaia/audio/whisper_asr.py +432 -432
  115. gaia/chat/__init__.py +16 -16
  116. gaia/chat/app.py +430 -430
  117. gaia/chat/prompts.py +522 -522
  118. gaia/chat/sdk.py +1228 -1225
  119. gaia/cli.py +5481 -5632
  120. gaia/database/__init__.py +10 -10
  121. gaia/database/agent.py +176 -176
  122. gaia/database/mixin.py +290 -290
  123. gaia/database/testing.py +64 -64
  124. gaia/eval/batch_experiment.py +2332 -2332
  125. gaia/eval/claude.py +542 -542
  126. gaia/eval/config.py +37 -37
  127. gaia/eval/email_generator.py +512 -512
  128. gaia/eval/eval.py +3179 -3179
  129. gaia/eval/groundtruth.py +1130 -1130
  130. gaia/eval/transcript_generator.py +582 -582
  131. gaia/eval/webapp/README.md +167 -167
  132. gaia/eval/webapp/package-lock.json +875 -875
  133. gaia/eval/webapp/package.json +20 -20
  134. gaia/eval/webapp/public/app.js +3402 -3402
  135. gaia/eval/webapp/public/index.html +87 -87
  136. gaia/eval/webapp/public/styles.css +3661 -3661
  137. gaia/eval/webapp/server.js +415 -415
  138. gaia/eval/webapp/test-setup.js +72 -72
  139. gaia/llm/__init__.py +9 -2
  140. gaia/llm/base_client.py +60 -0
  141. gaia/llm/exceptions.py +12 -0
  142. gaia/llm/factory.py +70 -0
  143. gaia/llm/lemonade_client.py +3236 -3221
  144. gaia/llm/lemonade_manager.py +294 -294
  145. gaia/llm/providers/__init__.py +9 -0
  146. gaia/llm/providers/claude.py +108 -0
  147. gaia/llm/providers/lemonade.py +120 -0
  148. gaia/llm/providers/openai_provider.py +79 -0
  149. gaia/llm/vlm_client.py +382 -382
  150. gaia/logger.py +189 -189
  151. gaia/mcp/agent_mcp_server.py +245 -245
  152. gaia/mcp/blender_mcp_client.py +138 -138
  153. gaia/mcp/blender_mcp_server.py +648 -648
  154. gaia/mcp/context7_cache.py +332 -332
  155. gaia/mcp/external_services.py +518 -518
  156. gaia/mcp/mcp_bridge.py +811 -550
  157. gaia/mcp/servers/__init__.py +6 -6
  158. gaia/mcp/servers/docker_mcp.py +83 -83
  159. gaia/perf_analysis.py +361 -0
  160. gaia/rag/__init__.py +10 -10
  161. gaia/rag/app.py +293 -293
  162. gaia/rag/demo.py +304 -304
  163. gaia/rag/pdf_utils.py +235 -235
  164. gaia/rag/sdk.py +2194 -2194
  165. gaia/security.py +163 -163
  166. gaia/talk/app.py +289 -289
  167. gaia/talk/sdk.py +538 -538
  168. gaia/testing/__init__.py +87 -87
  169. gaia/testing/assertions.py +330 -330
  170. gaia/testing/fixtures.py +333 -333
  171. gaia/testing/mocks.py +493 -493
  172. gaia/util.py +46 -46
  173. gaia/utils/__init__.py +33 -33
  174. gaia/utils/file_watcher.py +675 -675
  175. gaia/utils/parsing.py +223 -223
  176. gaia/version.py +100 -100
  177. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  178. gaia/agents/code/app.py +0 -266
  179. gaia/llm/llm_client.py +0 -723
  180. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/WHEEL +0 -0
  181. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/top_level.txt +0 -0
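
Notable in this release: `gaia/llm/llm_client.py` is removed and replaced by `gaia/llm/base_client.py`, `gaia/llm/factory.py`, and a new `gaia/llm/providers/` package (claude, lemonade, openai_provider), which suggests a provider-factory split for LLM backends. As a rough sketch of that pattern only — every name below is an illustrative assumption, not the package's actual API:

# Illustrative sketch of a provider-factory layout. All names here are
# assumptions for explanation only, NOT the actual amd-gaia API.
from abc import ABC, abstractmethod


class BaseLLMClient(ABC):
    """Hypothetical shared interface a base_client.py module might define."""

    @abstractmethod
    def generate(self, prompt: str, timeout: int = 600) -> str: ...


class LemonadeProvider(BaseLLMClient):
    """Hypothetical concrete provider (placeholder body)."""

    def generate(self, prompt: str, timeout: int = 600) -> str:
        raise NotImplementedError("placeholder for a local Lemonade backend")


def create_client(provider: str) -> BaseLLMClient:
    """Hypothetical factory: map a provider name to a client instance."""
    providers = {"lemonade": LemonadeProvider}
    try:
        return providers[provider]()
    except KeyError:
        raise ValueError(f"Unknown provider: {provider}") from None

The point of such a split is that callers depend only on the abstract interface while the factory owns the provider-name mapping, so new backends can be added without touching call sites.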
@@ -1,841 +1,841 @@
- # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
- # SPDX-License-Identifier: MIT
- """
- Orchestrator for LLM-driven workflow execution.
-
- The Orchestrator controls workflow execution using Checklist Mode:
- - LLM generates a checklist of template invocations based on user request
- - Executor runs templates deterministically with error recovery
- - Provides semantic understanding (e.g., adds checkboxes for todos)
-
- Features:
- - LLM-driven checklist generation
- - Deterministic template execution
- - Error recovery with three-tier strategy
- - Progress reporting
- """
-
- import json
- import logging
- import os
- import re
- import subprocess
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Protocol
-
- from gaia.agents.base.console import AgentConsole
-
- from .steps.base import ToolExecutor, UserContext
- from .steps.error_handler import ErrorHandler
-
- logger = logging.getLogger(__name__)
-
-
- class ProjectDirectoryError(Exception):
-     """Raised when the project directory cannot be prepared safely."""
-
-
- def _estimate_token_count(text: str) -> int:
-     """Lightweight token estimate assuming ~4 characters per token."""
-     avg_chars_per_token = 4
-     byte_length = len(text.encode("utf-8"))
-     return max(1, (byte_length + avg_chars_per_token - 1) // avg_chars_per_token)
-
-
- class ChatSDK(Protocol):
-     """Protocol for chat SDK interface used by checklist generator."""
-
-     def send(self, message: str, timeout: int = 600, no_history: bool = False) -> Any:
-         """Send a message and get response."""
-         ...
-
-
- @dataclass
- class ExecutionResult:
-     """Result of a complete workflow execution."""
-
-     success: bool
-     phases_completed: List[str] = field(default_factory=list)
-     phases_failed: List[str] = field(default_factory=list)
-     total_steps: int = 0
-     steps_succeeded: int = 0
-     steps_failed: int = 0
-     steps_skipped: int = 0
-     errors: List[str] = field(default_factory=list)
-     outputs: Dict[str, Any] = field(default_factory=dict)
-
-     @property
-     def summary(self) -> str:
-         """Get a human-readable summary."""
-         status = "SUCCESS" if self.success else "FAILED"
-         return (
-             f"{status}: {self.steps_succeeded}/{self.total_steps} steps completed, "
-             f"{self.steps_failed} failed, {self.steps_skipped} skipped"
-         )
-
-
- CHECKPOINT_REVIEW_PROMPT = """You are the checkpoint reviewer for the GAIA web development agent.
-
- You receive:
- - The original user request
- - A summary of the latest checklist execution (including errors/warnings)
- - Logs from the validation and testing tools (run_typescript_check, validate_styles, run_tests, etc.)
- - Any previously requested fixes that are still outstanding
-
- Decide if the application is ready to ship or if additional fixes are required.
-
- Rules:
- 1. If ANY validation or test log failed, status must be \"needs_fix\" with concrete guidance.
- 2. Only return \"complete\" when the app works end-to-end and validations passed.
- 3. When fixes are needed, suggest actionable steps that can be executed through `fix_code` (LLM-assisted repair of problematic files).
-
- Respond with concise JSON only:
- {
- \"status\": \"complete\" | \"needs_fix\",
- \"reasoning\": \"short justification\",
- \"issues\": [\"list of concrete bugs or failures\"],
- \"fix_instructions\": [\"ordered actions the next checklist should perform\"]
- }
- """
-
- MAX_CHAT_HISTORY_TOKENS = 15000
-
-
- @dataclass
- class CheckpointAssessment:
-     """LLM-produced verdict about the current checkpoint."""
-
-     status: str
-     reasoning: str
-     issues: List[str] = field(default_factory=list)
-     fix_instructions: List[str] = field(default_factory=list)
-
-     @property
-     def needs_fix(self) -> bool:
-         """Return True when the reviewer requires another checklist."""
-         return self.status.lower() != "complete"
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Serialize the assessment."""
-         return {
-             "status": self.status,
-             "reasoning": self.reasoning,
-             "issues": self.issues,
-             "fix_instructions": self.fix_instructions,
-         }
-
-
- class Orchestrator:
-     """Controls LLM-driven workflow execution with error recovery.
-
-     The orchestrator uses Checklist Mode exclusively:
-     - LLM analyzes user request and generates a checklist of templates
-     - Executor runs templates deterministically
-     - Provides semantic understanding (e.g., adds checkboxes for todos)
-     """
-
-     def __init__(
-         self,
-         tool_executor: ToolExecutor,
-         llm_client: ChatSDK,
-         llm_fixer: Optional[Callable[[str, str], Optional[str]]] = None,
-         progress_callback: Optional[Callable[[str, str, int, int], None]] = None,
-         console: Optional[AgentConsole] = None,
-         max_checklist_loops: int = 10,
-     ):
-         """Initialize orchestrator.
-
-         Args:
-             tool_executor: Function to execute tools (name, args) -> result
-             llm_client: Chat SDK for checklist generation (required)
-             llm_fixer: Optional LLM-based code fixer for escalation
-             progress_callback: Optional callback(phase, step, current, total)
-             console: Optional console for displaying output
-             max_checklist_loops: Max number of checklist iterations before giving up
-         """
-         if llm_client is None:
-             raise ValueError("llm_client is required for Orchestrator")
-
-         self.tool_executor = tool_executor
-         self.llm_client = llm_client
-         self.error_handler = ErrorHandler(
-             command_executor=self._run_command,
-             llm_fixer=llm_fixer,
-         )
-         self.progress_callback = progress_callback
-         self.console = console
-         self.max_checklist_loops = max(1, max_checklist_loops)
-
-         # Initialize checklist components
-         from .checklist_executor import ChecklistExecutor
-         from .checklist_generator import ChecklistGenerator
-
-         self.checklist_generator = ChecklistGenerator(llm_client)
-         self.checklist_executor = ChecklistExecutor(
-             tool_executor,
-             llm_client=llm_client, # Pass LLM for per-item code generation
-             error_handler=self.error_handler,
-             progress_callback=self._checklist_progress_callback,
-             console=console, # Pass console
-         )
-         logger.debug(
-             "Orchestrator initialized - LLM will plan execution AND generate code per item"
-         )
-
-     def execute(
-         self, context: UserContext, step_through: bool = False
-     ) -> ExecutionResult:
-         """Execute the workflow using iterative LLM-generated checklists."""
-         logger.debug("Executing workflow (LLM-driven checklist loop)")
-
-         from .project_analyzer import ProjectAnalyzer
-
-         analyzer = ProjectAnalyzer()
-         aggregated_validation_logs: List[Any] = []
-         fix_feedback: List[str] = []
-         iteration_outputs: List[Dict[str, Any]] = []
-         combined_errors: List[str] = []
-         previous_execution_errors: List[str] = []
-         previous_validation_logs: List[Any] = []
-
-         total_steps = 0
-         steps_succeeded = 0
-         steps_failed = 0
-         success = False
-
-         try:
-             context.project_dir = self._prepare_project_directory(context)
-         except ProjectDirectoryError as exc:
-             error_message = str(exc)
-             logger.error(error_message)
-             if self.console:
-                 self.console.print_error(error_message)
-             return ExecutionResult(
-                 success=False,
-                 phases_completed=[],
-                 phases_failed=["project_directory"],
-                 total_steps=1,
-                 steps_succeeded=0,
-                 steps_failed=1,
-                 steps_skipped=0,
-                 errors=[error_message],
-                 outputs={
-                     "iterations": [],
-                     "validation_logs": [],
-                     "fix_feedback": [],
-                     "project_dir": context.project_dir,
-                 },
-             )
-
-         for iteration in range(1, self.max_checklist_loops + 1):
-             logger.debug("Starting checklist iteration %d", iteration)
-
-             if iteration > 1:
-                 summary_result = self._maybe_summarize_conversation_history()
-                 if summary_result and self.console:
-                     self.console.print_info(
-                         "Conversation history summarized to stay within token limits."
-                     )
-
-             project_state = analyzer.analyze(context.project_dir)
-
-             # Surface accumulated signals to the next checklist prompt
-             context.validation_reports = [
-                 log.to_dict() for log in aggregated_validation_logs
-             ]
-             context.fix_feedback = fix_feedback.copy()
-
-             logger.info(
-                 "Generating checklist iteration %d of %d",
-                 iteration,
-                 self.max_checklist_loops,
-             )
-             if self.console:
-                 self.console.print_info(
-                     f"Generating checklist iteration {iteration} of {self.max_checklist_loops}"
-                 )
-             if iteration == 1:
-                 checklist = self.checklist_generator.generate_initial_checklist(
-                     context, project_state
-                 )
-             else:
-                 checklist = self.checklist_generator.generate_debug_checklist(
-                     context=context,
-                     project_state=project_state,
-                     prior_errors=previous_execution_errors,
-                     validation_logs=previous_validation_logs,
-                 )
-
-             if not checklist.is_valid:
-                 logger.error(
-                     "Invalid checklist (iteration %d): %s",
-                     iteration,
-                     checklist.validation_errors,
-                 )
-                 try:
-                     checklist_dump = json.dumps(checklist.to_dict(), indent=2)
-                 except Exception: # pylint: disable=broad-exception-caught
-                     checklist_dump = str(checklist)
-                 logger.error("Invalid checklist payload: %s", checklist_dump)
-                 if self.console:
-                     self.console.pretty_print_json(
-                         checklist.to_dict(), title="Invalid Checklist"
-                     )
-                 combined_errors.extend(checklist.validation_errors)
-                 assessment = CheckpointAssessment(
-                     status="needs_fix",
-                     reasoning="Checklist validation failed",
-                     issues=checklist.validation_errors.copy(),
-                     fix_instructions=checklist.validation_errors.copy(),
-                 )
-                 iteration_outputs.append(
-                     {
-                         "iteration": iteration,
-                         "checklist": checklist.to_dict(),
-                         "execution": None,
-                         "assessment": assessment.to_dict(),
-                     }
-                 )
-                 break
-
-             logger.debug(
-                 "Generated checklist with %d items: %s",
-                 len(checklist.items),
-                 checklist.reasoning,
-             )
-
-             checklist_result = self.checklist_executor.execute(
-                 checklist, context, step_through=step_through
-             )
-
-             total_steps += len(checklist_result.item_results)
-             steps_succeeded += checklist_result.items_succeeded
-             steps_failed += checklist_result.items_failed
-             combined_errors.extend(checklist_result.errors)
-
-             aggregated_validation_logs.extend(checklist_result.validation_logs)
-             previous_execution_errors = checklist_result.errors.copy()
-             previous_validation_logs = checklist_result.validation_logs.copy()
-
-             logger.info("Assessing application state after iteration %d", iteration)
-             if self.console:
-                 self.console.print_info(
-                     f"Assessing application state after iteration {iteration}"
-                 )
-             assessment = self._assess_checkpoint(
-                 context=context,
-                 checklist=checklist,
-                 execution_result=checklist_result,
-                 validation_history=aggregated_validation_logs,
-             )
-             if assessment.needs_fix:
-                 logger.info(
-                     "Application not ready after iteration %d, planning another checklist: %s",
-                     iteration,
-                     assessment.reasoning or "no reasoning provided",
-                 )
-                 if self.console:
-                     self.console.print_info(
-                         "Application not ready; preparing another checklist."
-                     )
-             else:
-                 logger.info(
-                     "Application marked complete after iteration %d: %s",
-                     iteration,
-                     assessment.reasoning or "no reasoning provided",
-                 )
-                 if self.console:
-                     self.console.print_success("Application marked complete.")
-
-             iteration_outputs.append(
-                 {
-                     "iteration": iteration,
-                     "checklist": checklist.to_dict(),
-                     "execution": {
-                         "summary": checklist_result.summary,
-                         "success": checklist_result.success,
-                         "files": checklist_result.total_files,
-                         "errors": checklist_result.errors,
-                         "warnings": checklist_result.warnings,
-                         "item_results": [
-                             r.to_dict() for r in checklist_result.item_results
-                         ],
-                         "validation_logs": [
-                             log.to_dict() for log in checklist_result.validation_logs
-                         ],
-                     },
-                     "assessment": assessment.to_dict(),
-                 }
-             )
-
-             if not assessment.needs_fix:
-                 success = (
-                     checklist_result.success and assessment.status.lower() == "complete"
-                 )
-                 break
-
-             instructions = assessment.fix_instructions or assessment.issues
-             if not instructions and assessment.reasoning:
-                 instructions = [assessment.reasoning]
-             if instructions:
-                 fix_feedback.extend(instructions)
-
-         else:
-             combined_errors.append(
-                 f"Reached maximum checklist iterations ({self.max_checklist_loops}) without passing validation"
-             )
-
-         latest_execution = None
-         latest_checklist = None
-         if iteration_outputs:
-             latest_entry = iteration_outputs[-1]
-             latest_execution = latest_entry.get("execution")
-             latest_checklist = latest_entry.get("checklist")
-
-         outputs = {
-             "iterations": iteration_outputs,
-             "validation_logs": [log.to_dict() for log in aggregated_validation_logs],
-             "fix_feedback": fix_feedback,
-             "project_dir": context.project_dir,
-         }
-
-         if latest_execution:
-             outputs["files"] = latest_execution.get("files", [])
-             outputs["detailed_results"] = latest_execution.get("item_results", [])
-         if latest_checklist:
-             outputs["checklist"] = latest_checklist
-
-         return ExecutionResult(
-             success=success,
-             phases_completed=["checklist"] if success else [],
-             phases_failed=[] if success else ["checklist"],
-             total_steps=total_steps,
-             steps_succeeded=steps_succeeded,
-             steps_failed=steps_failed,
-             steps_skipped=0,
-             errors=combined_errors,
-             outputs=outputs,
-         )
-
-     def _run_command(self, command: str, cwd: Optional[str] = None) -> tuple[int, str]:
-         """Run a shell command.
-
-         Args:
-             command: Command to run
-             cwd: Working directory
-
-         Returns:
-             Tuple of (exit_code, output)
-         """
-         try:
-             result = subprocess.run(
-                 command,
-                 shell=True,
-                 cwd=cwd,
-                 capture_output=True,
-                 text=True,
-                 timeout=1200,
-                 check=False, # We handle return codes ourselves
-             )
-             output = result.stdout + result.stderr
-             return result.returncode, output
-         except subprocess.TimeoutExpired:
-             return 1, "Command timed out"
-         except Exception as e:
-             return 1, str(e)
-
-     def _checklist_progress_callback(
-         self, description: str, current: int, total: int
-     ) -> None:
-         """Progress callback adapter for checklist execution.
-
-         Converts checklist progress format to the standard progress format.
-
-         Args:
-             description: Current item description
-             current: Current item number
-             total: Total items
-         """
-         if self.progress_callback:
-             self.progress_callback("checklist", description, current, total)
-
-     def _assess_checkpoint(
-         self,
-         context: UserContext,
-         checklist: Any,
-         execution_result: Any,
-         validation_history: List[Any],
-     ) -> CheckpointAssessment:
-         """Ask the LLM whether the workflow is complete or needs another checklist."""
-         prompt = self._build_checkpoint_prompt(
-             context=context,
-             checklist=checklist,
-             execution_result=execution_result,
-             validation_history=validation_history,
-         )
-
-         try:
-             response = self.llm_client.send(prompt, timeout=1200)
-             data = self._parse_checkpoint_response(response)
-             return CheckpointAssessment(
-                 status=data.get("status", "needs_fix"),
-                 reasoning=data.get("reasoning", ""),
-                 issues=data.get("issues", []),
-                 fix_instructions=data.get("fix_instructions", []),
-             )
-         except Exception as exc: # pylint: disable=broad-exception-caught
-             logger.exception("Checkpoint assessment failed")
-             return CheckpointAssessment(
-                 status="needs_fix",
-                 reasoning="Failed to interpret checkpoint reviewer output",
-                 issues=[f"Checkpoint reviewer error: {exc}"],
-                 fix_instructions=[
-                     "Inspect validation logs, then fix the root cause using fix_code."
-                 ],
-             )
-
-     def _build_checkpoint_prompt(
-         self,
-         context: UserContext,
-         checklist: Any,
-         execution_result: Any,
-         validation_history: List[Any],
-     ) -> str:
-         """Build the prompt for the checkpoint reviewer."""
-         validation_summary = self._format_validation_history(
-             validation_history, getattr(execution_result, "validation_logs", None)
-         )
-
-         outstanding = (
-             "\n".join(f"- {item}" for item in context.fix_feedback)
-             if context.fix_feedback
-             else "None"
-         )
-
-         errors = execution_result.errors or ["None"]
-         warnings = execution_result.warnings or []
-
-         sections = [
-             CHECKPOINT_REVIEW_PROMPT.strip(),
-             "",
-             "## User Request",
-             context.user_request,
-             "",
-             "## Latest Checklist Plan",
-             f"Reasoning: {checklist.reasoning}",
-             "",
-             "## Execution Summary",
-             execution_result.summary,
-             "",
-             "## Execution Errors",
-             "\n".join(f"- {err}" for err in errors),
-             "",
-             "## Execution Warnings",
-             "\n".join(f"- {warn}" for warn in warnings) if warnings else "None",
-             "",
-             "## Validation & Test Logs",
-             validation_summary,
-             "",
-             "## Outstanding Fix Requests",
-             outstanding,
-         ]
-
-         return "\n".join(sections)
-
-     def _maybe_summarize_conversation_history(self) -> Optional[str]:
-         """Trigger ChatSDK conversation summarization when available."""
-         chat_sdk = getattr(self, "llm_client", None)
-         if not chat_sdk or not hasattr(chat_sdk, "summarize_conversation_history"):
-             return None
-
-         try:
-             summary = chat_sdk.summarize_conversation_history(
-                 max_history_tokens=MAX_CHAT_HISTORY_TOKENS
-             )
-             if summary:
-                 logger.info(
-                     "Conversation history summarized to ~%d tokens",
-                     _estimate_token_count(summary),
-                 )
-             return summary
-         except Exception as exc: # pylint: disable=broad-exception-caught
-             logger.exception("Failed to summarize conversation history: %s", exc)
-             return None
-
-     def _prepare_project_directory(self, context: UserContext) -> str:
-         """
-         Ensure the project directory is ready for creation workflows.
-
-         If the provided path exists and is non-empty without an existing project,
-         pick a unique subdirectory via the LLM to avoid create-next-app failures.
-         """
-         base_path = Path(context.project_dir).expanduser()
-         if base_path.exists() and not base_path.is_dir():
-             raise ProjectDirectoryError(
-                 f"Provided path is not a directory: {base_path}"
-             )
-
-         if not base_path.exists():
-             base_path.mkdir(parents=True, exist_ok=True)
-             logger.info("Created project directory: %s", base_path)
-             return str(base_path)
-
-         existing_entries = [p.name for p in base_path.iterdir()]
-         if not existing_entries:
-             return str(base_path)
-
-         if self.console:
-             self.console.print_warning(
-                 f"Target directory {base_path} is not empty; selecting a new subdirectory."
-             )
-
-         suggested = self._choose_subdirectory_name(
-             base_path, existing_entries, context.user_request
-         )
-         if not suggested:
-             raise ProjectDirectoryError(
-                 f"Unable to find an available project name under {base_path}. "
-                 "Provide one explicitly with --path."
-             )
-
-         new_dir = base_path / suggested
-         new_dir.mkdir(parents=False, exist_ok=False)
-         logger.info("Using nested project directory: %s", new_dir)
-         # Align process cwd with the newly created project directory.
-         try:
-             os.chdir(new_dir)
-         except OSError as exc:
-             logger.warning("Failed to chdir to %s: %s", new_dir, exc)
-         if self.console:
-             self.console.print_info(f"Using project directory: {new_dir}")
-         return str(new_dir)
-
-     def _choose_subdirectory_name(
-         self, base_path: Path, existing_entries: List[str], user_request: str
-     ) -> Optional[str]:
-         """Ask the LLM for a unique subdirectory name, retrying on conflicts."""
-         existing_lower = {name.lower() for name in existing_entries}
-         prompt = self._build_directory_prompt(
-             base_path, existing_entries, user_request, None
-         )
-         last_reason = None
-
-         system_prompt = "You suggest concise folder names for new projects."
-
-         for attempt in range(1, 4):
-             try:
-                 response = self._send_prompt_without_history(
-                     prompt, timeout=120, system_prompt=system_prompt
-                 )
-             except Exception as exc: # pylint: disable=broad-exception-caught
-                 last_reason = f"LLM error on attempt {attempt}: {exc}"
-                 logger.warning(last_reason)
-                 prompt = self._build_directory_prompt(
-                     base_path, existing_entries, user_request, last_reason
-                 )
-                 continue
-
-             raw_response = self._extract_response_text(response)
-             candidate = self._sanitize_directory_name(raw_response)
-             if not candidate:
-                 last_reason = "LLM returned an empty or invalid directory name."
-             elif candidate.lower() in existing_lower:
-                 last_reason = f"Name '{candidate}' already exists in {base_path}."
-             elif "/" in candidate or "\\" in candidate or ".." in candidate:
-                 last_reason = "Directory name contained path separators or traversal."
-             elif len(candidate) > 64:
-                 last_reason = "Directory name exceeded 64 characters."
-             else:
-                 candidate_path = base_path / candidate
-                 if candidate_path.exists():
-                     last_reason = f"Directory '{candidate}' already exists."
-                 else:
-                     return candidate
-
-             logger.warning(
-                 "Directory name attempt %d rejected: %s", attempt, last_reason
-             )
-             prompt = self._build_directory_prompt(
-                 base_path, existing_entries, user_request, last_reason
-             )
-
-         return None
-
-     @staticmethod
-     def _sanitize_directory_name(raw: str) -> str:
-         """Normalize LLM output to a filesystem-safe directory name."""
-         if not raw:
-             return ""
-         candidate = raw.strip().strip("`'\"")
-         candidate = candidate.splitlines()[0].strip()
-         candidate = re.sub(r"[^A-Za-z0-9_-]+", "-", candidate)
-         return candidate.strip("-_").lower()
-
-     def _send_prompt_without_history(
-         self, prompt: str, timeout: int = 120, system_prompt: Optional[str] = None
-     ) -> Any:
-         """
-         Send a prompt without reading from or writing to chat history.
-
-         Prefers the underlying LLM client's `generate` API when available,
-         falling back to `send(..., no_history=True)` for compatibility.
-         """
-         # If the ChatSDK exposes the underlying LLM client, use it directly with chat messages
-         # to avoid any stored history and ensure system prompts are applied cleanly.
-         llm_client = getattr(self.llm_client, "llm_client", None)
-         if llm_client and hasattr(llm_client, "generate"):
-             model = getattr(getattr(self.llm_client, "config", None), "model", None)
-             messages = []
-             if system_prompt:
-                 messages.append({"role": "system", "content": system_prompt})
-             messages.append({"role": "user", "content": prompt})
-             return llm_client.generate(
-                 prompt=prompt,
-                 messages=messages,
-                 model=model,
-                 timeout=timeout,
-                 endpoint="chat",
-             )
-
-         # Fallback: use send with no_history to avoid persisting messages.
-         if hasattr(self.llm_client, "send"):
-             return self.llm_client.send(
-                 prompt, timeout=timeout, no_history=True, system_prompt=system_prompt
-             )
-
-         raise ValueError("LLM client does not support generate or send APIs")
-
-     @staticmethod
-     def _build_directory_prompt(
-         base_path: Path,
-         existing_entries: List[str],
-         user_request: Optional[str],
-         rejection_reason: Optional[str],
-     ) -> str:
-         """Construct the LLM prompt for picking a safe project subdirectory."""
-         entries = sorted(existing_entries)
-         max_list = 50
-         if len(entries) > max_list:
-             entries_display = "\n".join(f"- {name}" for name in entries[:max_list])
-             entries_display += f"\n- ...and {len(entries) - max_list} more"
-         else:
-             entries_display = "\n".join(f"- {name}" for name in entries)
-
-         prompt_sections = [
-             "You must choose a new folder name for a project because the target path is not empty.",
-             f"Base path: {base_path}",
-             "Existing files and folders you MUST avoid (do not reuse any of these names):",
-             entries_display or "- <empty>",
-             "User request driving this project:",
-             user_request or "<no request provided>",
-             "Rules:",
-             "- Return a single folder name only. Do NOT echo the instructions. No paths, quotes, JSON, or extra text.",
-             "- Use lowercase kebab-case or snake_case; ASCII letters, numbers, hyphens, and underscores only.",
-             "- Do not use any existing names above. Avoid dots, spaces, or slashes.",
-             "- Keep it under 40 characters.",
-         ]
-
-         if rejection_reason:
-             prompt_sections.append(
-                 f"Previous suggestion was rejected: {rejection_reason}. Try a different unique name."
-             )
-
-         return "\n".join(prompt_sections)
-
-     def _format_validation_history(
-         self, validation_history: List[Any], latest_plan_logs: Optional[List[Any]]
-     ) -> str:
-         """Format validation logs, splitting latest plan from historical ones."""
-
-         if not validation_history:
-             return "No validation or test commands have been executed yet."
-
-         latest_logs = latest_plan_logs or []
-         latest_count = len(latest_logs)
-         historical_logs = (
-             validation_history[:-latest_count] if latest_count else validation_history
-         )
-
-         def normalize(entry: Any) -> Dict[str, Any]:
-             if hasattr(entry, "to_dict"):
-                 return entry.to_dict()
-             if isinstance(entry, dict):
-                 return entry
-             return {}
-
-         def render(entries: List[Any], limit: Optional[int] = None) -> List[str]:
-             if not entries:
-                 return ["None"]
-
-             selected = entries if limit is None else entries[-limit:]
-             lines: List[str] = []
-             for entry in selected:
-                 data = normalize(entry)
-                 template = data.get("template", "unknown")
-                 description = data.get("description", "")
-                 success = data.get("success", True)
-                 status = "PASS" if success else "FAIL"
-                 error = data.get("error")
-                 output = data.get("output", {})
-
-                 lines.append(f"- [{status}] {template}: {description}")
-                 if error:
-                     lines.append(f" Error: {error}")
-
-                 snippet = ""
-                 if isinstance(output, dict):
-                     for key in ("stdout", "stderr", "message", "log", "details"):
-                         if output.get(key):
-                             snippet = str(output[key])
-                             break
-                     if not snippet and output:
-                         snippet = json.dumps(output)[:400]
-                 elif output:
-                     snippet = str(output)[:400]
-
-                 snippet = snippet.strip()
-                 if snippet:
-                     lines.append(f" Output: {snippet[:400]}")
-             return lines
-
-         sections: List[str] = []
-         sections.append("### Latest Plan Results")
-         sections.extend(render(list(latest_logs)))
-         sections.append("")
-         sections.append("### Previous Plan History")
-         sections.extend(render(list(historical_logs), limit=5))
-
-         return "\n".join(sections).strip()
-
-     def _parse_checkpoint_response(self, response: Any) -> Dict[str, Any]:
-         """Parse JSON output from the checkpoint reviewer."""
-         text = self._extract_response_text(response)
-         json_str = self._extract_json(text)
-         return json.loads(json_str)
-
-     @staticmethod
-     def _extract_response_text(response: Any) -> str:
-         """Normalize SDK response objects to raw text."""
-         if isinstance(response, str):
-             return response
-         if hasattr(response, "text"):
-             return response.text
-         if hasattr(response, "content"):
-             return response.content
-         if isinstance(response, dict):
-             return response.get("text", response.get("content", str(response)))
-         return str(response)
-
-     @staticmethod
-     def _extract_json(text: str) -> str:
-         """Extract JSON blob from arbitrary text (markdown-safe)."""
-         code_block = re.search(r"```(?:json)?\\s*\\n?(.*?)\\n?```", text, re.DOTALL)
-         if code_block:
-             return code_block.group(1).strip()
-
-         json_match = re.search(r"\\{.*\\}", text, re.DOTALL)
-         if json_match:
-             return json_match.group(0)
-
-         return text.strip()
1
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
2
+ # SPDX-License-Identifier: MIT
3
+ """
4
+ Orchestrator for LLM-driven workflow execution.
5
+
6
+ The Orchestrator controls workflow execution using Checklist Mode:
7
+ - LLM generates a checklist of template invocations based on user request
8
+ - Executor runs templates deterministically with error recovery
9
+ - Provides semantic understanding (e.g., adds checkboxes for todos)
10
+
11
+ Features:
12
+ - LLM-driven checklist generation
13
+ - Deterministic template execution
14
+ - Error recovery with three-tier strategy
15
+ - Progress reporting
16
+ """
17
+
18
+ import json
19
+ import logging
20
+ import os
21
+ import re
22
+ import subprocess
23
+ from dataclasses import dataclass, field
24
+ from pathlib import Path
25
+ from typing import Any, Callable, Dict, List, Optional, Protocol
26
+
27
+ from gaia.agents.base.console import AgentConsole
28
+
29
+ from .steps.base import ToolExecutor, UserContext
30
+ from .steps.error_handler import ErrorHandler
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ class ProjectDirectoryError(Exception):
36
+ """Raised when the project directory cannot be prepared safely."""
37
+
38
+
39
+ def _estimate_token_count(text: str) -> int:
40
+ """Lightweight token estimate assuming ~4 characters per token."""
41
+ avg_chars_per_token = 4
42
+ byte_length = len(text.encode("utf-8"))
43
+ return max(1, (byte_length + avg_chars_per_token - 1) // avg_chars_per_token)
44
+
45
+
46
+ class ChatSDK(Protocol):
47
+ """Protocol for chat SDK interface used by checklist generator."""
48
+
49
+ def send(self, message: str, timeout: int = 600, no_history: bool = False) -> Any:
50
+ """Send a message and get response."""
51
+ ...
52
+
53
+
54
+ @dataclass
55
+ class ExecutionResult:
56
+ """Result of a complete workflow execution."""
57
+
58
+ success: bool
59
+ phases_completed: List[str] = field(default_factory=list)
60
+ phases_failed: List[str] = field(default_factory=list)
61
+ total_steps: int = 0
62
+ steps_succeeded: int = 0
63
+ steps_failed: int = 0
64
+ steps_skipped: int = 0
65
+ errors: List[str] = field(default_factory=list)
66
+ outputs: Dict[str, Any] = field(default_factory=dict)
67
+
68
+ @property
69
+ def summary(self) -> str:
70
+ """Get a human-readable summary."""
71
+ status = "SUCCESS" if self.success else "FAILED"
72
+ return (
73
+ f"{status}: {self.steps_succeeded}/{self.total_steps} steps completed, "
74
+ f"{self.steps_failed} failed, {self.steps_skipped} skipped"
75
+ )
76
+
77
+
78
+ CHECKPOINT_REVIEW_PROMPT = """You are the checkpoint reviewer for the GAIA web development agent.
79
+
80
+ You receive:
81
+ - The original user request
82
+ - A summary of the latest checklist execution (including errors/warnings)
83
+ - Logs from the validation and testing tools (run_typescript_check, validate_styles, run_tests, etc.)
84
+ - Any previously requested fixes that are still outstanding
85
+
86
+ Decide if the application is ready to ship or if additional fixes are required.
87
+
88
+ Rules:
89
+ 1. If ANY validation or test log failed, status must be \"needs_fix\" with concrete guidance.
90
+ 2. Only return \"complete\" when the app works end-to-end and validations passed.
91
+ 3. When fixes are needed, suggest actionable steps that can be executed through `fix_code` (LLM-assisted repair of problematic files).
92
+
93
+ Respond with concise JSON only:
94
+ {
95
+ \"status\": \"complete\" | \"needs_fix\",
96
+ \"reasoning\": \"short justification\",
97
+ \"issues\": [\"list of concrete bugs or failures\"],
98
+ \"fix_instructions\": [\"ordered actions the next checklist should perform\"]
99
+ }
100
+ """
101
+
102
+ MAX_CHAT_HISTORY_TOKENS = 15000
103
+
104
+
105
+ @dataclass
106
+ class CheckpointAssessment:
107
+ """LLM-produced verdict about the current checkpoint."""
108
+
109
+ status: str
110
+ reasoning: str
111
+ issues: List[str] = field(default_factory=list)
112
+ fix_instructions: List[str] = field(default_factory=list)
113
+
114
+ @property
115
+ def needs_fix(self) -> bool:
116
+ """Return True when the reviewer requires another checklist."""
117
+ return self.status.lower() != "complete"
118
+
119
+ def to_dict(self) -> Dict[str, Any]:
120
+ """Serialize the assessment."""
121
+ return {
122
+ "status": self.status,
123
+ "reasoning": self.reasoning,
124
+ "issues": self.issues,
125
+ "fix_instructions": self.fix_instructions,
126
+ }
127
+
128
+
129
+ class Orchestrator:
130
+ """Controls LLM-driven workflow execution with error recovery.
131
+
132
+ The orchestrator uses Checklist Mode exclusively:
133
+ - LLM analyzes user request and generates a checklist of templates
134
+ - Executor runs templates deterministically
135
+ - Provides semantic understanding (e.g., adds checkboxes for todos)
136
+ """
137
+
138
+ def __init__(
139
+ self,
140
+ tool_executor: ToolExecutor,
141
+ llm_client: ChatSDK,
142
+ llm_fixer: Optional[Callable[[str, str], Optional[str]]] = None,
143
+ progress_callback: Optional[Callable[[str, str, int, int], None]] = None,
144
+ console: Optional[AgentConsole] = None,
145
+ max_checklist_loops: int = 10,
146
+ ):
147
+ """Initialize orchestrator.
148
+
149
+ Args:
150
+ tool_executor: Function to execute tools (name, args) -> result
151
+ llm_client: Chat SDK for checklist generation (required)
152
+ llm_fixer: Optional LLM-based code fixer for escalation
153
+ progress_callback: Optional callback(phase, step, current, total)
154
+ console: Optional console for displaying output
155
+ max_checklist_loops: Max number of checklist iterations before giving up
156
+ """
157
+ if llm_client is None:
158
+ raise ValueError("llm_client is required for Orchestrator")
159
+
160
+ self.tool_executor = tool_executor
161
+ self.llm_client = llm_client
162
+ self.error_handler = ErrorHandler(
163
+ command_executor=self._run_command,
164
+ llm_fixer=llm_fixer,
165
+ )
166
+ self.progress_callback = progress_callback
167
+ self.console = console
168
+ self.max_checklist_loops = max(1, max_checklist_loops)
169
+
170
+ # Initialize checklist components
171
+ from .checklist_executor import ChecklistExecutor
172
+ from .checklist_generator import ChecklistGenerator
173
+
174
+ self.checklist_generator = ChecklistGenerator(llm_client)
175
+ self.checklist_executor = ChecklistExecutor(
176
+ tool_executor,
177
+ llm_client=llm_client, # Pass LLM for per-item code generation
178
+ error_handler=self.error_handler,
179
+ progress_callback=self._checklist_progress_callback,
180
+ console=console, # Pass console
181
+ )
182
+ logger.debug(
183
+ "Orchestrator initialized - LLM will plan execution AND generate code per item"
184
+ )
185
+
186
+ def execute(
187
+ self, context: UserContext, step_through: bool = False
188
+ ) -> ExecutionResult:
189
+ """Execute the workflow using iterative LLM-generated checklists."""
190
+ logger.debug("Executing workflow (LLM-driven checklist loop)")
191
+
192
+ from .project_analyzer import ProjectAnalyzer
193
+
194
+ analyzer = ProjectAnalyzer()
195
+ aggregated_validation_logs: List[Any] = []
196
+ fix_feedback: List[str] = []
197
+ iteration_outputs: List[Dict[str, Any]] = []
198
+ combined_errors: List[str] = []
199
+ previous_execution_errors: List[str] = []
200
+ previous_validation_logs: List[Any] = []
201
+
202
+ total_steps = 0
203
+ steps_succeeded = 0
204
+ steps_failed = 0
205
+ success = False
206
+
207
+ try:
208
+ context.project_dir = self._prepare_project_directory(context)
209
+ except ProjectDirectoryError as exc:
210
+ error_message = str(exc)
211
+ logger.error(error_message)
212
+ if self.console:
213
+ self.console.print_error(error_message)
214
+ return ExecutionResult(
215
+ success=False,
216
+ phases_completed=[],
217
+ phases_failed=["project_directory"],
218
+ total_steps=1,
219
+ steps_succeeded=0,
220
+ steps_failed=1,
221
+ steps_skipped=0,
222
+ errors=[error_message],
223
+ outputs={
224
+ "iterations": [],
225
+ "validation_logs": [],
226
+ "fix_feedback": [],
227
+ "project_dir": context.project_dir,
228
+ },
229
+ )
230
+
231
+ for iteration in range(1, self.max_checklist_loops + 1):
232
+ logger.debug("Starting checklist iteration %d", iteration)
233
+
234
+ if iteration > 1:
235
+ summary_result = self._maybe_summarize_conversation_history()
236
+ if summary_result and self.console:
237
+ self.console.print_info(
238
+ "Conversation history summarized to stay within token limits."
239
+ )
240
+
241
+ project_state = analyzer.analyze(context.project_dir)
242
+
243
+ # Surface accumulated signals to the next checklist prompt
244
+ context.validation_reports = [
245
+ log.to_dict() for log in aggregated_validation_logs
246
+ ]
247
+ context.fix_feedback = fix_feedback.copy()
248
+
249
+ logger.info(
250
+ "Generating checklist iteration %d of %d",
251
+ iteration,
252
+ self.max_checklist_loops,
253
+ )
254
+ if self.console:
255
+ self.console.print_info(
256
+ f"Generating checklist iteration {iteration} of {self.max_checklist_loops}"
257
+ )
258
+ if iteration == 1:
259
+ checklist = self.checklist_generator.generate_initial_checklist(
260
+ context, project_state
261
+ )
262
+ else:
263
+ checklist = self.checklist_generator.generate_debug_checklist(
264
+ context=context,
265
+ project_state=project_state,
266
+ prior_errors=previous_execution_errors,
267
+ validation_logs=previous_validation_logs,
268
+ )
269
+
270
+ if not checklist.is_valid:
271
+ logger.error(
272
+ "Invalid checklist (iteration %d): %s",
273
+ iteration,
274
+ checklist.validation_errors,
275
+ )
276
+ try:
277
+ checklist_dump = json.dumps(checklist.to_dict(), indent=2)
278
+ except Exception: # pylint: disable=broad-exception-caught
279
+ checklist_dump = str(checklist)
280
+ logger.error("Invalid checklist payload: %s", checklist_dump)
281
+ if self.console:
282
+ self.console.pretty_print_json(
283
+ checklist.to_dict(), title="Invalid Checklist"
284
+ )
285
+ combined_errors.extend(checklist.validation_errors)
286
+ assessment = CheckpointAssessment(
287
+ status="needs_fix",
288
+ reasoning="Checklist validation failed",
289
+ issues=checklist.validation_errors.copy(),
290
+ fix_instructions=checklist.validation_errors.copy(),
291
+ )
292
+ iteration_outputs.append(
293
+ {
294
+ "iteration": iteration,
295
+ "checklist": checklist.to_dict(),
296
+ "execution": None,
297
+ "assessment": assessment.to_dict(),
298
+ }
299
+ )
300
+ break
301
+
302
+ logger.debug(
303
+ "Generated checklist with %d items: %s",
304
+ len(checklist.items),
305
+ checklist.reasoning,
306
+ )
307
+
308
+ checklist_result = self.checklist_executor.execute(
309
+ checklist, context, step_through=step_through
310
+ )
311
+
312
+ total_steps += len(checklist_result.item_results)
313
+ steps_succeeded += checklist_result.items_succeeded
314
+ steps_failed += checklist_result.items_failed
315
+ combined_errors.extend(checklist_result.errors)
316
+
317
+ aggregated_validation_logs.extend(checklist_result.validation_logs)
318
+ previous_execution_errors = checklist_result.errors.copy()
319
+ previous_validation_logs = checklist_result.validation_logs.copy()
320
+
321
+ logger.info("Assessing application state after iteration %d", iteration)
322
+ if self.console:
323
+ self.console.print_info(
324
+ f"Assessing application state after iteration {iteration}"
325
+ )
326
+ assessment = self._assess_checkpoint(
327
+ context=context,
328
+ checklist=checklist,
329
+ execution_result=checklist_result,
330
+ validation_history=aggregated_validation_logs,
331
+ )
332
+ if assessment.needs_fix:
333
+ logger.info(
334
+ "Application not ready after iteration %d, planning another checklist: %s",
335
+ iteration,
336
+ assessment.reasoning or "no reasoning provided",
337
+ )
338
+ if self.console:
339
+ self.console.print_info(
340
+ "Application not ready; preparing another checklist."
341
+ )
342
+ else:
343
+ logger.info(
344
+ "Application marked complete after iteration %d: %s",
345
+ iteration,
346
+ assessment.reasoning or "no reasoning provided",
347
+ )
348
+ if self.console:
349
+ self.console.print_success("Application marked complete.")
350
+
351
+ iteration_outputs.append(
352
+ {
353
+ "iteration": iteration,
354
+ "checklist": checklist.to_dict(),
355
+ "execution": {
356
+ "summary": checklist_result.summary,
357
+ "success": checklist_result.success,
358
+ "files": checklist_result.total_files,
359
+ "errors": checklist_result.errors,
360
+ "warnings": checklist_result.warnings,
361
+ "item_results": [
362
+ r.to_dict() for r in checklist_result.item_results
363
+ ],
364
+ "validation_logs": [
365
+ log.to_dict() for log in checklist_result.validation_logs
366
+ ],
367
+ },
368
+ "assessment": assessment.to_dict(),
369
+ }
370
+ )
371
+
372
+ if not assessment.needs_fix:
373
+ success = (
374
+ checklist_result.success and assessment.status.lower() == "complete"
375
+ )
376
+ break
377
+
378
+ instructions = assessment.fix_instructions or assessment.issues
379
+ if not instructions and assessment.reasoning:
380
+ instructions = [assessment.reasoning]
381
+ if instructions:
382
+ fix_feedback.extend(instructions)
383
+
384
+ else:
385
+ combined_errors.append(
386
+ f"Reached maximum checklist iterations ({self.max_checklist_loops}) without passing validation"
387
+ )
388
+
389
+ latest_execution = None
390
+ latest_checklist = None
391
+ if iteration_outputs:
392
+ latest_entry = iteration_outputs[-1]
393
+ latest_execution = latest_entry.get("execution")
394
+ latest_checklist = latest_entry.get("checklist")
395
+
396
+ outputs = {
397
+ "iterations": iteration_outputs,
398
+ "validation_logs": [log.to_dict() for log in aggregated_validation_logs],
399
+ "fix_feedback": fix_feedback,
400
+ "project_dir": context.project_dir,
401
+ }
402
+
403
+ if latest_execution:
404
+ outputs["files"] = latest_execution.get("files", [])
405
+ outputs["detailed_results"] = latest_execution.get("item_results", [])
406
+ if latest_checklist:
407
+ outputs["checklist"] = latest_checklist
408
+
409
+ return ExecutionResult(
410
+ success=success,
411
+ phases_completed=["checklist"] if success else [],
412
+ phases_failed=[] if success else ["checklist"],
413
+ total_steps=total_steps,
414
+ steps_succeeded=steps_succeeded,
415
+ steps_failed=steps_failed,
416
+ steps_skipped=0,
417
+ errors=combined_errors,
418
+ outputs=outputs,
419
+ )
420
+
421
+ def _run_command(self, command: str, cwd: Optional[str] = None) -> tuple[int, str]:
422
+ """Run a shell command.
423
+
424
+ Args:
425
+ command: Command to run
426
+ cwd: Working directory
427
+
428
+ Returns:
429
+ Tuple of (exit_code, output)
430
+ """
431
+ try:
432
+ result = subprocess.run(
433
+ command,
434
+ shell=True,
435
+ cwd=cwd,
436
+ capture_output=True,
437
+ text=True,
438
+ timeout=1200,
439
+ check=False, # We handle return codes ourselves
440
+ )
441
+ output = result.stdout + result.stderr
442
+ return result.returncode, output
443
+ except subprocess.TimeoutExpired:
444
+ return 1, "Command timed out"
445
+ except Exception as e:
446
+ return 1, str(e)
447
+
448
+ def _checklist_progress_callback(
449
+ self, description: str, current: int, total: int
450
+ ) -> None:
451
+ """Progress callback adapter for checklist execution.
452
+
453
+ Converts checklist progress format to the standard progress format.
454
+
455
+ Args:
456
+ description: Current item description
457
+ current: Current item number
458
+ total: Total items
459
+ """
460
+ if self.progress_callback:
461
+ self.progress_callback("checklist", description, current, total)
462
+
463
+ def _assess_checkpoint(
464
+ self,
465
+ context: UserContext,
466
+ checklist: Any,
467
+ execution_result: Any,
468
+ validation_history: List[Any],
469
+ ) -> CheckpointAssessment:
470
+ """Ask the LLM whether the workflow is complete or needs another checklist."""
471
+ prompt = self._build_checkpoint_prompt(
472
+ context=context,
473
+ checklist=checklist,
474
+ execution_result=execution_result,
475
+ validation_history=validation_history,
476
+ )
477
+
478
+ try:
479
+ response = self.llm_client.send(prompt, timeout=1200)
480
+ data = self._parse_checkpoint_response(response)
481
+ return CheckpointAssessment(
482
+ status=data.get("status", "needs_fix"),
483
+ reasoning=data.get("reasoning", ""),
484
+ issues=data.get("issues", []),
485
+ fix_instructions=data.get("fix_instructions", []),
486
+ )
487
+ except Exception as exc: # pylint: disable=broad-exception-caught
488
+ logger.exception("Checkpoint assessment failed")
489
+ return CheckpointAssessment(
490
+ status="needs_fix",
491
+ reasoning="Failed to interpret checkpoint reviewer output",
492
+ issues=[f"Checkpoint reviewer error: {exc}"],
493
+ fix_instructions=[
494
+ "Inspect validation logs, then fix the root cause using fix_code."
495
+ ],
496
+ )
497
+
+     def _build_checkpoint_prompt(
+         self,
+         context: UserContext,
+         checklist: Any,
+         execution_result: Any,
+         validation_history: List[Any],
+     ) -> str:
+         """Build the prompt for the checkpoint reviewer."""
+         validation_summary = self._format_validation_history(
+             validation_history, getattr(execution_result, "validation_logs", None)
+         )
+
+         outstanding = (
+             "\n".join(f"- {item}" for item in context.fix_feedback)
+             if context.fix_feedback
+             else "None"
+         )
+
+         errors = execution_result.errors or ["None"]
+         warnings = execution_result.warnings or []
+
+         sections = [
+             CHECKPOINT_REVIEW_PROMPT.strip(),
+             "",
+             "## User Request",
+             context.user_request,
+             "",
+             "## Latest Checklist Plan",
+             f"Reasoning: {checklist.reasoning}",
+             "",
+             "## Execution Summary",
+             execution_result.summary,
+             "",
+             "## Execution Errors",
+             "\n".join(f"- {err}" for err in errors),
+             "",
+             "## Execution Warnings",
+             "\n".join(f"- {warn}" for warn in warnings) if warnings else "None",
+             "",
+             "## Validation & Test Logs",
+             validation_summary,
+             "",
+             "## Outstanding Fix Requests",
+             outstanding,
+         ]
+
+         return "\n".join(sections)
+
+     def _maybe_summarize_conversation_history(self) -> Optional[str]:
+         """Trigger ChatSDK conversation summarization when available."""
+         chat_sdk = getattr(self, "llm_client", None)
+         if not chat_sdk or not hasattr(chat_sdk, "summarize_conversation_history"):
+             return None
+
+         try:
+             summary = chat_sdk.summarize_conversation_history(
+                 max_history_tokens=MAX_CHAT_HISTORY_TOKENS
+             )
+             if summary:
+                 logger.info(
+                     "Conversation history summarized to ~%d tokens",
+                     _estimate_token_count(summary),
+                 )
+             return summary
+         except Exception as exc:  # pylint: disable=broad-exception-caught
+             logger.exception("Failed to summarize conversation history: %s", exc)
+             return None
+
+     def _prepare_project_directory(self, context: UserContext) -> str:
+         """
+         Ensure the project directory is ready for creation workflows.
+
+         If the provided path exists and is non-empty (and contains no existing
+         project), pick a unique subdirectory via the LLM to avoid
+         create-next-app failures.
+         """
+         base_path = Path(context.project_dir).expanduser()
+         if base_path.exists() and not base_path.is_dir():
+             raise ProjectDirectoryError(
+                 f"Provided path is not a directory: {base_path}"
+             )
+
+         if not base_path.exists():
+             base_path.mkdir(parents=True, exist_ok=True)
+             logger.info("Created project directory: %s", base_path)
+             return str(base_path)
+
+         existing_entries = [p.name for p in base_path.iterdir()]
+         if not existing_entries:
+             return str(base_path)
+
+         if self.console:
+             self.console.print_warning(
+                 f"Target directory {base_path} is not empty; selecting a new subdirectory."
+             )
+
+         suggested = self._choose_subdirectory_name(
+             base_path, existing_entries, context.user_request
+         )
+         if not suggested:
+             raise ProjectDirectoryError(
+                 f"Unable to find an available project name under {base_path}. "
+                 "Provide one explicitly with --path."
+             )
+
+         new_dir = base_path / suggested
+         new_dir.mkdir(parents=False, exist_ok=False)
+         logger.info("Using nested project directory: %s", new_dir)
+         # Align the process cwd with the newly created project directory.
+         try:
+             os.chdir(new_dir)
+         except OSError as exc:
+             logger.warning("Failed to chdir to %s: %s", new_dir, exc)
+         if self.console:
+             self.console.print_info(f"Using project directory: {new_dir}")
+         return str(new_dir)
+
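Stripped of the LLM and console plumbing, the directory decision above reduces to a three-way branch. A minimal standalone sketch, with a hypothetical `pick_unique_name` callable standing in for `_choose_subdirectory_name` and a plain `ValueError` standing in for `ProjectDirectoryError`:

```python
from pathlib import Path

def prepare_dir(path: str, pick_unique_name) -> Path:
    """Sketch of the three-way branch above (no LLM, no console)."""
    base = Path(path).expanduser()
    if base.exists() and not base.is_dir():
        raise ValueError(f"Provided path is not a directory: {base}")
    if not base.exists():
        base.mkdir(parents=True, exist_ok=True)  # case 1: create fresh
        return base
    if not any(base.iterdir()):                  # case 2: empty -> use as-is
        return base
    name = pick_unique_name({p.name for p in base.iterdir()})
    new_dir = base / name                        # case 3: nest a new subdir
    new_dir.mkdir(parents=False, exist_ok=False)
    return new_dir
```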
+     def _choose_subdirectory_name(
+         self, base_path: Path, existing_entries: List[str], user_request: str
+     ) -> Optional[str]:
+         """Ask the LLM for a unique subdirectory name, retrying on conflicts."""
+         existing_lower = {name.lower() for name in existing_entries}
+         prompt = self._build_directory_prompt(
+             base_path, existing_entries, user_request, None
+         )
+         last_reason = None
+
+         system_prompt = "You suggest concise folder names for new projects."
+
+         for attempt in range(1, 4):
+             try:
+                 response = self._send_prompt_without_history(
+                     prompt, timeout=120, system_prompt=system_prompt
+                 )
+             except Exception as exc:  # pylint: disable=broad-exception-caught
+                 last_reason = f"LLM error on attempt {attempt}: {exc}"
+                 logger.warning(last_reason)
+                 prompt = self._build_directory_prompt(
+                     base_path, existing_entries, user_request, last_reason
+                 )
+                 continue
+
+             raw_response = self._extract_response_text(response)
+             candidate = self._sanitize_directory_name(raw_response)
+             if not candidate:
+                 last_reason = "LLM returned an empty or invalid directory name."
+             elif candidate.lower() in existing_lower:
+                 last_reason = f"Name '{candidate}' already exists in {base_path}."
+             elif "/" in candidate or "\\" in candidate or ".." in candidate:
+                 last_reason = "Directory name contained path separators or traversal."
+             elif len(candidate) > 64:
+                 last_reason = "Directory name exceeded 64 characters."
+             else:
+                 candidate_path = base_path / candidate
+                 if candidate_path.exists():
+                     last_reason = f"Directory '{candidate}' already exists."
+                 else:
+                     return candidate
+
+             logger.warning(
+                 "Directory name attempt %d rejected: %s", attempt, last_reason
+             )
+             prompt = self._build_directory_prompt(
+                 base_path, existing_entries, user_request, last_reason
+             )
+
+         return None
+
+     @staticmethod
+     def _sanitize_directory_name(raw: str) -> str:
+         """Normalize LLM output to a filesystem-safe directory name."""
+         if not raw:
+             return ""
+         candidate = raw.strip().strip("`'\"")
+         candidate = candidate.splitlines()[0].strip()
+         candidate = re.sub(r"[^A-Za-z0-9_-]+", "-", candidate)
+         return candidate.strip("-_").lower()
+
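Because `_sanitize_directory_name` is a pure static method, its behavior is easy to pin down with examples. A self-contained copy of the same steps, with outputs traced through the regex above:

```python
import re

def sanitize(raw: str) -> str:
    # Same pipeline as _sanitize_directory_name: strip wrappers, keep the
    # first line, collapse disallowed characters to hyphens, then lowercase.
    if not raw:
        return ""
    candidate = raw.strip().strip("`'\"")
    candidate = candidate.splitlines()[0].strip()
    candidate = re.sub(r"[^A-Za-z0-9_-]+", "-", candidate)
    return candidate.strip("-_").lower()

assert sanitize("`My Cool App!`") == "my-cool-app"
assert sanitize("Todo_Tracker\nextra notes") == "todo_tracker"
assert sanitize("") == ""
```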
+     def _send_prompt_without_history(
+         self, prompt: str, timeout: int = 120, system_prompt: Optional[str] = None
+     ) -> Any:
+         """
+         Send a prompt without reading from or writing to chat history.
+
+         Prefers the underlying LLM client's `generate` API when available,
+         falling back to `send(..., no_history=True)` for compatibility.
+         """
+         # If the ChatSDK exposes the underlying LLM client, use it directly with
+         # chat messages to avoid any stored history and ensure system prompts
+         # are applied cleanly.
+         llm_client = getattr(self.llm_client, "llm_client", None)
+         if llm_client and hasattr(llm_client, "generate"):
+             model = getattr(getattr(self.llm_client, "config", None), "model", None)
+             messages = []
+             if system_prompt:
+                 messages.append({"role": "system", "content": system_prompt})
+             messages.append({"role": "user", "content": prompt})
+             return llm_client.generate(
+                 prompt=prompt,
+                 messages=messages,
+                 model=model,
+                 timeout=timeout,
+                 endpoint="chat",
+             )
+
+         # Fallback: use send with no_history to avoid persisting messages.
+         if hasattr(self.llm_client, "send"):
+             return self.llm_client.send(
+                 prompt, timeout=timeout, no_history=True, system_prompt=system_prompt
+             )
+
+         raise ValueError("LLM client does not support generate or send APIs")
+
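The capability probing above is plain duck typing, so any client exposing a compatible `generate` or `send` is accepted. A toy stub that would take the fallback path (the keyword names `no_history` and `system_prompt` are taken from the call above; the stub itself is hypothetical):

```python
class StubChat:
    """Toy client with only `send`; _send_prompt_without_history would
    skip the `generate` branch and land on this fallback."""

    def send(self, prompt, timeout=120, no_history=False, system_prompt=None):
        # A real client would call the model; here we just echo the inputs.
        return f"[system={system_prompt!r} no_history={no_history}] {prompt}"

print(StubChat().send("suggest a folder name", no_history=True))
```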
+     @staticmethod
+     def _build_directory_prompt(
+         base_path: Path,
+         existing_entries: List[str],
+         user_request: Optional[str],
+         rejection_reason: Optional[str],
+     ) -> str:
+         """Construct the LLM prompt for picking a safe project subdirectory."""
+         entries = sorted(existing_entries)
+         max_list = 50
+         if len(entries) > max_list:
+             entries_display = "\n".join(f"- {name}" for name in entries[:max_list])
+             entries_display += f"\n- ...and {len(entries) - max_list} more"
+         else:
+             entries_display = "\n".join(f"- {name}" for name in entries)
+
+         prompt_sections = [
+             "You must choose a new folder name for a project because the target path is not empty.",
+             f"Base path: {base_path}",
+             "Existing files and folders you MUST avoid (do not reuse any of these names):",
+             entries_display or "- <empty>",
+             "User request driving this project:",
+             user_request or "<no request provided>",
+             "Rules:",
+             "- Return a single folder name only. Do NOT echo the instructions. No paths, quotes, JSON, or extra text.",
+             "- Use lowercase kebab-case or snake_case; ASCII letters, numbers, hyphens, and underscores only.",
+             "- Do not use any existing names above. Avoid dots, spaces, or slashes.",
+             "- Keep it under 40 characters.",
+         ]
+
+         if rejection_reason:
+             prompt_sections.append(
+                 f"Previous suggestion was rejected: {rejection_reason}. Try a different unique name."
+             )
+
+         return "\n".join(prompt_sections)
+
+     def _format_validation_history(
+         self, validation_history: List[Any], latest_plan_logs: Optional[List[Any]]
+     ) -> str:
+         """Format validation logs, splitting the latest plan from historical ones."""
+         if not validation_history:
+             return "No validation or test commands have been executed yet."
+
+         latest_logs = latest_plan_logs or []
+         latest_count = len(latest_logs)
+         historical_logs = (
+             validation_history[:-latest_count] if latest_count else validation_history
+         )
+
+         def normalize(entry: Any) -> Dict[str, Any]:
+             if hasattr(entry, "to_dict"):
+                 return entry.to_dict()
+             if isinstance(entry, dict):
+                 return entry
+             return {}
+
+         def render(entries: List[Any], limit: Optional[int] = None) -> List[str]:
+             if not entries:
+                 return ["None"]
+
+             selected = entries if limit is None else entries[-limit:]
+             lines: List[str] = []
+             for entry in selected:
+                 data = normalize(entry)
+                 template = data.get("template", "unknown")
+                 description = data.get("description", "")
+                 success = data.get("success", True)
+                 status = "PASS" if success else "FAIL"
+                 error = data.get("error")
+                 output = data.get("output", {})
+
+                 lines.append(f"- [{status}] {template}: {description}")
+                 if error:
+                     lines.append(f"  Error: {error}")
+
+                 snippet = ""
+                 if isinstance(output, dict):
+                     for key in ("stdout", "stderr", "message", "log", "details"):
+                         if output.get(key):
+                             snippet = str(output[key])
+                             break
+                     if not snippet and output:
+                         snippet = json.dumps(output)[:400]
+                 elif output:
+                     snippet = str(output)[:400]
+
+                 snippet = snippet.strip()
+                 if snippet:
+                     lines.append(f"  Output: {snippet[:400]}")
+             return lines
+
+         sections: List[str] = []
+         sections.append("### Latest Plan Results")
+         sections.extend(render(list(latest_logs)))
+         sections.append("")
+         sections.append("### Previous Plan History")
+         sections.extend(render(list(historical_logs), limit=5))
+
+         return "\n".join(sections).strip()
+
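To make the rendering concrete, here is a hypothetical log entry and the bullet lines `render` would emit for it (field names match the `.get()` keys above; the values are illustrative):

```python
entry = {
    "template": "run_tests",
    "description": "pytest suite",
    "success": False,
    "error": "2 tests failed",
    "output": {"stdout": "FAILED tests/test_api.py::test_create\n"},
}
# render([entry]) would produce roughly:
#   - [FAIL] run_tests: pytest suite
#     Error: 2 tests failed
#     Output: FAILED tests/test_api.py::test_create
```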
+     def _parse_checkpoint_response(self, response: Any) -> Dict[str, Any]:
+         """Parse JSON output from the checkpoint reviewer."""
+         text = self._extract_response_text(response)
+         json_str = self._extract_json(text)
+         return json.loads(json_str)
+
+     @staticmethod
+     def _extract_response_text(response: Any) -> str:
+         """Normalize SDK response objects to raw text."""
+         if isinstance(response, str):
+             return response
+         if hasattr(response, "text"):
+             return response.text
+         if hasattr(response, "content"):
+             return response.content
+         if isinstance(response, dict):
+             return response.get("text", response.get("content", str(response)))
+         return str(response)
+
+     @staticmethod
+     def _extract_json(text: str) -> str:
+         """Extract a JSON blob from arbitrary text (markdown-safe)."""
+         # Prefer a fenced ```json block; fall back to the widest
+         # brace-delimited span. Regex escapes must stay single (\s, \{)
+         # inside these raw strings; doubling them matches literal backslashes.
+         code_block = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
+         if code_block:
+             return code_block.group(1).strip()
+
+         json_match = re.search(r"\{.*\}", text, re.DOTALL)
+         if json_match:
+             return json_match.group(0)
+
+         return text.strip()
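A quick self-contained check of the two extraction paths (a sketch mirroring `_extract_json`; both asserts hold with the single-backslash escapes shown above):

```python
import re

def extract_json(text: str) -> str:
    # Same two-stage strategy: fenced block first, then brace span.
    block = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if block:
        return block.group(1).strip()
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        return match.group(0)
    return text.strip()

assert extract_json('```json\n{"status": "complete"}\n```') == '{"status": "complete"}'
assert extract_json('Sure! {"status": "complete"} Done.') == '{"status": "complete"}'
```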