dslighting 1.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. dsat/__init__.py +3 -0
  2. dsat/benchmark/__init__.py +1 -0
  3. dsat/benchmark/benchmark.py +168 -0
  4. dsat/benchmark/datasci.py +291 -0
  5. dsat/benchmark/mle.py +777 -0
  6. dsat/benchmark/sciencebench.py +304 -0
  7. dsat/common/__init__.py +0 -0
  8. dsat/common/constants.py +11 -0
  9. dsat/common/exceptions.py +48 -0
  10. dsat/common/typing.py +19 -0
  11. dsat/config.py +79 -0
  12. dsat/models/__init__.py +3 -0
  13. dsat/models/candidates.py +16 -0
  14. dsat/models/formats.py +52 -0
  15. dsat/models/task.py +64 -0
  16. dsat/operators/__init__.py +0 -0
  17. dsat/operators/aflow_ops.py +90 -0
  18. dsat/operators/autokaggle_ops.py +170 -0
  19. dsat/operators/automind_ops.py +38 -0
  20. dsat/operators/base.py +22 -0
  21. dsat/operators/code.py +45 -0
  22. dsat/operators/dsagent_ops.py +123 -0
  23. dsat/operators/llm_basic.py +84 -0
  24. dsat/prompts/__init__.py +0 -0
  25. dsat/prompts/aflow_prompt.py +76 -0
  26. dsat/prompts/aide_prompt.py +52 -0
  27. dsat/prompts/autokaggle_prompt.py +290 -0
  28. dsat/prompts/automind_prompt.py +29 -0
  29. dsat/prompts/common.py +51 -0
  30. dsat/prompts/data_interpreter_prompt.py +82 -0
  31. dsat/prompts/dsagent_prompt.py +88 -0
  32. dsat/runner.py +554 -0
  33. dsat/services/__init__.py +0 -0
  34. dsat/services/data_analyzer.py +387 -0
  35. dsat/services/llm.py +486 -0
  36. dsat/services/llm_single.py +421 -0
  37. dsat/services/sandbox.py +386 -0
  38. dsat/services/states/__init__.py +0 -0
  39. dsat/services/states/autokaggle_state.py +43 -0
  40. dsat/services/states/base.py +14 -0
  41. dsat/services/states/dsa_log.py +13 -0
  42. dsat/services/states/experience.py +237 -0
  43. dsat/services/states/journal.py +153 -0
  44. dsat/services/states/operator_library.py +290 -0
  45. dsat/services/vdb.py +76 -0
  46. dsat/services/workspace.py +178 -0
  47. dsat/tasks/__init__.py +3 -0
  48. dsat/tasks/handlers.py +376 -0
  49. dsat/templates/open_ended/grade_template.py +107 -0
  50. dsat/tools/__init__.py +4 -0
  51. dsat/utils/__init__.py +0 -0
  52. dsat/utils/context.py +172 -0
  53. dsat/utils/dynamic_import.py +71 -0
  54. dsat/utils/parsing.py +33 -0
  55. dsat/workflows/__init__.py +12 -0
  56. dsat/workflows/base.py +53 -0
  57. dsat/workflows/factory.py +439 -0
  58. dsat/workflows/manual/__init__.py +0 -0
  59. dsat/workflows/manual/autokaggle_workflow.py +148 -0
  60. dsat/workflows/manual/data_interpreter_workflow.py +153 -0
  61. dsat/workflows/manual/deepanalyze_workflow.py +484 -0
  62. dsat/workflows/manual/dsagent_workflow.py +76 -0
  63. dsat/workflows/search/__init__.py +0 -0
  64. dsat/workflows/search/aflow_workflow.py +344 -0
  65. dsat/workflows/search/aide_workflow.py +283 -0
  66. dsat/workflows/search/automind_workflow.py +237 -0
  67. dsat/workflows/templates/__init__.py +0 -0
  68. dsat/workflows/templates/basic_kaggle_loop.py +71 -0
  69. dslighting/__init__.py +170 -0
  70. dslighting/core/__init__.py +13 -0
  71. dslighting/core/agent.py +646 -0
  72. dslighting/core/config_builder.py +318 -0
  73. dslighting/core/data_loader.py +422 -0
  74. dslighting/core/task_detector.py +422 -0
  75. dslighting/utils/__init__.py +19 -0
  76. dslighting/utils/defaults.py +151 -0
  77. dslighting-1.3.9.dist-info/METADATA +554 -0
  78. dslighting-1.3.9.dist-info/RECORD +80 -0
  79. dslighting-1.3.9.dist-info/WHEEL +5 -0
  80. dslighting-1.3.9.dist-info/top_level.txt +2 -0
dsat/runner.py ADDED
@@ -0,0 +1,554 @@
1
+ # dsat/runner.py
2
+ import logging
3
+ import shutil
4
+ import uuid
5
+ import json
6
+ import time
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Callable, Coroutine, Any, Tuple, Dict, Type, Optional, List
10
+
11
+ # Core configuration and models
12
+ from dsat.config import DSATConfig
13
+ from dsat.models.task import TaskDefinition, TaskType
14
+
15
+ # Services and workflows
16
+ from dsat.services.llm import LLMService
17
+ from dsat.workflows.base import DSATWorkflow
18
+
19
+ # Dynamic components (factories and handlers)
20
+ from dsat.tasks.handlers import TaskHandler, KaggleTaskHandler, QATaskHandler, DataSciTaskHandler, OpenEndedTaskHandler
21
+ from dsat.workflows.factory import (
22
+ WorkflowFactory,
23
+ AutoMindWorkflowFactory,
24
+ AIDEWorkflowFactory,
25
+ DSAgentWorkflowFactory,
26
+ DataInterpreterWorkflowFactory,
27
+ AutoKaggleWorkflowFactory,
28
+ AFlowWorkflowFactory,
29
+ DeepAnalyzeWorkflowFactory,
30
+ DynamicWorkflowFactory,
31
+ )
32
+ # Import AFlow workflow for type checking
33
+ from dsat.workflows.search.aflow_workflow import AFlowWorkflow
34
+ from dsat.services.states.journal import JournalState
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ # ==============================================================================
39
+ # == COMPONENT REGISTRIES ==
40
+ # ==============================================================================
41
+
42
+ WORKFLOW_FACTORIES: Dict[str, WorkflowFactory] = {
43
+ "automind": AutoMindWorkflowFactory(),
44
+ "aide": AIDEWorkflowFactory(),
45
+ "dsagent": DSAgentWorkflowFactory(),
46
+ "data_interpreter": DataInterpreterWorkflowFactory(),
47
+ "autokaggle": AutoKaggleWorkflowFactory(),
48
+ "aflow": AFlowWorkflowFactory(),
49
+ "deepanalyze": DeepAnalyzeWorkflowFactory(),
50
+ }
51
+
52
+ TASK_HANDLER_CLASSES: Dict[TaskType, Type[TaskHandler]] = {
53
+ "kaggle": KaggleTaskHandler,
54
+ "qa": QATaskHandler,
55
+ "datasci": DataSciTaskHandler,
56
+ "open_ended": OpenEndedTaskHandler,
57
+ # "code": CodeTaskHandler, # future extension
58
+ }
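+ # Both registries hold only the built-in defaults; DSATRunner copies WORKFLOW_FACTORIES per
+ # instance, so additional factories can be added at runtime via register_workflow() below.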
59
+
60
+
61
+ # ==============================================================================
62
+ # == DSAT RUNNER ==
63
+ # ==============================================================================
64
+
65
+ class DSATRunner:
66
+ """
67
+ Orchestrates benchmarking tasks by instantiating workflows, preparing inputs,
68
+ executing runs, and collecting telemetry for later inspection.
69
+ """
70
+
71
+ def __init__(self, config: DSATConfig):
72
+ logger.info(f"Initializing DSATRunner for workflow: '{config.workflow.name}'")
73
+ self.config = config
74
+ self.factories = WORKFLOW_FACTORIES.copy()
75
+ self.factory = self.factories.get(config.workflow.name)
76
+ if not self.factory:
77
+ available = ", ".join(self.factories.keys())
78
+ raise ValueError(f"Unknown workflow '{config.workflow.name}'. Available workflows: [{available}]")
79
+
80
+ self.handler_classes = TASK_HANDLER_CLASSES
81
+ self.benchmark = None
82
+ self.run_records: List[Dict[str, Any]] = []
83
+
84
+ logger.info("DSATRunner is ready to evaluate tasks.")
85
+
86
+ def register_workflow(self, name: str, factory: WorkflowFactory) -> None:
87
+ """
88
+ Register a workflow factory dynamically for this runner instance.
89
+ This is essential for paradigms such as AFlow, which synthesize workflows at runtime.
90
+ """
91
+ logger.info(f"Registering workflow '{name}' for this runner instance.")
92
+ self.factories[name] = factory
93
+ if self.config.workflow and self.config.workflow.name == name:
94
+ self.factory = factory
95
+ logger.info(f"Active workflow factory switched to '{name}'.")
96
+
97
+ def get_eval_function(self) -> Callable[[TaskDefinition], Coroutine[Any, Any, Tuple[Any, float, Dict[str, Any]]]]:
98
+ """
99
+ Produce an async function that evaluates a single TaskDefinition and returns (result, cost, usage_summary).
100
+ Benchmark drivers call this function repeatedly for each competition/task.
101
+ """
102
+
103
+ async def eval_function(task: TaskDefinition) -> Tuple[Any, float, Dict[str, Any]]:
104
+ logger.info(f"Starting evaluation for task '{task.task_id}' (type='{task.task_type}').")
105
+
106
+ # If a specific run name is provided in config, use it. Otherwise, generate one.
107
+ if self.config.run.name and self.config.run.name != "dsat_run":
108
+ task_run_name = self.config.run.name
109
+ else:
110
+ safe_task_id = "".join(c if c.isalnum() else "_" for c in task.task_id)
111
+ unique_suffix = uuid.uuid4().hex[:8]
112
+ task_run_name = f"{self.config.run.name}_{safe_task_id}_{unique_suffix}"
113
+
114
+ task_config = self.config.model_copy(deep=True)
115
+ task_config.run.name = task_run_name
116
+
117
+ workflow: Optional[DSATWorkflow] = None
118
+ workspace_service = None
119
+ sandbox_service = None
120
+ llm_service: Optional[LLMService] = None
121
+ result: Any = None
122
+ run_total_cost = 0.0
123
+ run_started_at = datetime.utcnow()
124
+ run_perf_start = time.perf_counter()
125
+
126
+ benchmark_instance = self.benchmark
127
+
128
+ try:
129
+ workflow = self.factory.create_workflow(task_config, benchmark=benchmark_instance)
130
+ workspace_service = workflow.services.get("workspace")
131
+ llm_service = workflow.services.get("llm")
132
+ sandbox_service = workflow.services.get("sandbox")
133
+
134
+ if isinstance(workflow, AFlowWorkflow):
135
+ optimizer_name = "AFLOW"
136
+ logger.info("Detected %s workflow. Running meta-optimization stage.", optimizer_name)
137
+ best_workflow_code = await workflow.optimize()
138
+ logger.info("Meta-optimization complete. Proceeding with final evaluation workflow.")
139
+
140
+ if hasattr(benchmark_instance, 'set_mode'):
141
+ logger.info("Switching benchmark to 'test' mode for final %s evaluation.", optimizer_name)
142
+ benchmark_instance.set_mode('test')
143
+
144
+ dynamic_factory = DynamicWorkflowFactory(code_string=best_workflow_code)
145
+ workflow = dynamic_factory.create_workflow(task_config, benchmark=benchmark_instance)
146
+ llm_service = workflow.services.get("llm")
147
+ sandbox_service = workflow.services.get("sandbox")
148
+ workspace_service = workflow.services.get("workspace")
149
+ logger.info("Final %s workflow instantiated and ready.", optimizer_name)
150
+
151
+ workspace_service = workspace_service or workflow.services.get("workspace")
152
+ llm_service = llm_service or workflow.services.get("llm")
153
+ sandbox_service = sandbox_service or workflow.services.get("sandbox")
154
+
155
+ if not llm_service:
156
+ logger.error("Workflow did not expose an LLMService.")
157
+ return "[ERROR] Missing LLM service", 0.0
158
+
159
+ handler_class = self.handler_classes.get(task.task_type)
160
+ if not handler_class:
161
+ logger.error(f"No handler registered for task type '{task.task_type}'.")
162
+ return f"[ERROR] Unsupported task type '{task.task_type}'", 0.0
163
+
164
+ handler: TaskHandler = handler_class()
165
+
166
+ description, io_instructions = "", ""
167
+ data_dir, output_path = None, None
168
+
169
+ try:
170
+ description, io_instructions, data_dir, output_path = handler.prepare_input(task)
171
+
172
+ if workspace_service:
173
+ try:
174
+ workspace_service.link_data_to_workspace(data_dir)
175
+ except Exception as link_error:
176
+ raise RuntimeError(f"Failed to link data directory: {link_error}") from link_error
177
+ else:
178
+ logger.warning("WorkspaceService missing; skipping data linkage.")
179
+
180
+ await workflow.solve(
181
+ description=description,
182
+ io_instructions=io_instructions,
183
+ data_dir=data_dir,
184
+ output_path=output_path
185
+ )
186
+
187
+ if workspace_service and output_path:
188
+ sandbox_workdir = workspace_service.get_path("sandbox_workdir")
189
+ generated_file = sandbox_workdir / output_path.name
190
+ if generated_file.exists():
191
+ output_path.parent.mkdir(parents=True, exist_ok=True)
192
+ if generated_file.resolve() != output_path.resolve():
193
+ logger.info(f"Collecting produced artifact '{output_path.name}' from the sandbox.")
194
+
195
+ # Handle both files and directories (e.g., for open-ended tasks)
196
+ if generated_file.is_dir():
197
+ # For directories (like 'artifacts'), use copytree
198
+ if output_path.exists():
199
+ if output_path.is_dir():
200
+ shutil.rmtree(output_path)
201
+ else:
202
+ output_path.unlink()
203
+ shutil.copytree(generated_file, output_path)
204
+ logger.info(f"Copied directory '{generated_file}' to '{output_path}'")
205
+ else:
206
+ # For files, use regular copy
207
+ shutil.copy(generated_file, output_path)
208
+ logger.info(f"Copied file '{generated_file}' to '{output_path}'")
209
+ else:
210
+ logger.warning(f"No output '{output_path.name}' found in sandbox '{sandbox_workdir}' after workflow execution.")
211
+
212
+ if output_path:
213
+ result = handler.parse_output(output_path)
214
+
215
+ # Grade the submission if benchmark is available
216
+ if benchmark_instance and hasattr(benchmark_instance, 'grade') and isinstance(result, Path):
217
+ try:
218
+ logger.info(f"Grading submission: {result}")
219
+ score = await benchmark_instance.grade(result)
220
+ logger.info(f"✓ Grading complete | Score: {score}")
221
+ # Return score as result
222
+ result = {"score": score, "submission_path": str(result)}
223
+ except Exception as grade_error:
224
+ logger.warning(f"Grading failed: {grade_error}")
225
+ # Keep the path as result if grading fails
226
+ logger.info(f"Submission created at: {result}")
227
+ elif isinstance(result, Path):
228
+ logger.info(f"Submission created at: {result}")
229
+
230
+ logger.info(f"Task '{task.task_id}' evaluation finished successfully.")
231
+
232
+ except Exception as execution_error:
233
+ logger.error(f"Task '{task.task_id}' failed: {execution_error}", exc_info=True)
234
+ result = f"[ERROR] {execution_error.__class__.__name__}: {execution_error}"
235
+ finally:
236
+ handler.cleanup()
237
+ if workspace_service:
238
+ ended_at = datetime.utcnow()
239
+ duration_sec = round(time.perf_counter() - run_perf_start, 4)
240
+ run_total_cost = llm_service.get_total_cost() if llm_service else 0.0
241
+ try:
242
+ self._persist_run_metadata(
243
+ workspace_service=workspace_service,
244
+ task_config=task_config,
245
+ task=task,
246
+ description=description,
247
+ io_instructions=io_instructions,
248
+ data_dir=data_dir,
249
+ output_path=output_path,
250
+ result=result,
251
+ llm_service=llm_service,
252
+ sandbox_service=sandbox_service,
253
+ workflow=workflow,
254
+ started_at=run_started_at,
255
+ ended_at=ended_at,
256
+ duration_seconds=duration_sec,
257
+ total_cost=run_total_cost
258
+ )
259
+ except Exception as persist_error:
260
+ logger.error(f"Failed to persist telemetry for task '{task.task_id}': {persist_error}", exc_info=True)
261
+
262
+ failed = isinstance(result, str) and result.startswith("[ERROR]")
263
+ keep_on_fail = self.config.run.keep_workspace_on_failure
264
+ keep_all = self.config.run.keep_all_workspaces
265
+ workspace_service.cleanup(keep_workspace=keep_all or (failed and keep_on_fail))
266
+
267
+ except Exception as workflow_error:
268
+ logger.error(f"Workflow creation failed for task '{task.task_id}': {workflow_error}", exc_info=True)
269
+ result = f"[ERROR] {workflow_error.__class__.__name__}: {workflow_error}"
270
+ if workspace_service:
271
+ ended_at = datetime.utcnow()
272
+ duration_sec = round(time.perf_counter() - run_perf_start, 4)
273
+ run_total_cost = llm_service.get_total_cost() if llm_service else 0.0
274
+ try:
275
+ self._persist_run_metadata(
276
+ workspace_service=workspace_service,
277
+ task_config=task_config,
278
+ task=task,
279
+ description="",
280
+ io_instructions="",
281
+ data_dir=None,
282
+ output_path=None,
283
+ result=result,
284
+ llm_service=llm_service,
285
+ sandbox_service=sandbox_service,
286
+ workflow=workflow,
287
+ started_at=run_started_at,
288
+ ended_at=ended_at,
289
+ duration_seconds=duration_sec,
290
+ total_cost=run_total_cost
291
+ )
292
+ except Exception as persist_error:
293
+ logger.error(f"Telemetry persistence failed after workflow creation error: {persist_error}", exc_info=True)
294
+ workspace_service.cleanup(keep_workspace=True)
295
+
296
+ run_total_cost = llm_service.get_total_cost() if llm_service else run_total_cost
297
+ usage_summary = llm_service.get_usage_summary() if llm_service else {}
298
+ logger.info(f"Task '{task.task_id}' LLM cost: ${run_total_cost:.6f}")
299
+ return result, run_total_cost, usage_summary
300
+
301
+ return eval_function
302
+
303
+ def get_run_records(self) -> List[Dict[str, Any]]:
304
+ """
305
+ Return a shallow copy of stored run metadata records for summary rendering.
306
+ """
307
+ return [record.copy() for record in self.run_records]
308
+
309
+ def _persist_run_metadata(
310
+ self,
311
+ *,
312
+ workspace_service,
313
+ task_config: DSATConfig,
314
+ task: TaskDefinition,
315
+ description: str,
316
+ io_instructions: str,
317
+ data_dir: Optional[Path],
318
+ output_path: Optional[Path],
319
+ result: Any,
320
+ llm_service: Optional[LLMService],
321
+ sandbox_service: Optional[Any],
322
+ workflow: Optional[DSATWorkflow],
323
+ started_at: datetime,
324
+ ended_at: datetime,
325
+ duration_seconds: float,
326
+ total_cost: float,
327
+ ) -> None:
328
+ """
329
+ Write per-task telemetry (LLM calls, sandbox runs, search tree, summary) to the workspace.
330
+ """
331
+ telemetry_dir = "telemetry"
332
+ workspace_dir = workspace_service.get_path("run_dir")
333
+ llm_calls = llm_service.get_call_history() if llm_service else []
334
+ sandbox_runs = sandbox_service.get_execution_history() if sandbox_service else []
335
+ usage_summary = llm_service.get_usage_summary() if llm_service else {}
336
+ best_node = self._get_best_node(workflow)
337
+ search_tree_data, search_tree_info = self._extract_search_tree(workflow, best_node)
338
+
339
+ config_snapshot = task_config.model_dump()
340
+ if "llm" in config_snapshot and "api_key" in config_snapshot["llm"]:
341
+ config_snapshot["llm"]["api_key"] = "***REDACTED***"
342
+
343
+ benchmark_snapshot = self._build_benchmark_snapshot()
344
+
345
+ final_code_path: Optional[str] = None
346
+ final_candidate = workspace_service.get_path("artifacts") / "final_submission" / "final_solution.py"
347
+ if final_candidate.exists():
348
+ final_code_path = str(final_candidate)
349
+ elif best_node:
350
+ final_code_path = best_node.final_submission_path or best_node.code_artifact_path
351
+
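+ # Keep only run parameters that carry a value; None and empty strings/collections are
+ # dropped so they do not clutter run_metadata.json.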
352
+ filtered_parameters = {
353
+ key: value for key, value in (task_config.run.parameters or {}).items()
354
+ if value not in (None, "", [], {})
355
+ }
356
+
357
+ metadata = {
358
+ "run_name": task_config.run.name,
359
+ "workspace_dir": str(workspace_dir),
360
+ "workflow": task_config.workflow.name if task_config.workflow else None,
361
+ "parameters": filtered_parameters,
362
+ "benchmark": benchmark_snapshot,
363
+ "task": {
364
+ "task_id": task.task_id,
365
+ "task_type": task.task_type,
366
+ "payload": task.payload,
367
+ },
368
+ "task_context": {
369
+ "description": description,
370
+ "io_instructions": io_instructions,
371
+ "data_dir": str(data_dir) if data_dir else None,
372
+ "expected_output_path": str(output_path) if output_path else None,
373
+ },
374
+ "timeline": {
375
+ "started_at_utc": started_at.isoformat() + "Z",
376
+ "ended_at_utc": ended_at.isoformat() + "Z",
377
+ "duration_seconds": duration_seconds,
378
+ },
379
+ "summary": {
380
+ "result": self._format_result(result),
381
+ "success": not (isinstance(result, str) and result.startswith("[ERROR]")),
382
+ "total_cost": total_cost,
383
+ "usage": usage_summary,
384
+ "cost_per_token": usage_summary.get("cost_per_token"),
385
+ "llm_call_count": len(llm_calls),
386
+ "sandbox_run_count": len(sandbox_runs),
387
+ "final_code": best_node.code if best_node else None,
388
+ "final_code_path": final_code_path,
389
+ "best_node_id": best_node.id if best_node else None,
390
+ "best_path_node_ids": search_tree_info.get("best_path"),
391
+ },
392
+ "config_snapshot": config_snapshot,
393
+ }
394
+
395
+ # Save Final Code to a standard location
396
+ if best_node and best_node.code:
397
+ final_code_file = workspace_service.get_path("run_dir") / "final_solution.py"
398
+ with open(final_code_file, "w", encoding="utf-8") as f:
399
+ f.write(best_node.code)
400
+ metadata["summary"]["final_code_path"] = str(final_code_file)
401
+
402
+ # 💾 NEW: Save model training code to code_history directory
403
+ try:
404
+ code_history_dir = workspace_service.get_path("sandbox_workdir") / "code_history"
405
+ code_history_dir.mkdir(parents=True, exist_ok=True)
406
+
407
+ # Find next available number for model training code
408
+ import re
409
+ existing_model_codes = list(code_history_dir.glob("model_code_*.py"))
410
+ if existing_model_codes:
411
+ numbers = []
412
+ for f in existing_model_codes:
413
+ match = re.search(r'model_code_(\d+)\.py', f.name)
414
+ if match:
415
+ numbers.append(int(match.group(1)))
416
+ next_num = max(numbers) + 1 if numbers else 1
417
+ else:
418
+ next_num = 1
419
+
420
+ # Save with formatted number and metadata
421
+ model_code_filename = f"model_code_{next_num:03d}.py"
422
+ model_code_filepath = code_history_dir / model_code_filename
423
+
424
+ # Add header with training metadata
425
+ import datetime
426
+ header = f'''# Code Type: MODEL TRAINING
427
+ # Workflow: {task_config.workflow.name if task_config.workflow else 'Unknown'}
428
+ # Model: {task_config.llm.model if task_config.llm else 'Unknown'}
429
+ # Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
430
+ # Task ID: {task.task_id}
431
+ # Success: {not (isinstance(result, str) and result.startswith("[ERROR]"))}
432
+
433
+ '''
434
+ model_code_filepath.write_text(header + best_node.code)
435
+ logger.info(f"💾 Saved model training code to workspace: {model_code_filepath}")
436
+ except Exception as e:
437
+ logger.warning(f"Failed to save model training code to code_history: {e}")
438
+
439
+ # Save Evaluation Result to a CSV in workspace
440
+ if isinstance(result, (float, int, str)) and not str(result).startswith("[ERROR]"):
441
+ try:
442
+ res_file = workspace_service.get_path("run_dir") / "evaluation_result.csv"
443
+ with open(res_file, "w") as f:
444
+ f.write("task_id,score,cost,duration\n")
445
+ f.write(f"{task.task_id},{result},{total_cost},{duration_seconds}\n")
446
+ except Exception:
447
+ pass
448
+
449
+ detail_files = {}
450
+ if llm_calls:
451
+ llm_calls_path = f"{telemetry_dir}/llm_calls.jsonl"
452
+ self._write_jsonl(workspace_service, llm_calls_path, llm_calls)
453
+ detail_files["llm_calls"] = f"artifacts/{llm_calls_path}"
454
+ if sandbox_runs:
455
+ sandbox_runs_path = f"{telemetry_dir}/sandbox_runs.jsonl"
456
+ self._write_jsonl(workspace_service, sandbox_runs_path, sandbox_runs)
457
+ detail_files["sandbox_runs"] = f"artifacts/{sandbox_runs_path}"
458
+ if search_tree_data:
459
+ search_tree_path = f"{telemetry_dir}/search_tree.json"
460
+ workspace_service.write_file(
461
+ json.dumps(search_tree_data, ensure_ascii=False, indent=2),
462
+ "artifacts",
463
+ search_tree_path
464
+ )
465
+ detail_files["search_tree"] = f"artifacts/{search_tree_path}"
466
+ metadata["search_tree"] = {
467
+ "node_count": len(search_tree_data),
468
+ "best_node_id": search_tree_info.get("best_node_id"),
469
+ "best_path_node_ids": search_tree_info.get("best_path"),
470
+ "file": f"artifacts/{search_tree_path}",
471
+ }
472
+ else:
473
+ metadata["search_tree"] = None
474
+
475
+ if detail_files:
476
+ metadata["detail_files"] = detail_files
477
+
478
+ run_metadata_path = f"{telemetry_dir}/run_metadata.json"
479
+ workspace_service.write_file(
480
+ json.dumps(metadata, ensure_ascii=False, indent=2),
481
+ "artifacts",
482
+ run_metadata_path
483
+ )
484
+
485
+ metadata_file = workspace_service.get_path("artifacts") / run_metadata_path
486
+ record_entry = {
487
+ "task_id": task.task_id,
488
+ "metadata_path": str(metadata_file),
489
+ "workspace_dir": metadata["workspace_dir"],
490
+ "summary": metadata["summary"],
491
+ "timeline": metadata["timeline"],
492
+ "parameters": metadata["parameters"],
493
+ "detail_files": metadata.get("detail_files"),
494
+ }
495
+ self.run_records.append(record_entry)
496
+
497
+ def _write_jsonl(self, workspace_service, relative_path: str, records: List[Dict[str, Any]]) -> None:
498
+ """Write newline-delimited JSON records to an artifacts sub-path."""
499
+ content = "\n".join(json.dumps(record, ensure_ascii=False) for record in records)
500
+ workspace_service.write_file(content, "artifacts", relative_path)
501
+
502
+ def _format_result(self, result: Any) -> Any:
503
+ """Return a serialization-friendly representation of the workflow result."""
504
+ if isinstance(result, Path):
505
+ return str(result)
506
+ return result
507
+
508
+ def _get_best_node(self, workflow: Optional[DSATWorkflow]):
509
+ if not workflow or not hasattr(workflow, "state"):
510
+ return None
511
+ state = workflow.state
512
+ if isinstance(state, JournalState):
513
+ return state.get_best_node()
514
+ return None
515
+
516
+ def _extract_search_tree(self, workflow: Optional[DSATWorkflow], best_node: Optional[Any]):
517
+ if not workflow or not hasattr(workflow, "state"):
518
+ return None, {"best_node_id": None, "best_path": None}
519
+ state = workflow.state
520
+ if not isinstance(state, JournalState):
521
+ return None, {"best_node_id": None, "best_path": None}
522
+
523
+ nodes = [
524
+ node.model_dump(mode="json")
525
+ for node in sorted(state.nodes.values(), key=lambda n: n.step)
526
+ ]
527
+ best_path = self._extract_best_path(state, best_node)
528
+ info = {
529
+ "best_node_id": best_node.id if best_node else None,
530
+ "best_path": best_path,
531
+ }
532
+ return nodes, info
533
+
534
+ def _extract_best_path(self, state: JournalState, best_node: Optional[Any]) -> Optional[List[str]]:
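+ # Walk parent links from the best node back to the root, then reverse so the path reads root -> best.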
535
+ if not best_node:
536
+ return None
537
+ path: List[str] = []
538
+ current = best_node
539
+ while current:
540
+ path.append(current.id)
541
+ current = state.get_node(current.parent_id) if current.parent_id else None
542
+ return list(reversed(path))
543
+
544
+ def _build_benchmark_snapshot(self) -> Optional[Dict[str, Any]]:
545
+ if not self.benchmark:
546
+ return None
547
+ snapshot: Dict[str, Any] = {"name": getattr(self.benchmark, "name", None)}
548
+ data_dir = getattr(self.benchmark, "data_dir", None)
549
+ if data_dir is not None:
550
+ snapshot["data_dir"] = str(data_dir)
551
+ config_value = getattr(self.benchmark, "config", None)
552
+ if isinstance(config_value, dict):
553
+ snapshot["config"] = config_value
554
+ return snapshot
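For orientation, a minimal driver sketch based only on the interfaces visible in runner.py; the exact constructor arguments of DSATConfig and TaskDefinition live in dsat/config.py and dsat/models/task.py (not shown here), and my_benchmark stands in for any object exposing the grade()/set_mode() hooks referenced above:

    import asyncio
    from dsat.config import DSATConfig
    from dsat.models.task import TaskDefinition
    from dsat.runner import DSATRunner

    config = DSATConfig(...)            # fields defined in dsat/config.py
    runner = DSATRunner(config)         # raises ValueError if config.workflow.name is not registered
    runner.benchmark = my_benchmark     # optional; used for grading and AFlow test-mode switching

    eval_fn = runner.get_eval_function()
    task = TaskDefinition(...)          # task_id, task_type ("kaggle", "qa", "datasci", "open_ended"), payload

    result, cost, usage = asyncio.run(eval_fn(task))   # (result, total LLM cost, usage summary)
    records = runner.get_run_records()                 # telemetry summaries persisted per task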