dslighting-1.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
dsat/runner.py
ADDED
@@ -0,0 +1,554 @@
+# dsat/runner.py
+import logging
+import shutil
+import uuid
+import json
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Callable, Coroutine, Any, Tuple, Dict, Type, Optional, List
+
+# Core configuration and models
+from dsat.config import DSATConfig
+from dsat.models.task import TaskDefinition, TaskType
+
+# Services and workflows
+from dsat.services.llm import LLMService
+from dsat.workflows.base import DSATWorkflow
+
+# Dynamic components (factories and handlers)
+from dsat.tasks.handlers import TaskHandler, KaggleTaskHandler, QATaskHandler, DataSciTaskHandler, OpenEndedTaskHandler
+from dsat.workflows.factory import (
+    WorkflowFactory,
+    AutoMindWorkflowFactory,
+    AIDEWorkflowFactory,
+    DSAgentWorkflowFactory,
+    DataInterpreterWorkflowFactory,
+    AutoKaggleWorkflowFactory,
+    AFlowWorkflowFactory,
+    DeepAnalyzeWorkflowFactory,
+    DynamicWorkflowFactory,
+)
+# Import AFlow workflow for type checking
+from dsat.workflows.search.aflow_workflow import AFlowWorkflow
+from dsat.services.states.journal import JournalState
+
+logger = logging.getLogger(__name__)
+
+# ==============================================================================
+# == COMPONENT REGISTRIES ==
+# ==============================================================================
+
+WORKFLOW_FACTORIES: Dict[str, WorkflowFactory] = {
+    "automind": AutoMindWorkflowFactory(),
+    "aide": AIDEWorkflowFactory(),
+    "dsagent": DSAgentWorkflowFactory(),
+    "data_interpreter": DataInterpreterWorkflowFactory(),
+    "autokaggle": AutoKaggleWorkflowFactory(),
+    "aflow": AFlowWorkflowFactory(),
+    "deepanalyze": DeepAnalyzeWorkflowFactory(),
+}
+
+TASK_HANDLER_CLASSES: Dict[TaskType, Type[TaskHandler]] = {
+    "kaggle": KaggleTaskHandler,
+    "qa": QATaskHandler,
+    "datasci": DataSciTaskHandler,
+    "open_ended": OpenEndedTaskHandler,
+    # "code": CodeTaskHandler, # future extension
+}
+
+
+# ==============================================================================
+# == DSAT RUNNER ==
+# ==============================================================================
+
+class DSATRunner:
+    """
+    Orchestrates benchmarking tasks by instantiating workflows, preparing inputs,
+    executing runs, and collecting telemetry for later inspection.
+    """
+
+    def __init__(self, config: DSATConfig):
+        logger.info(f"Initializing DSATRunner for workflow: '{config.workflow.name}'")
+        self.config = config
+        self.factories = WORKFLOW_FACTORIES.copy()
+        self.factory = self.factories.get(config.workflow.name)
+        if not self.factory:
+            available = ", ".join(self.factories.keys())
+            raise ValueError(f"Unknown workflow '{config.workflow.name}'. Available workflows: [{available}]")
+
+        self.handler_classes = TASK_HANDLER_CLASSES
+        self.benchmark = None
+        self.run_records: List[Dict[str, Any]] = []
+
+        logger.info("DSATRunner is ready to evaluate tasks.")
+
+    def register_workflow(self, name: str, factory: WorkflowFactory) -> None:
+        """
+        Register a workflow factory dynamically for this runner instance.
+        Critical for paradigms like AFLOW which synthesize workflows at runtime.
+        """
+        logger.info(f"Registering workflow '{name}' for this runner instance.")
+        self.factories[name] = factory
+        if self.config.workflow and self.config.workflow.name == name:
+            self.factory = factory
+            logger.info(f"Active workflow factory switched to '{name}'.")
+
+    def get_eval_function(self) -> Callable[[TaskDefinition], Coroutine[Any, Any, Tuple[Any, float, Dict[str, Any]]]]:
+        """
+        Produce an async function that evaluates a single TaskDefinition and returns (result, cost, usage_summary).
+        Benchmark drivers call this function repeatedly for each competition/task.
+        """
+
+        async def eval_function(task: TaskDefinition) -> Tuple[Any, float, Dict[str, Any]]:
+            logger.info(f"Starting evaluation for task '{task.task_id}' (type='{task.task_type}').")
+
+            # If a specific run name is provided in config, use it. Otherwise, generate one.
+            if self.config.run.name and self.config.run.name != "dsat_run":
+                task_run_name = self.config.run.name
+            else:
+                safe_task_id = "".join(c if c.isalnum() else "_" for c in task.task_id)
+                unique_suffix = uuid.uuid4().hex[:8]
+                task_run_name = f"{self.config.run.name}_{safe_task_id}_{unique_suffix}"
+
+            task_config = self.config.model_copy(deep=True)
+            task_config.run.name = task_run_name
+
+            workflow: Optional[DSATWorkflow] = None
+            workspace_service = None
+            sandbox_service = None
+            llm_service: Optional[LLMService] = None
+            result: Any = None
+            run_total_cost = 0.0
+            run_started_at = datetime.utcnow()
+            run_perf_start = time.perf_counter()
+
+            benchmark_instance = self.benchmark
+
+            try:
+                workflow = self.factory.create_workflow(task_config, benchmark=benchmark_instance)
+                workspace_service = workflow.services.get("workspace")
+                llm_service = workflow.services.get("llm")
+                sandbox_service = workflow.services.get("sandbox")
+
+                if isinstance(workflow, AFlowWorkflow):
+                    optimizer_name = "AFLOW"
+                    logger.info("Detected %s workflow. Running meta-optimization stage.", optimizer_name)
+                    best_workflow_code = await workflow.optimize()
+                    logger.info("Meta-optimization complete. Proceeding with final evaluation workflow.")
+
+                    if hasattr(benchmark_instance, 'set_mode'):
+                        logger.info("Switching benchmark to 'test' mode for final %s evaluation.", optimizer_name)
+                        benchmark_instance.set_mode('test')
+
+                    dynamic_factory = DynamicWorkflowFactory(code_string=best_workflow_code)
+                    workflow = dynamic_factory.create_workflow(task_config, benchmark=benchmark_instance)
+                    llm_service = workflow.services.get("llm")
+                    sandbox_service = workflow.services.get("sandbox")
+                    workspace_service = workflow.services.get("workspace")
+                    logger.info("Final %s workflow instantiated and ready.", optimizer_name)
+
+                workspace_service = workspace_service or workflow.services.get("workspace")
+                llm_service = llm_service or workflow.services.get("llm")
+                sandbox_service = sandbox_service or workflow.services.get("sandbox")
+
+                if not llm_service:
+                    logger.error("Workflow did not expose an LLMService.")
+                    return "[ERROR] Missing LLM service", 0.0
+
+                handler_class = self.handler_classes.get(task.task_type)
+                if not handler_class:
+                    logger.error(f"No handler registered for task type '{task.task_type}'.")
+                    return f"[ERROR] Unsupported task type '{task.task_type}'", 0.0
+
+                handler: TaskHandler = handler_class()
+
+                description, io_instructions = "", ""
+                data_dir, output_path = None, None
+
+                try:
+                    description, io_instructions, data_dir, output_path = handler.prepare_input(task)
+
+                    if workspace_service:
+                        try:
+                            workspace_service.link_data_to_workspace(data_dir)
+                        except Exception as link_error:
+                            raise RuntimeError(f"Failed to link data directory: {link_error}") from link_error
+                    else:
+                        logger.warning("WorkspaceService missing; skipping data linkage.")
+
+                    await workflow.solve(
+                        description=description,
+                        io_instructions=io_instructions,
+                        data_dir=data_dir,
+                        output_path=output_path
+                    )
+
+                    if workspace_service and output_path:
+                        sandbox_workdir = workspace_service.get_path("sandbox_workdir")
+                        generated_file = sandbox_workdir / output_path.name
+                        if generated_file.exists():
+                            output_path.parent.mkdir(parents=True, exist_ok=True)
+                            if generated_file.resolve() != output_path.resolve():
+                                logger.info(f"Collecting produced artifact '{output_path.name}' from the sandbox.")
+
+                                # Handle both files and directories (e.g., for open-ended tasks)
+                                if generated_file.is_dir():
+                                    # For directories (like 'artifacts'), use copytree
+                                    if output_path.exists():
+                                        if output_path.is_dir():
+                                            shutil.rmtree(output_path)
+                                        else:
+                                            output_path.unlink()
+                                    shutil.copytree(generated_file, output_path)
+                                    logger.info(f"Copied directory '{generated_file}' to '{output_path}'")
+                                else:
+                                    # For files, use regular copy
+                                    shutil.copy(generated_file, output_path)
+                                    logger.info(f"Copied file '{generated_file}' to '{output_path}'")
+                        else:
+                            logger.warning(f"No output '{output_path.name}' found in sandbox '{sandbox_workdir}' after workflow execution.")
+
+                    if output_path:
+                        result = handler.parse_output(output_path)
+
+                        # Grade the submission if benchmark is available
+                        if benchmark_instance and hasattr(benchmark_instance, 'grade') and isinstance(result, Path):
+                            try:
+                                logger.info(f"Grading submission: {result}")
+                                score = await benchmark_instance.grade(result)
+                                logger.info(f"✓ Grading complete | Score: {score}")
+                                # Return score as result
+                                result = {"score": score, "submission_path": str(result)}
+                            except Exception as grade_error:
+                                logger.warning(f"Grading failed: {grade_error}")
+                                # Keep the path as result if grading fails
+                                logger.info(f"Submission created at: {result}")
+                        elif isinstance(result, Path):
+                            logger.info(f"Submission created at: {result}")
+
+                    logger.info(f"Task '{task.task_id}' evaluation finished successfully.")
+
+                except Exception as execution_error:
+                    logger.error(f"Task '{task.task_id}' failed: {execution_error}", exc_info=True)
+                    result = f"[ERROR] {execution_error.__class__.__name__}: {execution_error}"
+                finally:
+                    handler.cleanup()
+                    if workspace_service:
+                        ended_at = datetime.utcnow()
+                        duration_sec = round(time.perf_counter() - run_perf_start, 4)
+                        run_total_cost = llm_service.get_total_cost() if llm_service else 0.0
+                        try:
+                            self._persist_run_metadata(
+                                workspace_service=workspace_service,
+                                task_config=task_config,
+                                task=task,
+                                description=description,
+                                io_instructions=io_instructions,
+                                data_dir=data_dir,
+                                output_path=output_path,
+                                result=result,
+                                llm_service=llm_service,
+                                sandbox_service=sandbox_service,
+                                workflow=workflow,
+                                started_at=run_started_at,
+                                ended_at=ended_at,
+                                duration_seconds=duration_sec,
+                                total_cost=run_total_cost
+                            )
+                        except Exception as persist_error:
+                            logger.error(f"Failed to persist telemetry for task '{task.task_id}': {persist_error}", exc_info=True)
+
+                        failed = isinstance(result, str) and result.startswith("[ERROR]")
+                        keep_on_fail = self.config.run.keep_workspace_on_failure
+                        keep_all = self.config.run.keep_all_workspaces
+                        workspace_service.cleanup(keep_workspace=keep_all or (failed and keep_on_fail))
+
+            except Exception as workflow_error:
+                logger.error(f"Workflow creation failed for task '{task.task_id}': {workflow_error}", exc_info=True)
+                result = f"[ERROR] {workflow_error.__class__.__name__}: {workflow_error}"
+                if workspace_service:
+                    ended_at = datetime.utcnow()
+                    duration_sec = round(time.perf_counter() - run_perf_start, 4)
+                    run_total_cost = llm_service.get_total_cost() if llm_service else 0.0
+                    try:
+                        self._persist_run_metadata(
+                            workspace_service=workspace_service,
+                            task_config=task_config,
+                            task=task,
+                            description="",
+                            io_instructions="",
+                            data_dir=None,
+                            output_path=None,
+                            result=result,
+                            llm_service=llm_service,
+                            sandbox_service=sandbox_service,
+                            workflow=workflow,
+                            started_at=run_started_at,
+                            ended_at=ended_at,
+                            duration_seconds=duration_sec,
+                            total_cost=run_total_cost
+                        )
+                    except Exception as persist_error:
+                        logger.error(f"Telemetry persistence failed after workflow creation error: {persist_error}", exc_info=True)
+                    workspace_service.cleanup(keep_workspace=True)
+
+            run_total_cost = llm_service.get_total_cost() if llm_service else run_total_cost
+            usage_summary = llm_service.get_usage_summary() if llm_service else {}
+            logger.info(f"Task '{task.task_id}' LLM cost: ${run_total_cost:.6f}")
+            return result, run_total_cost, usage_summary
+
+        return eval_function
+
+    def get_run_records(self) -> List[Dict[str, Any]]:
+        """
+        Return a shallow copy of stored run metadata records for summary rendering.
+        """
+        return [record.copy() for record in self.run_records]
+
+    def _persist_run_metadata(
+        self,
+        *,
+        workspace_service,
+        task_config: DSATConfig,
+        task: TaskDefinition,
+        description: str,
+        io_instructions: str,
+        data_dir: Optional[Path],
+        output_path: Optional[Path],
+        result: Any,
+        llm_service: Optional[LLMService],
+        sandbox_service: Optional[Any],
+        workflow: Optional[DSATWorkflow],
+        started_at: datetime,
+        ended_at: datetime,
+        duration_seconds: float,
+        total_cost: float,
+    ) -> None:
+        """
+        Write per-task telemetry (LLM calls, sandbox runs, search tree, summary) to the workspace.
+        """
+        telemetry_dir = "telemetry"
+        workspace_dir = workspace_service.get_path("run_dir")
+        llm_calls = llm_service.get_call_history() if llm_service else []
+        sandbox_runs = sandbox_service.get_execution_history() if sandbox_service else []
+        usage_summary = llm_service.get_usage_summary() if llm_service else {}
+        best_node = self._get_best_node(workflow)
+        search_tree_data, search_tree_info = self._extract_search_tree(workflow, best_node)
+
+        config_snapshot = task_config.model_dump()
+        if "llm" in config_snapshot and "api_key" in config_snapshot["llm"]:
+            config_snapshot["llm"]["api_key"] = "***REDACTED***"
+
+        benchmark_snapshot = self._build_benchmark_snapshot()
+
+        final_code_path: Optional[str] = None
+        final_candidate = workspace_service.get_path("artifacts") / "final_submission" / "final_solution.py"
+        if final_candidate.exists():
+            final_code_path = str(final_candidate)
+        elif best_node:
+            final_code_path = best_node.final_submission_path or best_node.code_artifact_path
+
+        filtered_parameters = {
+            key: value for key, value in (task_config.run.parameters or {}).items()
+            if value not in (None, "", [], {})
+        }
+
+        metadata = {
+            "run_name": task_config.run.name,
+            "workspace_dir": str(workspace_dir),
+            "workflow": task_config.workflow.name if task_config.workflow else None,
+            "parameters": filtered_parameters,
+            "benchmark": benchmark_snapshot,
+            "task": {
+                "task_id": task.task_id,
+                "task_type": task.task_type,
+                "payload": task.payload,
+            },
+            "task_context": {
+                "description": description,
+                "io_instructions": io_instructions,
+                "data_dir": str(data_dir) if data_dir else None,
+                "expected_output_path": str(output_path) if output_path else None,
+            },
+            "timeline": {
+                "started_at_utc": started_at.isoformat() + "Z",
+                "ended_at_utc": ended_at.isoformat() + "Z",
+                "duration_seconds": duration_seconds,
+            },
+            "summary": {
+                "result": self._format_result(result),
+                "success": not (isinstance(result, str) and result.startswith("[ERROR]")),
+                "total_cost": total_cost,
+                "usage": usage_summary,
+                "cost_per_token": usage_summary.get("cost_per_token"),
+                "llm_call_count": len(llm_calls),
+                "sandbox_run_count": len(sandbox_runs),
+                "final_code": best_node.code if best_node else None,
+                "final_code_path": final_code_path,
+                "best_node_id": best_node.id if best_node else None,
+                "best_path_node_ids": search_tree_info.get("best_path"),
+            },
+            "config_snapshot": config_snapshot,
+        }
+
+        # Save Final Code to a standard location
+        if best_node and best_node.code:
+            final_code_file = workspace_service.get_path("run_dir") / "final_solution.py"
+            with open(final_code_file, "w", encoding="utf-8") as f:
+                f.write(best_node.code)
+            metadata["summary"]["final_code_path"] = str(final_code_file)
+
+            # 💾 NEW: Save model training code to code_history directory
+            try:
+                code_history_dir = workspace_service.get_path("sandbox_workdir") / "code_history"
+                code_history_dir.mkdir(parents=True, exist_ok=True)
+
+                # Find next available number for model training code
+                import re
+                existing_model_codes = list(code_history_dir.glob("model_code_*.py"))
+                if existing_model_codes:
+                    numbers = []
+                    for f in existing_model_codes:
+                        match = re.search(r'model_code_(\d+)\.py', f.name)
+                        if match:
+                            numbers.append(int(match.group(1)))
+                    next_num = max(numbers) + 1 if numbers else 1
+                else:
+                    next_num = 1
+
+                # Save with formatted number and metadata
+                model_code_filename = f"model_code_{next_num:03d}.py"
+                model_code_filepath = code_history_dir / model_code_filename
+
+                # Add header with training metadata
+                import datetime
+                header = f'''# Code Type: MODEL TRAINING
+# Workflow: {task_config.workflow.name if task_config.workflow else 'Unknown'}
+# Model: {task_config.llm.model if task_config.llm else 'Unknown'}
+# Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+# Task ID: {task.task_id}
+# Success: {not (isinstance(result, str) and result.startswith("[ERROR]"))}
+
+'''
+                model_code_filepath.write_text(header + best_node.code)
+                logger.info(f"💾 Saved model training code to workspace: {model_code_filepath}")
+            except Exception as e:
+                logger.warning(f"Failed to save model training code to code_history: {e}")
+
+        # Save Evaluation Result to a CSV in workspace
+        if isinstance(result, (float, int, str)) and not str(result).startswith("[ERROR]"):
+            try:
+                res_file = workspace_service.get_path("run_dir") / "evaluation_result.csv"
+                with open(res_file, "w") as f:
+                    f.write("task_id,score,cost,duration\n")
+                    f.write(f"{task.task_id},{result},{total_cost},{duration_seconds}\n")
+            except Exception:
+                pass
+
+        detail_files = {}
+        if llm_calls:
+            llm_calls_path = f"{telemetry_dir}/llm_calls.jsonl"
+            self._write_jsonl(workspace_service, llm_calls_path, llm_calls)
+            detail_files["llm_calls"] = f"artifacts/{llm_calls_path}"
+        if sandbox_runs:
+            sandbox_runs_path = f"{telemetry_dir}/sandbox_runs.jsonl"
+            self._write_jsonl(workspace_service, sandbox_runs_path, sandbox_runs)
+            detail_files["sandbox_runs"] = f"artifacts/{sandbox_runs_path}"
+        if search_tree_data:
+            search_tree_path = f"{telemetry_dir}/search_tree.json"
+            workspace_service.write_file(
+                json.dumps(search_tree_data, ensure_ascii=False, indent=2),
+                "artifacts",
+                search_tree_path
+            )
+            detail_files["search_tree"] = f"artifacts/{search_tree_path}"
+            metadata["search_tree"] = {
+                "node_count": len(search_tree_data),
+                "best_node_id": search_tree_info.get("best_node_id"),
+                "best_path_node_ids": search_tree_info.get("best_path"),
+                "file": f"artifacts/{search_tree_path}",
+            }
+        else:
+            metadata["search_tree"] = None
+
+        if detail_files:
+            metadata["detail_files"] = detail_files
+
+        run_metadata_path = f"{telemetry_dir}/run_metadata.json"
+        workspace_service.write_file(
+            json.dumps(metadata, ensure_ascii=False, indent=2),
+            "artifacts",
+            run_metadata_path
+        )
+
+        metadata_file = workspace_service.get_path("artifacts") / run_metadata_path
+        record_entry = {
+            "task_id": task.task_id,
+            "metadata_path": str(metadata_file),
+            "workspace_dir": metadata["workspace_dir"],
+            "summary": metadata["summary"],
+            "timeline": metadata["timeline"],
+            "parameters": metadata["parameters"],
+            "detail_files": metadata.get("detail_files"),
+        }
+        self.run_records.append(record_entry)
+
+    def _write_jsonl(self, workspace_service, relative_path: str, records: List[Dict[str, Any]]) -> None:
+        """Write newline-delimited JSON records to an artifacts sub-path."""
+        content = "\n".join(json.dumps(record, ensure_ascii=False) for record in records)
+        workspace_service.write_file(content, "artifacts", relative_path)
+
+    def _format_result(self, result: Any) -> Any:
+        """Return a serialization-friendly representation of the workflow result."""
+        if isinstance(result, Path):
+            return str(result)
+        return result
+
+    def _get_best_node(self, workflow: Optional[DSATWorkflow]):
+        if not workflow or not hasattr(workflow, "state"):
+            return None
+        state = workflow.state
+        if isinstance(state, JournalState):
+            return state.get_best_node()
+        return None
+
+    def _extract_search_tree(self, workflow: Optional[DSATWorkflow], best_node: Optional[Any]):
+        if not workflow or not hasattr(workflow, "state"):
+            return None, {"best_node_id": None, "best_path": None}
+        state = workflow.state
+        if not isinstance(state, JournalState):
+            return None, {"best_node_id": None, "best_path": None}
+
+        nodes = [
+            node.model_dump(mode="json")
+            for node in sorted(state.nodes.values(), key=lambda n: n.step)
+        ]
+        best_path = self._extract_best_path(state, best_node)
+        info = {
+            "best_node_id": best_node.id if best_node else None,
+            "best_path": best_path,
+        }
+        return nodes, info
+
+    def _extract_best_path(self, state: JournalState, best_node: Optional[Any]) -> Optional[List[str]]:
+        if not best_node:
+            return None
+        path: List[str] = []
+        current = best_node
+        while current:
+            path.append(current.id)
+            current = state.get_node(current.parent_id) if current.parent_id else None
+        return list(reversed(path))
+
+    def _build_benchmark_snapshot(self) -> Optional[Dict[str, Any]]:
+        if not self.benchmark:
+            return None
+        snapshot: Dict[str, Any] = {"name": getattr(self.benchmark, "name", None)}
+        data_dir = getattr(self.benchmark, "data_dir", None)
+        if data_dir is not None:
+            snapshot["data_dir"] = str(data_dir)
+        config_value = getattr(self.benchmark, "config", None)
+        if isinstance(config_value, dict):
+            snapshot["config"] = config_value
+        return snapshot
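For orientation, the sketch below shows how a benchmark driver might consume the runner added in this diff. It relies only on names visible above (DSATRunner, get_eval_function, get_run_records, the benchmark attribute, TaskDefinition); the constructor arguments for DSATConfig and TaskDefinition are illustrative assumptions, not documented API.

# Hypothetical driver sketch, not part of the published package.
import asyncio

from dsat.config import DSATConfig
from dsat.models.task import TaskDefinition
from dsat.runner import DSATRunner


async def main() -> None:
    # Assumption: DSATConfig is default-constructible; real configs select the
    # workflow name (from WORKFLOW_FACTORIES), run settings, and LLM settings.
    config = DSATConfig()
    runner = DSATRunner(config)

    # Optionally attach a benchmark object exposing grade() / set_mode(),
    # which eval_function probes via hasattr() before use.
    runner.benchmark = None

    # Assumption: TaskDefinition accepts these fields; the runner reads
    # task.task_id, task.task_type, and task.payload.
    task = TaskDefinition(task_id="demo_task", task_type="kaggle", payload={})

    eval_fn = runner.get_eval_function()
    result, total_cost, usage_summary = await eval_fn(task)
    print(result, total_cost, usage_summary)

    # Per-task telemetry records mirror the run_metadata.json written under
    # the workspace's artifacts/telemetry directory.
    print(runner.get_run_records())


if __name__ == "__main__":
    asyncio.run(main())

The tuple returned by eval_fn follows the (result, cost, usage_summary) contract stated in the get_eval_function docstring above.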