dslighting-1.3.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. dsat/__init__.py +3 -0
  2. dsat/benchmark/__init__.py +1 -0
  3. dsat/benchmark/benchmark.py +168 -0
  4. dsat/benchmark/datasci.py +291 -0
  5. dsat/benchmark/mle.py +777 -0
  6. dsat/benchmark/sciencebench.py +304 -0
  7. dsat/common/__init__.py +0 -0
  8. dsat/common/constants.py +11 -0
  9. dsat/common/exceptions.py +48 -0
  10. dsat/common/typing.py +19 -0
  11. dsat/config.py +79 -0
  12. dsat/models/__init__.py +3 -0
  13. dsat/models/candidates.py +16 -0
  14. dsat/models/formats.py +52 -0
  15. dsat/models/task.py +64 -0
  16. dsat/operators/__init__.py +0 -0
  17. dsat/operators/aflow_ops.py +90 -0
  18. dsat/operators/autokaggle_ops.py +170 -0
  19. dsat/operators/automind_ops.py +38 -0
  20. dsat/operators/base.py +22 -0
  21. dsat/operators/code.py +45 -0
  22. dsat/operators/dsagent_ops.py +123 -0
  23. dsat/operators/llm_basic.py +84 -0
  24. dsat/prompts/__init__.py +0 -0
  25. dsat/prompts/aflow_prompt.py +76 -0
  26. dsat/prompts/aide_prompt.py +52 -0
  27. dsat/prompts/autokaggle_prompt.py +290 -0
  28. dsat/prompts/automind_prompt.py +29 -0
  29. dsat/prompts/common.py +51 -0
  30. dsat/prompts/data_interpreter_prompt.py +82 -0
  31. dsat/prompts/dsagent_prompt.py +88 -0
  32. dsat/runner.py +554 -0
  33. dsat/services/__init__.py +0 -0
  34. dsat/services/data_analyzer.py +387 -0
  35. dsat/services/llm.py +486 -0
  36. dsat/services/llm_single.py +421 -0
  37. dsat/services/sandbox.py +386 -0
  38. dsat/services/states/__init__.py +0 -0
  39. dsat/services/states/autokaggle_state.py +43 -0
  40. dsat/services/states/base.py +14 -0
  41. dsat/services/states/dsa_log.py +13 -0
  42. dsat/services/states/experience.py +237 -0
  43. dsat/services/states/journal.py +153 -0
  44. dsat/services/states/operator_library.py +290 -0
  45. dsat/services/vdb.py +76 -0
  46. dsat/services/workspace.py +178 -0
  47. dsat/tasks/__init__.py +3 -0
  48. dsat/tasks/handlers.py +376 -0
  49. dsat/templates/open_ended/grade_template.py +107 -0
  50. dsat/tools/__init__.py +4 -0
  51. dsat/utils/__init__.py +0 -0
  52. dsat/utils/context.py +172 -0
  53. dsat/utils/dynamic_import.py +71 -0
  54. dsat/utils/parsing.py +33 -0
  55. dsat/workflows/__init__.py +12 -0
  56. dsat/workflows/base.py +53 -0
  57. dsat/workflows/factory.py +439 -0
  58. dsat/workflows/manual/__init__.py +0 -0
  59. dsat/workflows/manual/autokaggle_workflow.py +148 -0
  60. dsat/workflows/manual/data_interpreter_workflow.py +153 -0
  61. dsat/workflows/manual/deepanalyze_workflow.py +484 -0
  62. dsat/workflows/manual/dsagent_workflow.py +76 -0
  63. dsat/workflows/search/__init__.py +0 -0
  64. dsat/workflows/search/aflow_workflow.py +344 -0
  65. dsat/workflows/search/aide_workflow.py +283 -0
  66. dsat/workflows/search/automind_workflow.py +237 -0
  67. dsat/workflows/templates/__init__.py +0 -0
  68. dsat/workflows/templates/basic_kaggle_loop.py +71 -0
  69. dslighting/__init__.py +170 -0
  70. dslighting/core/__init__.py +13 -0
  71. dslighting/core/agent.py +646 -0
  72. dslighting/core/config_builder.py +318 -0
  73. dslighting/core/data_loader.py +422 -0
  74. dslighting/core/task_detector.py +422 -0
  75. dslighting/utils/__init__.py +19 -0
  76. dslighting/utils/defaults.py +151 -0
  77. dslighting-1.3.9.dist-info/METADATA +554 -0
  78. dslighting-1.3.9.dist-info/RECORD +80 -0
  79. dslighting-1.3.9.dist-info/WHEEL +5 -0
  80. dslighting-1.3.9.dist-info/top_level.txt +2 -0
dsat/services/vdb.py ADDED
@@ -0,0 +1,76 @@
+ """
+ Service for managing an in-memory vector database for case-based reasoning.
+ """
+ import logging
+ from pathlib import Path
+ from typing import List
+
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ logger = logging.getLogger(__name__)
+
+ class VDBService:
+     """
+     Manages embedding and retrieving text documents (cases) for retrieval-augmented generation.
+     """
+     def __init__(self, case_dir: str, model_name: str = "BAAI/llm-embedder"):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name).to(self.device).eval()
+         self.case_files: List[Path] = []
+         self.embedding_bank: torch.Tensor = None
+         self._build_index(Path(case_dir))
+
+     def _build_index(self, case_dir: Path):
+         """Loads cases from a directory and builds the vector index."""
+         logger.info(f"Building vector index from cases in: {case_dir}")
+         if not case_dir.exists():
+             logger.warning(f"Case directory not found: {case_dir}. Creating empty index.")
+             return
+
+         self.case_files = sorted(list(case_dir.glob("*.py")))
+         case_texts = []
+         for file_path in self.case_files:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 case_texts.append(f.read())
+
+         if not case_texts:
+             logger.warning("No case files found to build index.")
+             return
+
+         with torch.no_grad():
+             inputs = self.tokenizer(case_texts, padding=True, truncation=True, return_tensors='pt', max_length=512).to(self.device)
+             outputs = self.model(**inputs)
+             # Use CLS pooling
+             embeddings = outputs.last_hidden_state[:, 0]
+             self.embedding_bank = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+         logger.info(f"Successfully built index with {len(self.case_files)} cases.")
+
+     def retrieve(self, query: str, top_k: int) -> List[str]:
+         """Retrieves the top_k most similar case texts for a given query."""
+         if self.embedding_bank is None:
+             return []
+
+         with torch.no_grad():
+             inputs = self.tokenizer([query], padding=True, truncation=True, return_tensors='pt', max_length=512).to(self.device)
+             query_embedding = self.model(**inputs).last_hidden_state[:, 0]
+             query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
+
+         similarity = (query_embedding @ self.embedding_bank.T).squeeze()
+         _, indices = torch.topk(similarity, min(top_k, len(self.case_files)))
+
+         retrieved_cases = []
+         for idx in indices.tolist():
+             with open(self.case_files[idx], "r", encoding="utf-8") as f:
+                 retrieved_cases.append(f.read())
+         return retrieved_cases
+
+     async def store_documents(self, documents: list):
+         """Store documents in vector database."""
+         # This method is kept for compatibility but not used in DS-Agent workflow
+         pass
+
+     async def search(self, query: str, top_k: int = 5):
+         """Search for similar documents."""
+         return self.retrieve(query, top_k)
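For orientation, a minimal usage sketch of `VDBService` as published above. The case directory and query text are hypothetical, and the sketch assumes the `BAAI/llm-embedder` weights can be fetched locally; it is not part of the package.

```python
# Sketch (hypothetical paths): embed a folder of .py "case" files,
# then retrieve the two most similar cases for a query string.
from dsat.services.vdb import VDBService

vdb = VDBService(case_dir="cases/")  # indexes every cases/*.py file at construction
cases = vdb.retrieve("tune a gradient boosting model on tabular data", top_k=2)
for text in cases:
    print(text[:200])  # preview each retrieved case
```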
dsat/services/workspace.py ADDED
@@ -0,0 +1,178 @@
+ """
+ Service for managing the run-specific workspace, including data, logs, and artifacts.
+ """
+ import logging
+ import shutil
+ import uuid
+ import os
+ import contextlib
+ from pathlib import Path
+ from typing import Dict, Optional
+
+ from dsat.common.constants import DEFAULT_WORKSPACE_DIR
+
+ logger = logging.getLogger(__name__)
+
+ class WorkspaceService:
+     """
+     Manages the file system for a single, isolated agent run.
+     It creates a directory based on the provided unique run name and provides structured access to it.
+     The responsibility for generating unique run names is delegated to the DSATRunner.
+     """
+     def __init__(self, run_name: str, base_dir: str = None):
+         """
+         Initializes the workspace for a new run.
+
+         Args:
+             run_name (str): A descriptive and unique name for the run, provided by the runner.
+             base_dir (str, optional): The base directory where all run folders will be stored.
+                 If None, uses DEFAULT_WORKSPACE_DIR from constants.
+         """
+
+         # Use the constant if base_dir is not provided
+         if base_dir is None:
+             base_dir = DEFAULT_WORKSPACE_DIR
+
+         if not Path(base_dir).is_absolute():
+             # Use Path.cwd() as the base for relative paths
+             base_dir_path = (Path.cwd() / base_dir).resolve()
+         else:
+             base_dir_path = Path(base_dir).resolve()
+
+         base_dir_path.mkdir(parents=True, exist_ok=True)
+
+         self.run_dir = base_dir_path / run_name
+         self.sandbox_workdir = self.run_dir / "sandbox"
+
+         self.paths: Dict[str, Path] = {
+             "run_dir": self.run_dir,
+             "sandbox_workdir": self.sandbox_workdir,
+             "config": self.run_dir / "config.yaml",
+             "workflow": self.run_dir / "workflow.py",
+             "logs": self.run_dir / "logs",
+             "state": self.run_dir / "state",
+             "candidates": self.run_dir / "candidates",
+             "artifacts": self.run_dir / "artifacts",
+             "results": self.run_dir / "results.json",
+         }
+
+         self._create_directories()
+         logger.info(f"Workspace initialized at: {self.run_dir.resolve()}. Sandbox Workdir: {self.sandbox_workdir.resolve()}")
+
+     def _create_directories(self):
+         """Creates the full directory structure for the run."""
+         for path in self.paths.values():
+             if not path.suffix:  # No suffix means the entry is a directory
+                 path.mkdir(parents=True, exist_ok=True)
+
+     def get_path(self, name: str) -> Path:
+         """
+         Retrieves a managed path from the workspace.
+
+         Args:
+             name (str): The key of the path to retrieve (e.g., 'logs', 'artifacts').
+
+         Returns:
+             Path: The absolute Path object for the requested resource.
+
+         Raises:
+             KeyError: If the requested path name is not defined.
+         """
+         if name == 'sandbox_cwd':
+             logger.warning("Accessing deprecated 'sandbox_cwd'. Use 'sandbox_workdir' instead.")
+             name = 'sandbox_workdir'
+
+         if name not in self.paths:
+             raise KeyError(f"Path '{name}' is not a defined workspace path.")
+         return self.paths[name]
+
+     def write_file(self, content: str, path_name: str, sub_path: str = None):
+         """
+         Writes content to a file within a managed directory.
+
+         Args:
+             content (str): The string content to write.
+             path_name (str): The key of the managed directory (e.g., 'logs').
+             sub_path (str, optional): A filename or relative path within the managed directory.
+         """
+         target_dir = self.get_path(path_name)
+         file_path = target_dir / sub_path if sub_path else target_dir
+
+         # Ensure the parent directory of the file exists if sub_path contains folders
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(file_path, "w", encoding="utf-8") as f:
+             f.write(content)
+         logger.debug(f"Wrote {len(content)} bytes to {file_path}")
+
+     def link_data_to_workspace(self, source_data_dir: Path):
+         """
+         Links or copies the CONTENTS of a source data directory into the run's sandbox_workdir.
+         This ensures the agent runs in an isolated environment containing all inputs.
+         """
+         # Use sandbox_workdir as the destination
+         destination_dir = self.get_path("sandbox_workdir")
+
+         src = source_data_dir.resolve()
+         if not src.exists() or not src.is_dir():
+             raise FileNotFoundError(f"Source data directory not found: {src}")
+
+         for item in src.iterdir():
+             source_item = item
+             destination_item = destination_dir / item.name
+
+             # If the destination item already exists, skip it (idempotent behavior)
+             if destination_item.exists() or destination_item.is_symlink():
+                 continue
+
+             # Try to create a symlink for the item
+             try:
+                 # Determine if the target is a directory for Windows compatibility
+                 target_is_directory = source_item.is_dir()
+                 os.symlink(source_item, destination_item, target_is_directory=target_is_directory)
+                 logger.debug(f"Linked {source_item.name} into sandbox.")
+
+             except (OSError, NotImplementedError) as e:
+                 # Symlink not permitted. Fall back to copying.
+                 warning_message = (
+                     f"Symlink creation failed for {item.name} ({e}). Falling back to copying."
+                 )
+                 if os.name == 'nt':
+                     warning_message += " On Windows, enable 'Developer Mode' or run as administrator for symlinks."
+
+                 logger.warning(warning_message)
+
+                 try:
+                     if source_item.resolve() == destination_item.resolve():
+                         logger.warning(
+                             f"Skipping copy of {source_item.name} because source and destination "
+                             "resolve to the same file. This may indicate a workspace configuration issue."
+                         )
+                         continue
+
+                     if source_item.is_dir():
+                         shutil.copytree(source_item, destination_item)
+                     else:
+                         shutil.copy2(source_item, destination_item)
+                     logger.debug(f"Copied {source_item.name} into sandbox.")
+                 except Exception as copy_e:
+                     logger.error(f"Failed to copy item {item.name}: {copy_e}", exc_info=True)
+                     raise
+
+         logger.info(f"Data from {src} successfully populated into {destination_dir}")
+
+     def cleanup(self, keep_workspace: bool = False):
+         """
+         Removes the entire run directory unless explicitly told to keep it.
+         """
+         if keep_workspace:
+             logger.info(f"Workspace preserved as requested: {self.run_dir.resolve()}")
+             return
+
+         logger.info(f"Cleaning up workspace: {self.run_dir.resolve()}")
+         try:
+             if self.run_dir.exists():
+                 shutil.rmtree(self.run_dir)
+         except Exception as e:
+             logger.error(f"Failed to clean up workspace {self.run_dir}: {e}")
dsat/tasks/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # dsat/tasks/__init__.py
+
+ # This file makes the 'tasks' directory a Python package.
dsat/tasks/handlers.py ADDED
@@ -0,0 +1,376 @@
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ import tempfile
+ import logging
+ from typing import Tuple, Any
+ from dsat.models.task import TaskDefinition
+ from dsat.services.data_analyzer import DataAnalyzer
+
+ logger = logging.getLogger(__name__)
+
+
+ class TaskHandler(ABC):
+     """
+     Base class for handlers that translate between logical TaskDefinition and physical file interfaces required by DSATWorkflow.
+
+     Each handler encapsulates preparation and parsing logic for specific task types (e.g., Kaggle, QA),
+     allowing the workflow itself to remain task-agnostic.
+     """
+     def __init__(self):
+         """
+         Initialize the handler and create a temporary, self-managed directory
+         for storing physical files generated for the task.
+         """
+         try:
+             self.temp_dir = tempfile.TemporaryDirectory()
+         except Exception as e:
+             logger.error(f"Failed to create temporary directory for TaskHandler: {e}")
+             self.temp_dir = None
+
+         self.analyzer = DataAnalyzer()
+
+     @abstractmethod
+     def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
+         """
+         Prepare the physical file input required by the workflow.
+
+         This method converts logical tasks into physical parameters needed by DSATWorkflow.solve().
+
+         Args:
+             task: Logical task definition.
+
+         Returns:
+             A tuple (description, io_instructions, data_dir, output_path) to pass to workflow.solve().
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def parse_output(self, output_path: Path) -> Any:
+         """
+         Parse the workflow's output file into structured results required by benchmarking.
+
+         This method converts physical output files back into logical answers.
+
+         Args:
+             output_path: Path where the workflow saved its output.
+
+         Returns:
+             Final answer in the format expected by benchmarking (e.g., string for QA, Path object for Kaggle).
+         """
+         raise NotImplementedError
+
+     def cleanup(self):
+         """
+         Explicitly clean up the temporary directory.
+         """
+         if self.temp_dir:
+             try:
+                 self.temp_dir.cleanup()
+                 logger.debug(f"Successfully cleaned up temporary directory for {self.__class__.__name__}.")
+             except Exception as e:
+                 logger.error(f"Error cleaning up temporary directory for {self.__class__.__name__}: {e}")
+
+     def __del__(self):
+         """Ensure cleanup is called when the object is garbage collected."""
+         self.cleanup()
+
+
+ class KaggleTaskHandler(TaskHandler):
+     """
+     Handler for Kaggle-style file input/file output tasks.
+     This is a "pass-through" implementation since tasks are already file-based.
+     """
+     def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
+         """Extract paths, analyze data, and separate description from I/O instructions."""
+         if task.task_type != "kaggle":
+             raise ValueError("KaggleTaskHandler can only handle tasks of type 'kaggle'.")
+
+         description = task.payload.get("description")
+         data_dir = Path(task.payload.get("public_data_dir"))
+         output_path = Path(task.payload.get("output_submission_path"))
+
+         if not all([description, data_dir, output_path]):
+             raise ValueError("Kaggle task payload is missing required keys: 'description', 'public_data_dir', 'output_submission_path'.")
+         if not data_dir.exists() or not data_dir.is_dir():
+             raise FileNotFoundError(f"Kaggle public_data_dir not found: {data_dir}")
+
+         logger.info(f"Analyzing input data for task '{task.task_id}'...")
+
+         data_report = self.analyzer.analyze_data(data_dir, task_type="kaggle")
+         io_instructions = self.analyzer.generate_io_instructions(output_path.name, optimization_context=False)
+
+         augmented_description = f"{description}\n{data_report}"
+
+         logger.debug(f"Preparing Kaggle task '{task.task_id}': data_dir='{data_dir}', output_path='{output_path}'")
+         return augmented_description, io_instructions, data_dir, output_path
+
+     def parse_output(self, output_path: Path) -> Path:
+         """
+         For Kaggle tasks, the result is the output file itself.
+         This just validates that the file was created.
+         """
+         if not output_path.exists():
+             # In actual evaluation, this will be caught and reported as a failure.
+             logger.warning(f"Agent did not produce the required submission file at: {output_path}")
+         # Return the path even if it doesn't exist; let the caller (e.g., benchmark) handle the file-not-found case.
+         return output_path
+
+
+ class QATaskHandler(TaskHandler):
+     """
+     Handler for simple question-answer (QA) tasks.
+     This is a "translation" implementation that converts string questions to files and expects answers as files.
+     """
+     def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
+         """Convert QA question to physical file input."""
+         if task.task_type != "qa":
+             raise ValueError("QATaskHandler can only handle tasks of type 'qa'.")
+         if not self.temp_dir:
+             raise RuntimeError("Temporary directory not available for QATaskHandler.")
+
+         question = task.payload.get("question")
+         if not question:
+             raise ValueError("QA task payload is missing required key: 'question'.")
+
+         data_dir = Path(self.temp_dir.name)
+
+         # Create physical task representation
+         problem_file = data_dir / "problem.txt"
+         problem_file.write_text(question, encoding='utf-8')
+
+         # Define output contract
+         output_path = data_dir / "answer.txt"
+
+         # This core instruction is now simpler
+         core_instruction = (
+             "Your task is to answer the question found in `problem.txt`. "
+             "Write ONLY the final answer into the required output file."
+         )
+
+         data_report = self.analyzer.analyze_data(data_dir, task_type="qa")
+         io_instructions = self.analyzer.generate_io_instructions(output_path.name, optimization_context=False)
+
+         description = f"{core_instruction}\n{data_report}"
+
+         logger.debug(f"Preparing QA task '{task.task_id}': input file='{problem_file}', expected output='{output_path}'")
+         return description, io_instructions, data_dir, output_path
+
+     def parse_output(self, output_path: Path) -> str:
+         """Read and return the final answer string from the output file."""
+         if not output_path.exists() or not output_path.is_file():
+             logger.warning(f"Agent did not produce the answer file for QA task at: {output_path}")
+             return "[ERROR] Agent did not produce an answer file."
+
+         try:
+             answer = output_path.read_text(encoding='utf-8').strip()
+             logger.debug(f"Parsed QA answer from '{output_path}': '{answer[:50]}...'")
+             return answer
+         except Exception as e:
+             logger.error(f"Failed to read or parse QA answer file '{output_path}': {e}")
+             return f"[ERROR] Failed to parse answer file: {e}"
+
+
+ class DataSciTaskHandler(TaskHandler):
+     """
+     Handler for DataSciBench tasks.
+     These are multi-step data science tasks with prompts and optional input files.
+     """
+     def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
+         """Prepare DataSciBench task input."""
+         if task.task_type != "datasci":
+             raise ValueError("DataSciTaskHandler can only handle tasks of type 'datasci'.")
+
+         prompt = task.payload.get("prompt", "")
+         input_dir = task.payload.get("input_dir", "")
+         output_dir = task.payload.get("output_dir", "")
+
+         if not prompt:
+             raise ValueError("DataSci task payload is missing required key: 'prompt'.")
+
+         # Use input_dir as data_dir, or temp_dir if no input files
+         if input_dir and Path(input_dir).exists():
+             data_dir = Path(input_dir)
+         elif self.temp_dir:
+             data_dir = Path(self.temp_dir.name)
+         else:
+             raise RuntimeError("No data directory available for DataSciTaskHandler.")
+
+         # Output directory
+         if output_dir:
+             output_path = Path(output_dir) / "output.csv"
+         else:
+             output_path = data_dir / "output.csv"
+
+         # Build description with the prompt
+         description = prompt
+
+         # Analyze data if available
+         try:
+             data_report = self.analyzer.analyze_data(data_dir, task_type="datasci")
+             description = f"{prompt}\n\n{data_report}"
+         except Exception as e:
+             logger.debug(f"Data analysis skipped: {e}")
+
+         # Generate I/O instructions
+         io_instructions = (
+             f"All input data files are in the current working directory.\n"
+             f"Save all output files to the current working directory.\n"
+             f"Follow the task instructions carefully and generate the required output files."
+         )
+
+         logger.debug(f"Preparing DataSci task '{task.task_id}': data_dir='{data_dir}', output_dir='{output_dir}'")
+         return description, io_instructions, data_dir, output_path
+
+     def parse_output(self, output_path: Path) -> Path:
+         """
+         For DataSci tasks, return the output directory path.
+         The actual evaluation is done by the benchmark class using metric.yaml.
+         """
+         if output_path.parent.exists():
+             return output_path.parent
+         return output_path
+
+
+ class OpenEndedTaskHandler(TaskHandler):
+     """
+     Handler for open-ended tasks (mathematical modeling, simulations, strategy tasks).
+     These tasks don't have ground truth answers and are evaluated via LLM judges.
+     """
+     def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
+         """Prepare open-ended task input."""
+         if task.task_type != "open_ended":
+             raise ValueError("OpenEndedTaskHandler can only handle tasks of type 'open_ended'.")
+         if not self.temp_dir:
+             raise RuntimeError("Temporary directory not available for OpenEndedTaskHandler.")
+
+         # Get task paths from payload
+         raw_dir_str = task.payload.get("raw_data_dir", "")
+         description_file = task.payload.get("description_file", "")
+         rubric_file = task.payload.get("rubric_file", "")
+
+         # Use temp directory as working directory
+         data_dir = Path(self.temp_dir.name)
+
+         # Copy ONLY data files (CSV, JSON, etc.) - exclude description and rubric files
+         if raw_dir_str:
+             raw_dir = Path(raw_dir_str)
+             if raw_dir.exists():
+                 import shutil
+                 for file in raw_dir.iterdir():
+                     if file.is_file() and file.suffix in ['.csv', '.json', '.txt', '.xlsx', '.parquet']:
+                         # Exclude description.md and rubric.md from being treated as data files
+                         if file.name not in ['description.md', 'rubric.md']:
+                             shutil.copy2(file, data_dir / file.name)
+                             logger.debug(f"Copied data file: {file.name}")
+
+         # Read task description and rubric from files if provided
+         description = task.payload.get("description", "")
+         rubric = task.payload.get("rubric", "")
+
+         # Read from files if specified
+         if description_file and Path(description_file).exists():
+             try:
+                 description = Path(description_file).read_text(encoding='utf-8')
+                 logger.debug(f"Read description from {description_file} ({len(description)} chars)")
+             except Exception as e:
+                 logger.warning(f"Failed to read description file {description_file}: {e}")
+
+         if rubric_file and Path(rubric_file).exists():
+             try:
+                 rubric = Path(rubric_file).read_text(encoding='utf-8')
+                 logger.debug(f"Read rubric from {rubric_file} ({len(rubric)} chars)")
+             except Exception as e:
+                 logger.warning(f"Failed to read rubric file {rubric_file}: {e}")
+
+         if not description:
+             raise ValueError("Open-ended task payload is missing required key: 'description'.")
+
+         # Output path - agent should create an artifacts directory or report
+         output_path = data_dir / "artifacts"
+
+         # Build the FULL task description directly in the prompt
+         # Include description and evaluation criteria
+         # IMPORTANT: For open-ended tasks, explicitly require artifacts directory creation
+
+         task_description_section = f"""## Task Description
+
+ {description}
+ """
+
+         if rubric:
+             task_description_section += f"""
+
+ ## Evaluation Criteria
+
+ {rubric}
+ """
+
+         # Analyze available data files to provide schema information (excluding task files)
+         data_report = self.analyzer.analyze_data(data_dir, task_type="datasci")
+
+         # Combine everything into the full description
+         # Note: data_report already contains "--- COMPREHENSIVE DATA REPORT ---" header
+         full_description = f"""{task_description_section}
+
+ {data_report}
+
+ ## CRITICAL OUTPUT INSTRUCTIONS
+
+ **YOU MUST CREATE AN `artifacts/` DIRECTORY AND SAVE ALL OUTPUTS THERE:**
+
+ ```python
+ import os
+ artifact_dir = 'artifacts'
+ os.makedirs(artifact_dir, exist_ok=True)
+
+ # Save all your work to the artifacts directory:
+ # - Analysis code: artifacts/analysis.py
+ # - Visualizations: artifacts/plot_*.png
+ # - Data files: artifacts/results.csv
+ # - Models, notebooks, etc.
+ ```
+
+ ## Task Goals
+ - Your goal is to complete this task to the best of your ability
+ - Create appropriate output files (code, analysis, visualizations, etc.) in the `artifacts/` subdirectory
+ - The evaluation will be based on the quality and completeness of your work according to the evaluation criteria
+ """
+
+         # Generate I/O instructions - VERY EXPLICIT for open-ended tasks
+         io_instructions = f"""**OUTPUT DIRECTORY STRUCTURE (MANDATORY):**
+
+ ```python
+ # At the START of your code, create the artifacts directory:
+ import os
+ artifact_dir = 'artifacts'
+ os.makedirs(artifact_dir, exist_ok=True)
+
+ # Save ALL outputs to this directory:
+ # - Code: f"{{artifact_dir}}/solution.py"
+ # - Plots: f"{{artifact_dir}}/visualization_{{i}}.png"
+ # - Data: f"{{artifact_dir}}/results.csv"
+ ```
+
+ **REQUIREMENTS:**
+ 1. Create the `artifacts/` directory at the beginning of your code
+ 2. Save ALL generated files (plots, models, data, code) to this directory
+ 3. Do NOT save files to the current directory - use the artifacts/ subdirectory
+ 4. Focus on quality, completeness, and following the evaluation criteria
+ """
+
+         logger.debug(f"Preparing open-ended task '{task.task_id}': output_path='{output_path}', description_len={len(description)}")
+         return full_description, io_instructions, data_dir, output_path
+
+     def parse_output(self, output_path: Path) -> Path:
+         """
+         For open-ended tasks, return the artifacts directory path.
+         The actual evaluation is done by LLM judges, not CSV grading.
+         """
+         if not output_path.exists():
+             # If artifacts directory doesn't exist, return the parent temp dir
+             # This allows evaluation to proceed even if no artifacts were created
+             logger.warning(f"Open-ended task did not create artifacts directory at: {output_path}")
+             return output_path.parent
+
+         logger.debug(f"Parsed open-ended task artifacts from: {output_path}")
+         return output_path
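A minimal sketch of the handler contract, using `QATaskHandler` from above. It assumes `TaskDefinition` (defined in dsat/models/task.py, which is not shown in this diff) can be constructed from the `task_id`, `task_type`, and `payload` fields the handlers read; the question text is hypothetical.

```python
# Sketch (assumed TaskDefinition constructor, hypothetical task data):
# prepare inputs for a workflow, then parse whatever it wrote back.
from dsat.models.task import TaskDefinition
from dsat.tasks.handlers import QATaskHandler

task = TaskDefinition(
    task_id="qa-001",
    task_type="qa",
    payload={"question": "Which column has the most missing values?"},
)

handler = QATaskHandler()
description, io_instructions, data_dir, output_path = handler.prepare_input(task)
# ... a DSATWorkflow would run here and write its answer to output_path ...
answer = handler.parse_output(output_path)  # "[ERROR] ..." if no answer file was written
handler.cleanup()
```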