dslighting 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
dsat/services/vdb.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Service for managing an in-memory vector database for case-based reasoning.
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
from transformers import AutoModel, AutoTokenizer
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
class VDBService:
    """
    Manages embedding and retrieving text documents (cases) for retrieval-augmented generation.

    On construction every ``*.py`` file found in ``case_dir`` is read, embedded
    with the configured Hugging Face model (first-token pooling, L2-normalised)
    and stored in ``embedding_bank``. Queries are embedded the same way and
    ranked by dot product against that bank.
    """

    def __init__(self, case_dir: str, model_name: str = "BAAI/llm-embedder"):
        """
        Load the embedding model and index the cases in ``case_dir``.

        Args:
            case_dir: Directory containing the ``*.py`` case files.
            model_name: Name or path handed to ``from_pretrained`` for both
                the tokenizer and the model.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device).eval()
        self.case_files: List[Path] = []
        # Stays None until at least one case has been embedded.
        self.embedding_bank: torch.Tensor = None
        self._build_index(Path(case_dir))

    def _embed(self, texts: List[str]) -> torch.Tensor:
        """
        Embed ``texts`` in a single batch.

        Returns:
            A 2-D tensor with one L2-normalised row per input text, taken
            from the first token of the model's last hidden state.
        """
        with torch.no_grad():
            inputs = self.tokenizer(
                texts, padding=True, truncation=True, return_tensors='pt', max_length=512
            ).to(self.device)
            outputs = self.model(**inputs)
            # Use CLS pooling
            embeddings = outputs.last_hidden_state[:, 0]
            return torch.nn.functional.normalize(embeddings, p=2, dim=1)

    def _build_index(self, case_dir: Path):
        """Loads cases from a directory and builds the vector index."""
        logger.info(f"Building vector index from cases in: {case_dir}")
        if not case_dir.exists():
            logger.warning(f"Case directory not found: {case_dir}. Creating empty index.")
            return

        # Sorted so that row i of the embedding bank always maps to case_files[i].
        self.case_files = sorted(case_dir.glob("*.py"))
        case_texts = [path.read_text(encoding="utf-8") for path in self.case_files]

        if not case_texts:
            logger.warning("No case files found to build index.")
            return

        self.embedding_bank = self._embed(case_texts)
        logger.info(f"Successfully built index with {len(self.case_files)} cases.")

    def retrieve(self, query: str, top_k: int) -> List[str]:
        """
        Retrieves the top_k most similar case texts for a given query.

        Args:
            query: Free-text query to embed and compare against the cases.
            top_k: Maximum number of cases to return; it is capped at the
                number of indexed cases.

        Returns:
            The contents of the best-matching case files, most similar first.
            Empty when the index is empty or ``top_k`` is not positive.
        """
        if self.embedding_bank is None or top_k <= 0:
            return []

        query_embedding = self._embed([query])

        # (1, dim) @ (dim, n_cases) -> (1, n_cases). Only the query axis is
        # squeezed: a bare squeeze() would also collapse a single-case index to
        # a 0-dim tensor, whose tolist() is an int and cannot be iterated.
        similarity = (query_embedding @ self.embedding_bank.T).squeeze(0)
        k = min(top_k, similarity.shape[0])
        _, indices = torch.topk(similarity, k)

        return [
            self.case_files[idx].read_text(encoding="utf-8")
            for idx in indices.tolist()
        ]

    async def store_documents(self, documents: list):
        """Store documents in vector database."""
        # This method is kept for compatibility but not used in DS-Agent workflow
        pass

    async def search(self, query: str, top_k: int = 5):
        """Search for similar documents (async wrapper around ``retrieve``)."""
        return self.retrieve(query, top_k)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Service for managing the run-specific workspace, including data, logs, and artifacts.
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
import uuid
|
|
7
|
+
import os
|
|
8
|
+
import contextlib
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, Optional
|
|
11
|
+
|
|
12
|
+
from dsat.common.constants import DEFAULT_WORKSPACE_DIR
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
class WorkspaceService:
    """
    Manages the file system for a single, isolated agent run.
    It creates a directory based on the provided unique run name and provides structured access to it.
    The responsibility for generating unique run names is delegated to the DSATRunner.
    """

    def __init__(self, run_name: str, base_dir: Optional[str] = None):
        """
        Initializes the workspace for a new run.

        Args:
            run_name (str): A descriptive and unique name for the run, provided by the runner.
            base_dir (str, optional): The base directory where all run folders will be stored.
                                      If None, uses DEFAULT_WORKSPACE_DIR from constants.

        Raises:
            ValueError: If ``run_name`` resolves to the base directory itself or to
                one of its ancestors (e.g. ``""``, ``"."``, ``".."``).
        """
        # Use the constant if base_dir is not provided
        if base_dir is None:
            base_dir = DEFAULT_WORKSPACE_DIR

        if not Path(base_dir).is_absolute():
            # Use Path.cwd() as the base for relative paths
            base_dir_path = (Path.cwd() / base_dir).resolve()
        else:
            base_dir_path = Path(base_dir).resolve()

        run_dir = base_dir_path / run_name

        # cleanup() removes run_dir recursively, so a name that points at the
        # base directory (or above it) would wipe every other run as well.
        resolved_run_dir = run_dir.resolve()
        if resolved_run_dir == base_dir_path or resolved_run_dir in base_dir_path.parents:
            raise ValueError(
                f"Invalid run_name {run_name!r}: it must name a directory inside {base_dir_path}."
            )

        base_dir_path.mkdir(parents=True, exist_ok=True)

        self.run_dir = run_dir
        self.sandbox_workdir = self.run_dir / "sandbox"

        self.paths: Dict[str, Path] = {
            "run_dir": self.run_dir,
            "sandbox_workdir": self.sandbox_workdir,
            "config": self.run_dir / "config.yaml",
            "workflow": self.run_dir / "workflow.py",
            "logs": self.run_dir / "logs",
            "state": self.run_dir / "state",
            "candidates": self.run_dir / "candidates",
            "artifacts": self.run_dir / "artifacts",
            "results": self.run_dir / "results.json",
        }

        self._create_directories()
        logger.info(f"Workspace initialized at: {self.run_dir.resolve()}. Sandbox Workdir: {self.sandbox_workdir.resolve()}")

    def _create_directories(self):
        """Creates the full directory structure for the run."""
        for path in self.paths.values():
            # Entries without a file extension are treated as directories;
            # entries such as config.yaml are files and are created on write.
            if not path.suffix:
                path.mkdir(parents=True, exist_ok=True)

    def get_path(self, name: str) -> Path:
        """
        Retrieves a managed path from the workspace.

        Args:
            name (str): The key of the path to retrieve (e.g., 'logs', 'artifacts').

        Returns:
            Path: The absolute Path object for the requested resource.

        Raises:
            KeyError: If the requested path name is not defined.
        """
        if name == 'sandbox_cwd':
            logger.warning("Accessing deprecated 'sandbox_cwd'. Use 'sandbox_workdir' instead.")
            name = 'sandbox_workdir'

        if name not in self.paths:
            raise KeyError(f"Path '{name}' is not a defined workspace path.")
        return self.paths[name]

    def write_file(self, content: str, path_name: str, sub_path: Optional[str] = None):
        """
        Writes content to a file within a managed directory.

        Args:
            content (str): The string content to write.
            path_name (str): The key of the managed directory (e.g., 'logs').
            sub_path (str, optional): A filename or relative path within the managed directory.

        Raises:
            KeyError: If ``path_name`` is not a defined workspace path.
            IsADirectoryError: If the resolved target is an existing directory,
                e.g. a directory key was given without ``sub_path``.
        """
        target_dir = self.get_path(path_name)
        file_path = target_dir / sub_path if sub_path else target_dir

        if file_path.is_dir():
            raise IsADirectoryError(
                f"Cannot write to '{file_path}': it is a directory. "
                f"Provide sub_path to name a file inside '{path_name}'."
            )

        # Ensure parent directory of the file exists if sub_path contains folders
        file_path.parent.mkdir(parents=True, exist_ok=True)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        logger.debug(f"Wrote {len(content)} bytes to {file_path}")

    def link_data_to_workspace(self, source_data_dir: Path):
        """
        Links or copies the CONTENTS of a source data directory into the run's sandbox_workdir.
        This ensures the agent runs in an isolated environment containing all inputs.

        Each top-level entry is symlinked; when symlinks are not permitted the
        entry is copied instead. Entries that already exist in the sandbox are
        left untouched, so the call can be repeated.

        Args:
            source_data_dir (Path): Directory whose entries are exposed to the sandbox.

        Raises:
            FileNotFoundError: If ``source_data_dir`` does not exist or is not a directory.
            Exception: Whatever the copy fallback raised, after it has been logged.
        """
        # Use sandbox_workdir as the destination
        destination_dir = self.get_path("sandbox_workdir")

        src = source_data_dir.resolve()
        if not src.exists() or not src.is_dir():
            raise FileNotFoundError(f"Source data directory not found: {src}")

        for item in src.iterdir():
            source_item = item
            destination_item = destination_dir / item.name

            # If the destination item already exists, skip it (idempotent behavior).
            # is_symlink() also catches dangling links, for which exists() is False.
            if destination_item.exists() or destination_item.is_symlink():
                continue

            # Try to create a symlink for the item
            try:
                # Determine if the target is a directory for Windows compatibility
                target_is_directory = source_item.is_dir()
                os.symlink(source_item, destination_item, target_is_directory=target_is_directory)
                logger.debug(f"Linked {source_item.name} into sandbox.")

            except (OSError, NotImplementedError) as e:
                # Symlink not permitted. Fallback to copy.
                warning_message = (
                    f"Symlink creation failed for {item.name} ({e}). Falling back to copying. "
                )
                if os.name == 'nt':
                    warning_message += " On Windows, enable 'Developer Mode' or run as administrator for symlinks."

                logger.warning(warning_message)

                try:
                    if source_item.resolve() == destination_item.resolve():
                        logger.warning(
                            f"Skipping copy of {source_item.name} because source and destination "
                            "resolve to the same file. This may indicate a workspace configuration issue."
                        )
                        continue

                    if source_item.is_dir():
                        shutil.copytree(source_item, destination_item)
                    else:
                        shutil.copy2(source_item, destination_item)
                    logger.debug(f"Copied {source_item.name} into sandbox.")
                except Exception as copy_e:
                    logger.error(f"Failed to copy item {item.name}: {copy_e}", exc_info=True)
                    raise

        logger.info(f"Data from {src} successfully populated into {destination_dir}")

    def cleanup(self, keep_workspace: bool = False):
        """
        Removes the entire run directory unless explicitly told to keep it.

        Removal is best-effort: a failure is logged and not re-raised.

        Args:
            keep_workspace (bool): When True, nothing is deleted.
        """
        if keep_workspace:
            logger.info(f"Workspace preserved as requested: {self.run_dir.resolve()}")
            return

        logger.info(f"Cleaning up workspace: {self.run_dir.resolve()}")
        try:
            if self.run_dir.exists():
                shutil.rmtree(self.run_dir)
        except Exception as e:
            logger.error(f"Failed to clean up workspace {self.run_dir}: {e}")
|
|
178
|
+
|
dsat/tasks/__init__.py
ADDED
dsat/tasks/handlers.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import tempfile
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Tuple, Any
|
|
6
|
+
from dsat.models.task import TaskDefinition
|
|
7
|
+
from dsat.services.data_analyzer import DataAnalyzer
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TaskHandler(ABC):
    """
    Base class for handlers that translate between logical TaskDefinition and physical file interfaces required by DSATWorkflow.

    Each handler encapsulates preparation and parsing logic for specific task types (e.g., Kaggle, QA),
    allowing the workflow itself to remain task-agnostic.
    """

    def __init__(self):
        """
        Initialize the handler and create a temporary, self-managed directory
        for storing physical files generated for the task.

        If the directory cannot be created the error is logged and
        ``temp_dir`` is left as None; subclasses that need it check for that.
        """
        try:
            self.temp_dir = tempfile.TemporaryDirectory()
        except Exception as e:
            logger.error(f"Failed to create temporary directory for TaskHandler: {e}")
            self.temp_dir = None

        self.analyzer = DataAnalyzer()

    @abstractmethod
    def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
        """
        Prepare the physical file input required by the workflow.

        This method converts logical tasks into physical parameters needed by DSATWorkflow.solve().

        Args:
            task: Logical task definition.

        Returns:
            A tuple (description, io_instructions, data_dir, output_path) to pass to workflow.solve().
        """
        raise NotImplementedError

    @abstractmethod
    def parse_output(self, output_path: Path) -> Any:
        """
        Parse the workflow's output file into structured results required by benchmarking.

        This method converts physical output files back into logical answers.

        Args:
            output_path: Path where the workflow saved its output.

        Returns:
            Final answer in the format expected by benchmarking (e.g., string for QA, Path object for Kaggle).
        """
        raise NotImplementedError

    def cleanup(self):
        """
        Explicitly clean up the temporary directory.

        Safe to call more than once: after the first call ``temp_dir`` is None
        and later calls (including the one from ``__del__``) do nothing.
        Failures are logged, not raised.
        """
        # getattr: __del__ may run on an instance whose __init__ never got as
        # far as assigning temp_dir (e.g. a subclass __init__ raised first).
        temp_dir = getattr(self, "temp_dir", None)
        if not temp_dir:
            return

        # Drop the reference first so a repeated call is a no-op even if the
        # removal below fails.
        self.temp_dir = None
        try:
            temp_dir.cleanup()
            logger.debug(f"Successfully cleaned up temporary directory for {self.__class__.__name__}.")
        except Exception as e:
            logger.error(f"Error cleaning up temporary directory for {self.__class__.__name__}: {e}")

    def __del__(self):
        """Ensure cleanup is called when the object is garbage collected."""
        try:
            self.cleanup()
        except Exception:
            # Finalizers can run during interpreter shutdown, when module
            # globals such as the logger may already be gone; nothing useful
            # can be done with an error at that point.
            pass
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class KaggleTaskHandler(TaskHandler):
    """
    Handler for Kaggle-style file input/file output tasks.
    This is a "pass-through" implementation since tasks are already file-based.
    """

    def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
        """
        Extract paths, analyze data, and separate description from I/O instructions.

        Args:
            task: Task of type ``"kaggle"`` whose payload provides
                ``description``, ``public_data_dir`` and ``output_submission_path``.

        Returns:
            ``(description, io_instructions, data_dir, output_path)`` where the
            description has the analyzer's data report appended.

        Raises:
            ValueError: If the task type is not ``"kaggle"`` or a required
                payload key is missing or empty.
            FileNotFoundError: If ``public_data_dir`` is not an existing directory.
        """
        if task.task_type != "kaggle":
            raise ValueError("KaggleTaskHandler can only handle tasks of type 'kaggle'.")

        description = task.payload.get("description")
        raw_data_dir = task.payload.get("public_data_dir")
        raw_output_path = task.payload.get("output_submission_path")

        # Validate the raw payload values before wrapping them in Path:
        # Path(None) raises TypeError and a Path object is always truthy, so
        # checking after conversion can never report a missing key.
        if not all([description, raw_data_dir, raw_output_path]):
            raise ValueError("Kaggle task payload is missing required keys: 'description', 'public_data_dir', 'output_submission_path'.")

        data_dir = Path(raw_data_dir)
        output_path = Path(raw_output_path)

        if not data_dir.exists() or not data_dir.is_dir():
            raise FileNotFoundError(f"Kaggle public_data_dir not found: {data_dir}")

        logger.info(f"Analyzing input data for task '{task.task_id}'...")

        data_report = self.analyzer.analyze_data(data_dir, task_type="kaggle")
        io_instructions = self.analyzer.generate_io_instructions(output_path.name, optimization_context=False)

        augmented_description = f"{description}\n{data_report}"

        logger.debug(f"Preparing Kaggle task '{task.task_id}': data_dir='{data_dir}', output_path='{output_path}'")
        return augmented_description, io_instructions, data_dir, output_path

    def parse_output(self, output_path: Path) -> Path:
        """
        For Kaggle tasks, the result is the output file itself.
        This just validates that the file was created.

        Args:
            output_path: Where the submission file was expected.

        Returns:
            ``output_path`` unchanged, whether or not the file exists.
        """
        if not output_path.exists():
            # In actual evaluation, this will be caught and reported as a failure.
            logger.warning(f"Agent did not produce the required submission file at: {output_path}")
        # Return the path even if it doesn't exist, let the caller (e.g., benchmark) handle the file not found case.
        return output_path
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class QATaskHandler(TaskHandler):
    """
    Handler for simple question-answer (QA) tasks.

    The question string is materialised as ``problem.txt`` inside the handler's
    temporary directory and the agent is expected to write ``answer.txt`` next
    to it; that file is read back as the answer.
    """

    def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
        """
        Turn the QA question into the file-based inputs the workflow expects.

        Args:
            task: Task of type ``"qa"`` with a non-empty ``question`` in its payload.

        Returns:
            ``(description, io_instructions, data_dir, output_path)``.

        Raises:
            ValueError: Wrong task type, or no question in the payload.
            RuntimeError: The temporary directory could not be created.
        """
        if task.task_type != "qa":
            raise ValueError("QATaskHandler can only handle tasks of type 'qa'.")
        if not self.temp_dir:
            raise RuntimeError("Temporary directory not available for QATaskHandler.")

        question = task.payload.get("question")
        if not question:
            raise ValueError("QA task payload is missing required key: 'question'.")

        workdir = Path(self.temp_dir.name)

        # Input side of the contract: the question lives in problem.txt.
        question_path = workdir / "problem.txt"
        question_path.write_text(question, encoding='utf-8')

        # Output side of the contract: the agent writes answer.txt.
        answer_path = workdir / "answer.txt"

        instruction = "".join([
            "Your task is to answer the question found in `problem.txt`. ",
            "Write ONLY the final answer into the required output file.",
        ])

        report = self.analyzer.analyze_data(workdir, task_type="qa")
        io_instructions = self.analyzer.generate_io_instructions(answer_path.name, optimization_context=False)

        logger.debug(f"Preparing QA task '{task.task_id}': input file='{question_path}', expected output='{answer_path}'")
        return f"{instruction}\n{report}", io_instructions, workdir, answer_path

    def parse_output(self, output_path: Path) -> str:
        """
        Return the stripped contents of the answer file.

        When the file is missing or unreadable an ``[ERROR] ...`` string is
        returned instead of raising.
        """
        if not (output_path.exists() and output_path.is_file()):
            logger.warning(f"Agent did not produce the answer file for QA task at: {output_path}")
            return "[ERROR] Agent did not produce an answer file."

        try:
            raw_text = output_path.read_text(encoding='utf-8')
            answer = raw_text.strip()
            logger.debug(f"Parsed QA answer from '{output_path}': '{answer[:50]}...'")
            return answer
        except Exception as exc:
            logger.error(f"Failed to read or parse QA answer file '{output_path}': {exc}")
            return f"[ERROR] Failed to parse answer file: {exc}"
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class DataSciTaskHandler(TaskHandler):
    """
    Handler for DataSciBench tasks.

    A task consists of a prompt plus an optional directory of input files;
    the payload may also name an output directory.
    """

    def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
        """
        Build the workflow inputs for a DataSciBench task.

        Args:
            task: Task of type ``"datasci"``; the payload must contain
                ``prompt`` and may contain ``input_dir`` and ``output_dir``.

        Returns:
            ``(description, io_instructions, data_dir, output_path)``.

        Raises:
            ValueError: Wrong task type, or empty/missing prompt.
            RuntimeError: No usable input directory and no temporary directory.
        """
        if task.task_type != "datasci":
            raise ValueError("DataSciTaskHandler can only handle tasks of type 'datasci'.")

        payload = task.payload
        prompt = payload.get("prompt", "")
        input_dir = payload.get("input_dir", "")
        output_dir = payload.get("output_dir", "")

        if not prompt:
            raise ValueError("DataSci task payload is missing required key: 'prompt'.")

        # Prefer the task's own input directory; otherwise fall back to the
        # handler's temporary directory.
        if input_dir and Path(input_dir).exists():
            data_dir = Path(input_dir)
        else:
            if not self.temp_dir:
                raise RuntimeError("No data directory available for DataSciTaskHandler.")
            data_dir = Path(self.temp_dir.name)

        # output.csv goes into the requested output directory, or next to the data.
        output_root = Path(output_dir) if output_dir else data_dir
        output_path = output_root / "output.csv"

        # The data report is optional: if analysis fails the bare prompt is used.
        description = prompt
        try:
            report = self.analyzer.analyze_data(data_dir, task_type="datasci")
            description = f"{prompt}\n\n{report}"
        except Exception as exc:
            logger.debug(f"Data analysis skipped: {exc}")

        io_instructions = "\n".join((
            "All input data files are in the current working directory.",
            "Save all output files to the current working directory.",
            "Follow the task instructions carefully and generate the required output files.",
        ))

        logger.debug(f"Preparing DataSci task '{task.task_id}': data_dir='{data_dir}', output_dir='{output_dir}'")
        return description, io_instructions, data_dir, output_path

    def parse_output(self, output_path: Path) -> Path:
        """
        Return the directory holding the outputs.

        That is the parent of ``output_path`` when it exists, otherwise
        ``output_path`` itself is returned unchanged.
        """
        output_dir = output_path.parent
        return output_dir if output_dir.exists() else output_path
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class OpenEndedTaskHandler(TaskHandler):
    """
    Handler for open-ended tasks (mathematical modeling, simulations, strategy tasks).
    These tasks don't have ground truth answers and are evaluated via LLM judges.
    """

    # File extensions (lower-case) that are copied into the working directory as data.
    _DATA_SUFFIXES = ('.csv', '.json', '.txt', '.xlsx', '.parquet')
    # Task files that must never be treated as data.
    _TASK_FILE_NAMES = ('description.md', 'rubric.md')

    @staticmethod
    def _read_optional_text(file_path: str, fallback: str, label: str) -> str:
        """Return the text of ``file_path`` if it exists and is readable, else ``fallback``."""
        if not file_path or not Path(file_path).exists():
            return fallback
        try:
            text = Path(file_path).read_text(encoding='utf-8')
            logger.debug(f"Read {label} from {file_path} ({len(text)} chars)")
            return text
        except Exception as e:
            logger.warning(f"Failed to read {label} file {file_path}: {e}")
            return fallback

    def prepare_input(self, task: TaskDefinition) -> Tuple[str, str, Path, Path]:
        """
        Prepare open-ended task input.

        Data files from ``raw_data_dir`` are copied into the handler's temporary
        directory, and the description, optional rubric, data report and output
        instructions are combined into one prompt.

        Args:
            task: Task of type ``"open_ended"``. The payload may provide
                ``raw_data_dir``, ``description_file``, ``rubric_file``,
                ``description`` and ``rubric``; a readable file takes
                precedence over the inline value.

        Returns:
            ``(full_description, io_instructions, data_dir, output_path)`` where
            ``output_path`` is the ``artifacts`` directory the agent must create.

        Raises:
            ValueError: Wrong task type, or no description available.
            RuntimeError: The temporary directory could not be created.
        """
        import shutil

        if task.task_type != "open_ended":
            raise ValueError("OpenEndedTaskHandler can only handle tasks of type 'open_ended'.")
        if not self.temp_dir:
            raise RuntimeError("Temporary directory not available for OpenEndedTaskHandler.")

        # Get task paths from payload
        raw_dir_str = task.payload.get("raw_data_dir", "")
        description_file = task.payload.get("description_file", "")
        rubric_file = task.payload.get("rubric_file", "")

        # Use temp directory as working directory
        data_dir = Path(self.temp_dir.name)

        # Copy ONLY data files (CSV, JSON, etc.) - exclude description and rubric files
        if raw_dir_str:
            raw_dir = Path(raw_dir_str)
            if raw_dir.exists():
                for entry in raw_dir.iterdir():
                    if not entry.is_file():
                        continue
                    # Compare the lower-cased suffix so DATA.CSV is picked up as well.
                    if entry.suffix.lower() not in self._DATA_SUFFIXES:
                        continue
                    # Exclude description.md and rubric.md from being treated as data files
                    if entry.name in self._TASK_FILE_NAMES:
                        continue
                    shutil.copy2(entry, data_dir / entry.name)
                    logger.debug(f"Copied data file: {entry.name}")

        # Inline payload values are the default; a readable file overrides them.
        description = self._read_optional_text(
            description_file, task.payload.get("description", ""), "description"
        )
        rubric = self._read_optional_text(
            rubric_file, task.payload.get("rubric", ""), "rubric"
        )

        if not description:
            raise ValueError("Open-ended task payload is missing required key: 'description'.")

        # Output path - agent should create an artifacts directory or report
        output_path = data_dir / "artifacts"

        # Build the FULL task description directly in the prompt
        # Include description and evaluation criteria
        # IMPORTANT: For open-ended tasks, explicitly require artifacts directory creation

        task_description_section = f"""## Task Description

{description}
"""

        if rubric:
            task_description_section += f"""

## Evaluation Criteria

{rubric}
"""

        # Analyze available data files to provide schema information (excluding task files)
        data_report = self.analyzer.analyze_data(data_dir, task_type="datasci")

        # Combine everything into the full description
        # Note: data_report already contains "--- COMPREHENSIVE DATA REPORT ---" header
        full_description = f"""{task_description_section}

{data_report}

## CRITICAL OUTPUT INSTRUCTIONS

**YOU MUST CREATE AN `artifacts/` DIRECTORY AND SAVE ALL OUTPUTS THERE:**

```python
import os
artifact_dir = 'artifacts'
os.makedirs(artifact_dir, exist_ok=True)

# Save all your work to the artifacts directory:
# - Analysis code: artifacts/analysis.py
# - Visualizations: artifacts/plot_*.png
# - Data files: artifacts/results.csv
# - Models, notebooks, etc.
```

## Task Goals
- Your goal is to complete this task to the best of your ability
- Create appropriate output files (code, analysis, visualizations, etc.) in the `artifacts/` subdirectory
- The evaluation will be based on the quality and completeness of your work according to the evaluation criteria
"""

        # Generate I/O instructions - VERY EXPLICIT for open-ended tasks
        io_instructions = f"""**OUTPUT DIRECTORY STRUCTURE (MANDATORY):**

```python
# At the START of your code, create the artifacts directory:
import os
artifact_dir = 'artifacts'
os.makedirs(artifact_dir, exist_ok=True)

# Save ALL outputs to this directory:
# - Code: f"{{artifact_dir}}/solution.py"
# - Plots: f"{{artifact_dir}}/visualization_{{i}}.png"
# - Data: f"{{artifact_dir}}/results.csv"
```

**REQUIREMENTS:**
1. Create the `artifacts/` directory at the beginning of your code
2. Save ALL generated files (plots, models, data, code) to this directory
3. Do NOT save files to the current directory - use the artifacts/ subdirectory
4. Focus on quality, completeness, and following the evaluation criteria
"""

        logger.debug(f"Preparing open-ended task '{task.task_id}': output_path='{output_path}', description_len={len(description)}")
        return full_description, io_instructions, data_dir, output_path

    def parse_output(self, output_path: Path) -> Path:
        """
        For open-ended tasks, return the artifacts directory path.
        The actual evaluation is done by LLM judges, not CSV grading.

        Returns:
            ``output_path`` when it exists, otherwise its parent directory.
        """
        if not output_path.exists():
            # If artifacts directory doesn't exist, return the parent temp dir
            # This allows evaluation to proceed even if no artifacts were created
            logger.warning(f"Open-ended task did not create artifacts directory at: {output_path}")
            return output_path.parent

        logger.debug(f"Parsed open-ended task artifacts from: {output_path}")
        return output_path
|