dslighting 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data loading and preprocessing.
|
|
3
|
+
|
|
4
|
+
This module provides a unified interface for loading data from various sources
|
|
5
|
+
with automatic type detection and validation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Dict, List, Optional, Union
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from dslighting.core.task_detector import TaskDetector, TaskDetection
|
|
16
|
+
|
|
17
|
+
# Module-level logger; DataLoader instances keep a reference to it as `self.logger`.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class LoadedData:
    """
    Result of a load operation: the original source plus what was derived from it.

    Attributes:
        source: The object that was passed in (path, DataFrame, dict, ...).
        data_dir: Directory associated with the data, when one is known.
        task_detection: Detection result describing the task, if any.
        task_id: Task/competition identifier derived from the path, if any.
        metadata: Free-form extra information; an omitted/None value is
            replaced by a fresh empty dict after construction.
    """
    source: Any
    data_dir: Optional[Path] = None
    task_detection: Optional[TaskDetection] = None
    task_id: Optional[str] = None
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        # Give every instance its own dict instead of sharing a mutable default.
        self.metadata = {} if self.metadata is None else self.metadata

    def get_description(self) -> str:
        """Return the detected task description, or "" without a detection."""
        detection = self.task_detection
        return detection.description if detection else ""

    def get_io_instructions(self) -> str:
        """Return the detected I/O instructions, or "" without a detection."""
        detection = self.task_detection
        return detection.io_instructions if detection else ""

    def get_recommended_workflow(self) -> str:
        """Return the recommended workflow, falling back to "aide"."""
        detection = self.task_detection
        return detection.recommended_workflow if detection else "aide"

    def get_task_type(self) -> str:
        """Return the detected task type, falling back to "datasci"."""
        detection = self.task_detection
        return detection.task_type if detection else "datasci"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class DataLoader:
    """
    Load data from various sources with optional automatic type detection.

    By default, all data is treated as MLE format (prepared/public & prepared/private).
    Set auto_detect=True to enable automatic task type detection.

    Supported sources:
    - File paths (CSV, JSON, parquet)
    - Directory paths (competition layout)
    - Pandas DataFrames
    - Dict/question strings (QA tasks)
    """

    # Name of the optional task-description file looked up inside a data directory.
    _DESCRIPTION_FILENAME = "description.md"

    def __init__(self, auto_detect: bool = False):
        """
        Initialize DataLoader.

        Args:
            auto_detect: If True, automatically detect task type.
                If False (default), treat all data as MLE format.
        """
        self.detector = TaskDetector()
        self.auto_detect = auto_detect
        self.logger = logger

    def load(
        self,
        source: Union[str, Path, pd.DataFrame, dict],
        auto_detect: Optional[bool] = None,
        **kwargs
    ) -> LoadedData:
        """
        Load data from a source.

        Args:
            source: Data source (path, DataFrame, dict, etc.)
            auto_detect: Override the instance's auto_detect setting.
                If None (default), use instance setting.
            **kwargs: Stored verbatim as the ``metadata`` of the result.

        Returns:
            LoadedData with detection information
        """
        self.logger.info(f"Loading data from source: {type(source).__name__}")

        # A per-call value wins over the instance default; None means "not given".
        should_auto_detect = self.auto_detect if auto_detect is None else auto_detect

        # Detect task type (or use MLE default)
        if should_auto_detect:
            task_detection = self.detector.detect(source)
        else:
            task_detection = self._get_default_mle_detection(source)

        data_dir = self._extract_data_dir(source, task_detection)
        task_id = self._extract_task_id(source, data_dir)

        loaded_data = LoadedData(
            source=source,
            data_dir=data_dir,
            task_detection=task_detection,
            task_id=task_id,
            metadata=kwargs
        )

        self.logger.info(
            f"Loaded data: task_type={task_detection.task_type}, "
            f"workflow={task_detection.recommended_workflow}"
        )

        return loaded_data

    def load_csv(
        self,
        path: Union[str, Path],
        **kwargs
    ) -> LoadedData:
        """
        Load data from a CSV file.

        The file is read into a DataFrame and that DataFrame (not the path)
        is handed to ``load``.

        Args:
            path: Path to CSV file
            **kwargs: Additional parameters passed to pd.read_csv

        Returns:
            LoadedData with DataFrame and detection

        Raises:
            Exception: Whatever ``pd.read_csv`` raises is logged and re-raised.
        """
        path = Path(path)
        self.logger.info(f"Loading CSV file: {path}")

        # Only the read is guarded, so failures further down the pipeline are
        # not mislabelled as CSV read errors.
        try:
            df = pd.read_csv(path, **kwargs)
        except Exception as e:
            self.logger.error(f"Failed to load CSV file {path}: {e}")
            raise

        return self.load(df)

    def load_directory(
        self,
        path: Union[str, Path],
        **kwargs
    ) -> LoadedData:
        """
        Load data from a directory.

        Args:
            path: Path to directory
            **kwargs: Additional parameters

        Returns:
            LoadedData with directory and detection

        Raises:
            ValueError: If ``path`` is not an existing directory.
        """
        path = Path(path)
        self.logger.info(f"Loading directory: {path}")

        if not path.is_dir():
            raise ValueError(f"Not a directory: {path}")

        return self.load(path, **kwargs)

    def load_dataframe(
        self,
        df: pd.DataFrame,
        **kwargs
    ) -> LoadedData:
        """
        Load data from a pandas DataFrame.

        Args:
            df: Pandas DataFrame
            **kwargs: Additional parameters

        Returns:
            LoadedData with DataFrame and detection
        """
        self.logger.info(f"Loading DataFrame with shape {df.shape}")
        return self.load(df, **kwargs)

    def load_competition(
        self,
        competition_id: str,
        data_dir: Optional[Union[str, Path]] = None,
        **kwargs
    ) -> LoadedData:
        """
        Load data from a competition (MLE-Bench style).

        Args:
            competition_id: Competition identifier
            data_dir: Base data directory containing competitions.
                Defaults to ``data/competitions`` relative to the working directory.
            **kwargs: Additional parameters

        Returns:
            LoadedData with competition data

        Raises:
            ValueError: If the competition directory does not exist.
        """
        self.logger.info(f"Loading competition: {competition_id}")

        base_dir = Path("data/competitions") if data_dir is None else Path(data_dir)
        competition_dir = base_dir / competition_id

        # Require an actual directory, consistent with the error message below
        # and with load_directory().
        if not competition_dir.is_dir():
            raise ValueError(f"Competition directory not found: {competition_dir}")

        return self.load(competition_dir, **kwargs)

    def load_question(
        self,
        question: str,
        **kwargs
    ) -> LoadedData:
        """
        Load a QA question.

        Args:
            question: Question text
            **kwargs: Additional parameters

        Returns:
            LoadedData with question
        """
        self.logger.info("Loading QA question")
        return self.load(question, **kwargs)

    def _read_description(self, directory: Path, default: str) -> str:
        """
        Best-effort read of the description file inside ``directory``.

        Args:
            directory: Directory expected to contain the description file.
            default: Value returned when the file is missing or unreadable.

        Returns:
            The file's text, or ``default``.
        """
        desc_file = directory / self._DESCRIPTION_FILENAME
        if not desc_file.is_file():
            return default

        try:
            text = desc_file.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as exc:
            # A broken description must not abort loading, but it should be visible.
            self.logger.warning(f"Could not read description from {desc_file}: {exc}")
            return default

        self.logger.info(f"Loaded description from {desc_file}")
        return text

    def _locate_competition_dir(self, path: Path) -> Path:
        """
        Search common locations for a competition named after ``path``.

        Args:
            path: Resolved path that does not exist; its last component is
                used as the competition ID.

        Returns:
            The first candidate that is an existing directory, otherwise
            ``path`` itself.
        """
        competition_id = path.name
        cwd = Path.cwd()

        # Candidates are derived from the working directory and the package
        # location only, so the lookup behaves the same on every machine.
        search_locations = [
            # Current project: ./data/competitions/
            cwd / "data" / "competitions" / competition_id,
            # Parent dslighting: ../dslighting/data/competitions/
            cwd.parent / "dslighting" / "data" / "competitions" / competition_id,
            # Parent data: ../data/competitions/
            cwd.parent / "data" / "competitions" / competition_id,
            # Three levels above this module file
            Path(__file__).parent.parent.parent / "data" / "competitions" / competition_id,
        ]

        for location in search_locations:
            self.logger.info(f" Trying: {location}")
            if location.is_dir():
                self.logger.info(f" ✓ Found data at: {location}")
                return location

        # Last resort: use the original resolved path
        self.logger.warning(f" Could not find data, using original path: {path}")
        return path

    def _get_default_mle_detection(self, source: Any) -> TaskDetection:
        """
        Get default MLE competition detection for any data source.

        This method treats all data as MLE format (prepared/public & prepared/private).
        It extracts the data directory if available.

        Args:
            source: Data source (path, DataFrame, dict, etc.)

        Returns:
            TaskDetection configured for MLE competition
        """
        self.logger.info("Using default MLE competition format (prepared/public & prepared/private)")

        data_dir = None
        description = "MLE competition task"

        # NOTE(review): every str is handled as a path here, including question
        # text passed through load_question() — confirm that is intended.
        if isinstance(source, (str, Path)):
            path = Path(source).resolve()  # Convert to absolute path
            self.logger.info(f"Resolved path: {path}")

            if not path.exists():
                self.logger.warning(f"Path does not exist: {path}")
                data_dir = self._locate_competition_dir(path)
            elif path.is_dir():
                data_dir = path
                self.logger.info(f"Data directory found: {data_dir}")
            elif path.is_file():
                data_dir = path.parent
                self.logger.info(f"Data directory (from file parent): {data_dir}")

            if data_dir is not None:
                description = self._read_description(data_dir, description)

        # Imported here rather than at module top, as in the original code.
        from dslighting.utils.defaults import WORKFLOW_RECOMMENDATIONS

        return TaskDetection(
            task_type="kaggle",  # MLE uses kaggle task type internally
            task_mode="standard_ml",
            data_dir=data_dir,
            description=description,
            io_instructions="Train a model and generate predictions for the test set.",
            recommended_workflow=WORKFLOW_RECOMMENDATIONS.get("kaggle_competition", {}).get("default", "aide"),
            confidence=1.0,  # High confidence since this is explicit user intent
            metadata={"structure": "mle_competition", "auto_detected": False}
        )

    def _extract_data_dir(
        self,
        source: Any,
        detection: TaskDetection
    ) -> Optional[Path]:
        """
        Extract data directory from source and detection.

        Args:
            source: Original data source
            detection: Task detection result

        Returns:
            Path to data directory or None
        """
        # A directory already chosen by detection takes precedence.
        if detection.data_dir:
            return detection.data_dir

        if isinstance(source, (str, Path)):
            path = Path(source)
            if path.is_file():
                return path.parent
            if path.is_dir():
                return path

        # No data directory
        return None

    def _extract_task_id(
        self,
        source: Any,
        data_dir: Optional[Path]
    ) -> Optional[str]:
        """
        Extract task/competition ID from source path.

        Args:
            source: Original data source
            data_dir: Detected data directory

        Returns:
            Task ID (e.g., "bike-sharing-demand") or None
        """
        if isinstance(source, (str, Path)):
            path = Path(source)

            # A file is identified by the directory that contains it.
            if path.is_file():
                return path.parent.name

            if path.is_dir():
                return path.name

        # Fall back to the detected data directory's name.
        if data_dir:
            return data_dir.name

        # No task_id found
        return None
|