dslighting-1.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
dslighting/core/task_detector.py
@@ -0,0 +1,422 @@
"""
Task detection and automatic task type inference.

This module provides intelligent detection of data science task types
from various data sources (directories, files, DataFrames, etc.).
"""

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pandas as pd

from dslighting.utils.defaults import WORKFLOW_RECOMMENDATIONS

logger = logging.getLogger(__name__)


@dataclass
class TaskDetection:
    """
    Result of task detection.

    Attributes:
        task_type: Detected task type (kaggle, qa, datasci, open_ended)
        task_mode: Task mode (standard_ml, open_ended)
        data_dir: Data directory path (if applicable)
        description: Task description (if found)
        io_instructions: I/O instructions
        recommended_workflow: Recommended workflow for this task
        confidence: Detection confidence (0-1)
        metadata: Additional detection metadata
    """
    task_type: str
    task_mode: str
    data_dir: Optional[Path] = None
    description: str = ""
    io_instructions: str = ""
    recommended_workflow: str = "aide"
    confidence: float = 0.8
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


class TaskDetector:
    """
    Automatically detect task types from data sources.

    This class analyzes directory structures, file types, and content
    to determine the most appropriate task type and workflow.
    """

    def __init__(self):
        self.logger = logger

    def detect(self, source: Any) -> TaskDetection:
        """
        Detect task type from a data source.

        Args:
            source: Data source (path, DataFrame, dict, etc.)

        Returns:
            TaskDetection with inferred task information
        """
        # DataFrame: Custom ML task (highest priority for DataFrames)
        if isinstance(source, pd.DataFrame):
            return self._detect_dataframe_task(source)

        # Path: Directory or file (check before treating as string)
        if isinstance(source, (str, Path)):
            path = Path(source)
            if path.is_dir():
                return self._detect_from_directory(path)
            elif path.is_file():
                return self._detect_from_file(path)
            # If path doesn't exist, check if it's a short question (QA task)
            elif len(source) < 500 and not any(c in source for c in ['/', '\\', '.csv', '.json', '.txt']):
                return self._detect_qa_task(source)

        # Dict: QA task
        if isinstance(source, dict):
            return self._detect_qa_task(str(source))

        # Fallback
        return TaskDetection(
            task_type="datasci",
            task_mode="standard_ml",
            description="Custom data science task",
            recommended_workflow="aide",
            confidence=0.5
        )

    def _detect_qa_task(self, question: str) -> TaskDetection:
        """Detect a QA (question-answering) task."""
        self.logger.info("Detected QA task")

        return TaskDetection(
            task_type="qa",
            task_mode="standard_ml",
            description=f"Answer the following question: {question}",
            io_instructions="Provide a clear, concise answer.",
            recommended_workflow=WORKFLOW_RECOMMENDATIONS["qa"]["default"],
            confidence=0.95,
            metadata={"question": question}
        )

    def _detect_dataframe_task(self, df: pd.DataFrame) -> TaskDetection:
        """Detect task type from a pandas DataFrame."""
        self.logger.info(f"Detected DataFrame task with {len(df)} rows, {len(df.columns)} columns")

        # Check if it has a target column (common pattern: last column is target)
        has_target = len(df.columns) > 1

        if has_target:
            description = (
                f"Build a machine learning model to predict the target variable. "
                f"Dataset has {len(df)} rows and {len(df.columns)} columns."
            )
            io_instructions = (
                "Train a model on the data and provide predictions. "
                "Use appropriate evaluation metrics."
            )
            recommended_workflow = "aide"
        else:
            description = (
                f"Perform exploratory data analysis and clustering. "
                f"Dataset has {len(df)} rows and {len(df.columns)} columns."
            )
            io_instructions = "Analyze the data and provide insights."
            recommended_workflow = "data_interpreter"

        return TaskDetection(
            task_type="datasci",
            task_mode="standard_ml" if has_target else "open_ended",
            description=description,
            io_instructions=io_instructions,
            recommended_workflow=recommended_workflow,
            confidence=0.85,
            metadata={
                "shape": df.shape,
                "columns": list(df.columns),
                "dtypes": df.dtypes.to_dict()
            }
        )

    def _detect_from_directory(self, dir_path: Path) -> TaskDetection:
        """Detect task type from a directory structure."""
        self.logger.info(f"Detecting task from directory: {dir_path}")

        # Priority 1: Check for MLE competition structure (prepared/public & prepared/private)
        if self._is_mle_competition(dir_path):
            return self._detect_mle_competition(dir_path)

        # Priority 2: Check for Kaggle competition structure (train.csv, test.csv)
        if self._is_kaggle_format(dir_path):
            return self._detect_kaggle_competition(dir_path)

        # Priority 3: Check for open-ended task
        if self._is_open_ended_task(dir_path):
            return self._detect_open_ended_task(dir_path)

        # Priority 4: Check for DataSci task
        if self._is_datasci_task(dir_path):
            return self._detect_datasci_task(dir_path)

        # Default: treat as generic data directory
        return self._detect_generic_directory(dir_path)

    def _detect_from_file(self, file_path: Path) -> TaskDetection:
        """Detect task type from a single file."""
        self.logger.info(f"Detecting task from file: {file_path}")

        suffix = file_path.suffix.lower()

        if suffix == ".csv":
            return self._detect_csv_file(file_path)
        elif suffix in [".json", ".jsonl"]:
            return self._detect_json_file(file_path)
        elif suffix in [".txt", ".md"]:
            return self._detect_text_file(file_path)
        else:
            # Unknown file type
            return TaskDetection(
                task_type="datasci",
                task_mode="standard_ml",
                description=f"Process file: {file_path.name}",
                recommended_workflow="aide",
                confidence=0.6
            )

    def _is_mle_competition(self, dir_path: Path) -> bool:
        """Check if directory is an MLE competition structure (prepared/public & prepared/private)."""
        # Check for prepared/public and prepared/private
        prepared = dir_path / "prepared"
        if prepared.exists():
            public = prepared / "public"
            private = prepared / "private"
            if public.exists() and private.exists():
                return True
        return False

    def _is_kaggle_format(self, dir_path: Path) -> bool:
        """Check if directory has Kaggle-style files (train.csv, test.csv, sample_submission.csv)."""
        # Check for train.csv + test.csv
        has_train = (dir_path / "train.csv").exists()
        has_test = (dir_path / "test.csv").exists()
        has_sample_submission = (dir_path / "sample_submission.csv").exists()

        return has_train and (has_test or has_sample_submission)

    def _is_open_ended_task(self, dir_path: Path) -> bool:
        """Check if directory is an open-ended task."""
        has_description = (dir_path / "description.md").exists()
        has_rubric = (dir_path / "rubric.md").exists()
        return has_description and has_rubric

    def _is_datasci_task(self, dir_path: Path) -> bool:
        """Check if directory is a DataSci task."""
        has_prompt = (dir_path / "prompt.txt").exists()
        has_description = (dir_path / "description.md").exists()
        return has_prompt or has_description

    def _detect_mle_competition(self, dir_path: Path) -> TaskDetection:
        """Detect MLE competition task (standard DSAT format with prepared/public & prepared/private)."""
        self.logger.info("Detected MLE competition structure (prepared/public & prepared/private)")

        # Load description if exists
        description = ""
        description_file = dir_path / "description.md"
        if description_file.exists():
            description = description_file.read_text(encoding='utf-8')

        # MLE competitions use kaggle task_type internally
        return TaskDetection(
            task_type="kaggle",
            task_mode="standard_ml",
            data_dir=dir_path,
            description=description or "MLE competition task",
            io_instructions="Train a model and generate predictions for the test set.",
            recommended_workflow="aide",
            confidence=0.95,
            metadata={"structure": "mle_competition"}
        )

    def _detect_kaggle_competition(self, dir_path: Path) -> TaskDetection:
        """Detect Kaggle competition task."""
        self.logger.info("Detected Kaggle competition structure")

        # Load description if exists
        description = ""
        description_file = dir_path / "description.md"
        if description_file.exists():
            description = description_file.read_text(encoding='utf-8')

        # Detect if tabular or time series
        is_tabular = self._is_tabular_competition(dir_path)
        is_time_series = self._is_time_series_competition(dir_path)

        if is_time_series:
            recommended = WORKFLOW_RECOMMENDATIONS["kaggle_competition"]["time_series"][0]
        elif is_tabular:
            recommended = WORKFLOW_RECOMMENDATIONS["kaggle_competition"]["tabular"][0]
        else:
            recommended = WORKFLOW_RECOMMENDATIONS["kaggle_competition"]["default"]

        return TaskDetection(
            task_type="kaggle",
            task_mode="standard_ml",
            data_dir=dir_path,
            description=description or "Kaggle competition task",
            io_instructions="Train a model and generate predictions for the test set.",
            recommended_workflow=recommended,
            confidence=0.9,
            metadata={"structure": "kaggle_competition"}
        )

    def _detect_open_ended_task(self, dir_path: Path) -> TaskDetection:
        """Detect open-ended exploration task."""
        self.logger.info("Detected open-ended task")

        description_file = dir_path / "description.md"
        description = ""
        if description_file.exists():
            description = description_file.read_text(encoding='utf-8')

        return TaskDetection(
            task_type="open_ended",
            task_mode="open_ended",
            data_dir=dir_path,
            description=description or "Open-ended data exploration task",
            io_instructions="Explore the data and provide comprehensive insights.",
            recommended_workflow=WORKFLOW_RECOMMENDATIONS["open_ended"]["default"],
            confidence=0.9,
            metadata={"structure": "open_ended"}
        )

    def _detect_datasci_task(self, dir_path: Path) -> TaskDetection:
        """Detect DataSci task."""
        self.logger.info("Detected DataSci task")

        prompt_file = dir_path / "prompt.txt"
        description_file = dir_path / "description.md"

        description = ""
        if prompt_file.exists():
            description = prompt_file.read_text(encoding='utf-8')
        elif description_file.exists():
            description = description_file.read_text(encoding='utf-8')

        return TaskDetection(
            task_type="datasci",
            task_mode="standard_ml",
            data_dir=dir_path,
            description=description or "Data science task",
            io_instructions="Complete the data science task as described.",
            recommended_workflow=WORKFLOW_RECOMMENDATIONS["datasci"]["default"],
            confidence=0.85,
            metadata={"structure": "datasci"}
        )

    def _detect_generic_directory(self, dir_path: Path) -> TaskDetection:
        """Detect task from generic directory."""
        self.logger.info("Treating as generic data directory")

        # List files in directory
        files = list(dir_path.glob("*"))
        file_names = [f.name for f in files if f.is_file()]

        description = (
            f"Process data in directory: {dir_path.name}. "
            f"Contains {len(files)} files: {', '.join(file_names[:5])}"
        )

        return TaskDetection(
            task_type="datasci",
            task_mode="standard_ml",
            data_dir=dir_path,
            description=description,
            io_instructions="Analyze the data and provide results.",
            recommended_workflow="aide",
            confidence=0.7,
            metadata={"files": file_names}
        )

    def _detect_csv_file(self, file_path: Path) -> TaskDetection:
        """Detect task type from a CSV file."""
        try:
            df = pd.read_csv(file_path, nrows=10)
            return self._detect_dataframe_task(df)
        except Exception as e:
            self.logger.warning(f"Failed to read CSV file {file_path}: {e}")
            return TaskDetection(
                task_type="datasci",
                task_mode="standard_ml",
                description=f"Process CSV file: {file_path.name}",
                recommended_workflow="aide",
                confidence=0.6
            )

    def _detect_json_file(self, file_path: Path) -> TaskDetection:
        """Detect task type from a JSON file."""
        import json

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list) and len(data) > 0:
                # Likely a dataset
                return TaskDetection(
                    task_type="datasci",
                    task_mode="standard_ml",
                    description=f"Process JSON dataset with {len(data)} records",
                    recommended_workflow="aide",
                    confidence=0.7
                )
            else:
                # Likely a QA task or config
                return self._detect_qa_task(str(data))
        except Exception as e:
            self.logger.warning(f"Failed to read JSON file {file_path}: {e}")
            return TaskDetection(
                task_type="datasci",
                task_mode="standard_ml",
                description=f"Process JSON file: {file_path.name}",
                recommended_workflow="aide",
                confidence=0.6
            )

    def _detect_text_file(self, file_path: Path) -> TaskDetection:
        """Detect task type from a text/markdown file."""
        content = file_path.read_text(encoding='utf-8')

        if len(content) < 1000:
            # Short text: likely QA or description
            return self._detect_qa_task(content)
        else:
            # Long text: likely description for a task
            return TaskDetection(
                task_type="open_ended",
                task_mode="open_ended",
                description=content,
                recommended_workflow="deepanalyze",
                confidence=0.75
            )

    def _is_tabular_competition(self, dir_path: Path) -> bool:
        """Check if competition is tabular (has CSV files)."""
        csv_files = list(dir_path.glob("**/*.csv"))
        return len(csv_files) > 0

    def _is_time_series_competition(self, dir_path: Path) -> bool:
        """Check if competition is time series."""
        # Heuristic: check for time-related keywords in file/directory names
        name_lower = str(dir_path).lower()
        time_keywords = ["time", "temporal", "forecast", "series", "date"]
        return any(keyword in name_lower for keyword in time_keywords)
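For context, a minimal usage sketch of the detector above. The import path follows the file layout listed in this diff; the sample DataFrame and the ./my-competition directory are hypothetical illustrations, not part of the package.

# Usage sketch (illustrative, not from the package): driving TaskDetector.detect.
import pandas as pd
from dslighting.core.task_detector import TaskDetector

detector = TaskDetector()

# A DataFrame with more than one column is treated as a supervised "datasci" task.
df = pd.DataFrame({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})
result = detector.detect(df)
print(result.task_type, result.recommended_workflow)  # datasci aide

# An existing directory is matched against structures in priority order:
# prepared/public + prepared/private (MLE), then train.csv/test.csv (Kaggle),
# then description.md + rubric.md (open-ended), then prompt.txt/description.md (datasci).
result = detector.detect("./my-competition")  # hypothetical path
print(result.task_type, result.confidence)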
dslighting/utils/__init__.py
@@ -0,0 +1,19 @@
"""
Utility modules for DSLighting.
"""

from dslighting.utils.defaults import (
    DEFAULT_WORKFLOW,
    DEFAULT_LLM_MODEL,
    DEFAULT_TEMPERATURE,
    DEFAULT_MAX_ITERATIONS,
    WORKFLOW_RECOMMENDATIONS,
)

__all__ = [
    "DEFAULT_WORKFLOW",
    "DEFAULT_LLM_MODEL",
    "DEFAULT_TEMPERATURE",
    "DEFAULT_MAX_ITERATIONS",
    "WORKFLOW_RECOMMENDATIONS",
]
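Because of these re-exports, the defaults are also importable from the dslighting.utils package itself; a trivial sketch:

# Sketch: the re-exports above expose the defaults at package level.
from dslighting.utils import DEFAULT_WORKFLOW, WORKFLOW_RECOMMENDATIONS

assert DEFAULT_WORKFLOW == "aide"
print(WORKFLOW_RECOMMENDATIONS["kaggle_competition"]["default"])  # aide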
dslighting/utils/defaults.py
@@ -0,0 +1,151 @@
"""
Default configurations for DSLighting simplified API.

This module defines sensible defaults that can be overridden by:
1. User parameters (highest priority)
2. Environment variables
3. These defaults (lowest priority)
"""

from typing import Dict, List, Any


# ============================================================================
# LLM Defaults
# ============================================================================

DEFAULT_LLM_MODEL = "gpt-4o-mini"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_MAX_RETRIES = 3
DEFAULT_API_BASE = "https://api.openai.com/v1"


# ============================================================================
# Workflow Defaults
# ============================================================================

DEFAULT_WORKFLOW = "aide"
DEFAULT_MAX_ITERATIONS = 5
DEFAULT_NUM_DRAFTS = 5


# ============================================================================
# Sandbox Defaults
# ============================================================================

DEFAULT_SANDBOX_TIMEOUT = 6 * 3600  # 6 hours


# ============================================================================
# Workspace Defaults
# ============================================================================

DEFAULT_WORKSPACE_DIR = "./runs/dslighting"
DEFAULT_KEEP_WORKSPACE_ON_FAILURE = True
DEFAULT_KEEP_ALL_WORKSPACES = False


# ============================================================================
# Workflow Recommendations
# ============================================================================

"""
Workflow recommendations based on task type and data characteristics.

This mapping helps auto-select the best workflow for a given task.
Users can override this by explicitly specifying the workflow.
"""

WORKFLOW_RECOMMENDATIONS: Dict[str, Dict[str, Any]] = {
    "kaggle_competition": {
        "tabular": ["autokaggle", "aide"],
        "time_series": ["aide", "automind"],
        "default": "aide"
    },
    "open_ended": {
        "analysis": ["deepanalyze", "automind"],
        "modeling": ["aide", "deepanalyze"],
        "default": "deepanalyze"
    },
    "quick_analysis": {
        "eda": ["data_interpreter"],
        "debugging": ["data_interpreter"],
        "default": "data_interpreter"
    },
    "qa": {
        "default": "aide"
    },
    "datasci": {
        "default": "aide"
    }
}


# ============================================================================
# Full Default Configuration
# ============================================================================

"""
Complete default configuration structure.

This can be merged with user parameters and environment variables
to create the final DSATConfig.
"""

DEFAULT_CONFIG: Dict[str, Any] = {
    "llm": {
        "model": DEFAULT_LLM_MODEL,
        "temperature": DEFAULT_TEMPERATURE,
        "max_retries": DEFAULT_MAX_RETRIES,
        "api_base": DEFAULT_API_BASE,
        "api_key": None,  # Will be loaded from env
    },
    "workflow": {
        "name": DEFAULT_WORKFLOW,
        "params": {}
    },
    "run": {
        "name": "dsat_run",  # Use "dsat_run" to let DSATRunner auto-generate: dsat_run_{task_id}_{uid}
        "total_steps": DEFAULT_MAX_ITERATIONS,
        "keep_all_workspaces": DEFAULT_KEEP_ALL_WORKSPACES,
        "keep_workspace_on_failure": DEFAULT_KEEP_WORKSPACE_ON_FAILURE,
        "parameters": {},
    },
    "sandbox": {
        "timeout": DEFAULT_SANDBOX_TIMEOUT,
    },
    "agent": {
        "search": {
            "num_drafts": DEFAULT_NUM_DRAFTS,
            "max_iterations": DEFAULT_MAX_ITERATIONS,
            "debug_prob": 0.8,
            "max_debug_depth": 10,
        },
        "max_retries": 10,
        "autokaggle": {
            "max_attempts_per_phase": 10,
            "success_threshold": 3.0,
        }
    }
}


# ============================================================================
# Environment Variable Names
# ============================================================================

"""
Environment variables that DSLighting reads.

These can be set in .env file or exported in the shell.
"""

ENV_API_KEY = "API_KEY"
ENV_API_BASE = "API_BASE"
ENV_LLM_MODEL = "LLM_MODEL"
ENV_LLM_PROVIDER = "LLM_PROVIDER"
ENV_LLM_MODEL_CONFIGS = "LLM_MODEL_CONFIGS"
ENV_LLM_TEMPERATURE = "LLM_TEMPERATURE"

ENV_DSLIGHTING_DEFAULT_WORKFLOW = "DSLIGHTING_DEFAULT_WORKFLOW"
ENV_DSLIGHTING_WORKSPACE_DIR = "DSLIGHTING_WORKSPACE_DIR"