dslighting 1.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. dsat/__init__.py +3 -0
  2. dsat/benchmark/__init__.py +1 -0
  3. dsat/benchmark/benchmark.py +168 -0
  4. dsat/benchmark/datasci.py +291 -0
  5. dsat/benchmark/mle.py +777 -0
  6. dsat/benchmark/sciencebench.py +304 -0
  7. dsat/common/__init__.py +0 -0
  8. dsat/common/constants.py +11 -0
  9. dsat/common/exceptions.py +48 -0
  10. dsat/common/typing.py +19 -0
  11. dsat/config.py +79 -0
  12. dsat/models/__init__.py +3 -0
  13. dsat/models/candidates.py +16 -0
  14. dsat/models/formats.py +52 -0
  15. dsat/models/task.py +64 -0
  16. dsat/operators/__init__.py +0 -0
  17. dsat/operators/aflow_ops.py +90 -0
  18. dsat/operators/autokaggle_ops.py +170 -0
  19. dsat/operators/automind_ops.py +38 -0
  20. dsat/operators/base.py +22 -0
  21. dsat/operators/code.py +45 -0
  22. dsat/operators/dsagent_ops.py +123 -0
  23. dsat/operators/llm_basic.py +84 -0
  24. dsat/prompts/__init__.py +0 -0
  25. dsat/prompts/aflow_prompt.py +76 -0
  26. dsat/prompts/aide_prompt.py +52 -0
  27. dsat/prompts/autokaggle_prompt.py +290 -0
  28. dsat/prompts/automind_prompt.py +29 -0
  29. dsat/prompts/common.py +51 -0
  30. dsat/prompts/data_interpreter_prompt.py +82 -0
  31. dsat/prompts/dsagent_prompt.py +88 -0
  32. dsat/runner.py +554 -0
  33. dsat/services/__init__.py +0 -0
  34. dsat/services/data_analyzer.py +387 -0
  35. dsat/services/llm.py +486 -0
  36. dsat/services/llm_single.py +421 -0
  37. dsat/services/sandbox.py +386 -0
  38. dsat/services/states/__init__.py +0 -0
  39. dsat/services/states/autokaggle_state.py +43 -0
  40. dsat/services/states/base.py +14 -0
  41. dsat/services/states/dsa_log.py +13 -0
  42. dsat/services/states/experience.py +237 -0
  43. dsat/services/states/journal.py +153 -0
  44. dsat/services/states/operator_library.py +290 -0
  45. dsat/services/vdb.py +76 -0
  46. dsat/services/workspace.py +178 -0
  47. dsat/tasks/__init__.py +3 -0
  48. dsat/tasks/handlers.py +376 -0
  49. dsat/templates/open_ended/grade_template.py +107 -0
  50. dsat/tools/__init__.py +4 -0
  51. dsat/utils/__init__.py +0 -0
  52. dsat/utils/context.py +172 -0
  53. dsat/utils/dynamic_import.py +71 -0
  54. dsat/utils/parsing.py +33 -0
  55. dsat/workflows/__init__.py +12 -0
  56. dsat/workflows/base.py +53 -0
  57. dsat/workflows/factory.py +439 -0
  58. dsat/workflows/manual/__init__.py +0 -0
  59. dsat/workflows/manual/autokaggle_workflow.py +148 -0
  60. dsat/workflows/manual/data_interpreter_workflow.py +153 -0
  61. dsat/workflows/manual/deepanalyze_workflow.py +484 -0
  62. dsat/workflows/manual/dsagent_workflow.py +76 -0
  63. dsat/workflows/search/__init__.py +0 -0
  64. dsat/workflows/search/aflow_workflow.py +344 -0
  65. dsat/workflows/search/aide_workflow.py +283 -0
  66. dsat/workflows/search/automind_workflow.py +237 -0
  67. dsat/workflows/templates/__init__.py +0 -0
  68. dsat/workflows/templates/basic_kaggle_loop.py +71 -0
  69. dslighting/__init__.py +170 -0
  70. dslighting/core/__init__.py +13 -0
  71. dslighting/core/agent.py +646 -0
  72. dslighting/core/config_builder.py +318 -0
  73. dslighting/core/data_loader.py +422 -0
  74. dslighting/core/task_detector.py +422 -0
  75. dslighting/utils/__init__.py +19 -0
  76. dslighting/utils/defaults.py +151 -0
  77. dslighting-1.3.9.dist-info/METADATA +554 -0
  78. dslighting-1.3.9.dist-info/RECORD +80 -0
  79. dslighting-1.3.9.dist-info/WHEEL +5 -0
  80. dslighting-1.3.9.dist-info/top_level.txt +2 -0
@@ -0,0 +1,422 @@
1
+ """
2
+ Data loading and preprocessing.
3
+
4
+ This module provides a unified interface for loading data from various sources
5
+ with automatic type detection and validation.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+ import pandas as pd
14
+
15
+ from dslighting.core.task_detector import TaskDetector, TaskDetection
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class LoadedData:
    """
    Container for loaded data with metadata.

    Attributes:
        source: Original data source (path, DataFrame, etc.)
        data_dir: Data directory path (for file-based sources)
        task_detection: Detected task information
        task_id: Task/Competition ID (extracted from path)
        metadata: Additional metadata
    """
    source: Any
    data_dir: Optional[Path] = None
    # Forward reference (string) keeps the dataclass definable even when the
    # detector module is not importable at annotation-evaluation time.
    task_detection: Optional["TaskDetection"] = None
    task_id: Optional[str] = None
    # Optional because None is accepted and normalized to {} in __post_init__
    # (avoids the mutable-default-argument pitfall for dataclass fields).
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        # Normalize a missing metadata mapping to an empty dict so callers can
        # always treat it as a dict.
        if self.metadata is None:
            self.metadata = {}

    def get_description(self) -> str:
        """Return the detected task description, or "" if no detection exists."""
        if self.task_detection:
            return self.task_detection.description
        return ""

    def get_io_instructions(self) -> str:
        """Return the detected I/O instructions, or "" if no detection exists."""
        if self.task_detection:
            return self.task_detection.io_instructions
        return ""

    def get_recommended_workflow(self) -> str:
        """Return the recommended workflow name ("aide" if no detection exists)."""
        if self.task_detection:
            return self.task_detection.recommended_workflow
        return "aide"

    def get_task_type(self) -> str:
        """Return the detected task type ("datasci" if no detection exists)."""
        if self.task_detection:
            return self.task_detection.task_type
        return "datasci"
65
+
66
+
67
class DataLoader:
    """
    Load data from various sources with optional automatic type detection.

    By default, all data is treated as MLE format (prepared/public & prepared/private).
    Set auto_detect=True to enable automatic task type detection.

    Supported sources:
    - File paths (CSV, JSON, parquet)
    - Directory paths (competition layout)
    - Pandas DataFrames
    - Dict/question strings (QA tasks)
    """

    def __init__(self, auto_detect: bool = False):
        """
        Initialize DataLoader.

        Args:
            auto_detect: If True, automatically detect task type.
                If False (default), treat all data as MLE format.
        """
        self.detector = TaskDetector()
        self.auto_detect = auto_detect
        self.logger = logger

    def load(
        self,
        source: Union[str, Path, pd.DataFrame, dict],
        auto_detect: Optional[bool] = None,
        **kwargs
    ) -> "LoadedData":
        """
        Load data from a source.

        Args:
            source: Data source (path, DataFrame, dict, etc.)
            auto_detect: Override the instance's auto_detect setting.
                If None (default), use instance setting.
            **kwargs: Additional parameters (stored in LoadedData.metadata)

        Returns:
            LoadedData with detection information
        """
        self.logger.info(f"Loading data from source: {type(source).__name__}")

        # Per-call override falls back to the instance-level setting.
        should_auto_detect = auto_detect if auto_detect is not None else self.auto_detect

        # Detect task type, or assume the MLE competition layout by default.
        if should_auto_detect:
            task_detection = self.detector.detect(source)
        else:
            task_detection = self._get_default_mle_detection(source)

        # Resolve the data directory and a task id derived from the path.
        data_dir = self._extract_data_dir(source, task_detection)
        task_id = self._extract_task_id(source, data_dir)

        loaded_data = LoadedData(
            source=source,
            data_dir=data_dir,
            task_detection=task_detection,
            task_id=task_id,
            metadata=kwargs
        )

        self.logger.info(
            f"Loaded data: task_type={task_detection.task_type}, "
            f"workflow={task_detection.recommended_workflow}"
        )

        return loaded_data

    def load_csv(
        self,
        path: Union[str, Path],
        **kwargs
    ) -> "LoadedData":
        """
        Load data from a CSV file.

        Args:
            path: Path to CSV file
            **kwargs: Additional parameters passed to pd.read_csv
                (note: they are NOT forwarded to load())

        Returns:
            LoadedData with DataFrame and detection

        Raises:
            Exception: re-raises any pd.read_csv failure after logging it.
        """
        path = Path(path)
        self.logger.info(f"Loading CSV file: {path}")

        try:
            df = pd.read_csv(path, **kwargs)
            return self.load(df)
        except Exception as e:
            self.logger.error(f"Failed to load CSV file {path}: {e}")
            raise

    def load_directory(
        self,
        path: Union[str, Path],
        **kwargs
    ) -> "LoadedData":
        """
        Load data from a directory.

        Args:
            path: Path to directory
            **kwargs: Additional parameters forwarded to load()

        Returns:
            LoadedData with directory and detection

        Raises:
            ValueError: if path is not an existing directory.
        """
        path = Path(path)
        self.logger.info(f"Loading directory: {path}")

        if not path.is_dir():
            raise ValueError(f"Not a directory: {path}")

        return self.load(path, **kwargs)

    def load_dataframe(
        self,
        df: pd.DataFrame,
        **kwargs
    ) -> "LoadedData":
        """
        Load data from a pandas DataFrame.

        Args:
            df: Pandas DataFrame
            **kwargs: Additional parameters forwarded to load()

        Returns:
            LoadedData with DataFrame and detection
        """
        self.logger.info(f"Loading DataFrame with shape {df.shape}")
        return self.load(df, **kwargs)

    def load_competition(
        self,
        competition_id: str,
        data_dir: Optional[Union[str, Path]] = None,
        **kwargs
    ) -> "LoadedData":
        """
        Load data from a competition (MLE-Bench style).

        Args:
            competition_id: Competition identifier
            data_dir: Base data directory containing competitions;
                defaults to ./data/competitions
            **kwargs: Additional parameters forwarded to load()

        Returns:
            LoadedData with competition data

        Raises:
            ValueError: if the competition directory does not exist.
        """
        self.logger.info(f"Loading competition: {competition_id}")

        # Default to data/competitions directory
        if data_dir is None:
            data_dir = Path("data/competitions") / competition_id
        else:
            data_dir = Path(data_dir) / competition_id

        if not data_dir.exists():
            raise ValueError(f"Competition directory not found: {data_dir}")

        return self.load(data_dir, **kwargs)

    def load_question(
        self,
        question: str,
        **kwargs
    ) -> "LoadedData":
        """
        Load a QA question.

        Args:
            question: Question text
            **kwargs: Additional parameters forwarded to load()

        Returns:
            LoadedData with question
        """
        self.logger.info("Loading QA question")
        return self.load(question, **kwargs)

    def _read_description(self, directory: Path) -> Optional[str]:
        """
        Best-effort read of directory/description.md.

        Returns the file contents, or None when the file is missing or
        unreadable (an unreadable description is not fatal).
        """
        desc_file = directory / "description.md"
        if desc_file.exists():
            try:
                text = desc_file.read_text(encoding='utf-8')
                self.logger.info(f"Loaded description from {desc_file}")
                return text
            except Exception:
                # Swallow read/decoding errors on purpose: the caller keeps
                # its default description instead.
                pass
        return None

    def _get_default_mle_detection(self, source: Any) -> "TaskDetection":
        """
        Get default MLE competition detection for any data source.

        This method treats all data as MLE format (prepared/public & prepared/private).
        It extracts the data directory if available.

        Args:
            source: Data source (path, DataFrame, dict, etc.)

        Returns:
            TaskDetection configured for MLE competition
        """
        self.logger.info("Using default MLE competition format (prepared/public & prepared/private)")

        data_dir = None
        description = "MLE competition task"

        if isinstance(source, (str, Path)):
            path = Path(source).resolve()  # Convert to absolute path
            self.logger.info(f"Resolved path: {path}")

            if path.exists():
                if path.is_dir():
                    data_dir = path
                    self.logger.info(f"Data directory found: {data_dir}")
                elif path.is_file():
                    data_dir = path.parent
                    self.logger.info(f"Data directory (from file parent): {data_dir}")
                if data_dir is not None:
                    loaded = self._read_description(data_dir)
                    if loaded is not None:
                        description = loaded
            else:
                self.logger.warning(f"Path does not exist: {path}")

                # The source may be a bare competition id: search common
                # data roots for a matching directory.
                competition_id = path.name

                search_locations = [
                    # Current project: ./data/competitions/
                    Path.cwd() / "data" / "competitions" / competition_id,
                    # Parent dslighting: ../dslighting/data/competitions/
                    Path.cwd().parent / "dslighting" / "data" / "competitions" / competition_id,
                    # Parent data: ../data/competitions/
                    Path.cwd().parent / "data" / "competitions" / competition_id,
                    # From package location: ../../data/competitions/
                    Path(__file__).parent.parent.parent / "data" / "competitions" / competition_id,
                    # NOTE(review): hard-coded developer-machine path; kept for
                    # backward compatibility, but should be removed or made
                    # configurable.
                    Path("/Users/liufan/Applications/Github/dslighting/data/competitions") / competition_id,
                ]

                for location in search_locations:
                    self.logger.info(f" Trying: {location}")
                    if location.exists() and location.is_dir():
                        data_dir = location
                        self.logger.info(f" ✓ Found data at: {data_dir}")
                        break

                if data_dir is None:
                    # Last resort: use the original resolved path
                    self.logger.warning(f" Could not find data, using original path: {path}")
                    data_dir = path

                # Try to load description (if data_dir was found)
                if data_dir and data_dir.exists():
                    loaded = self._read_description(data_dir)
                    if loaded is not None:
                        description = loaded

        # Imported lazily to avoid a module-level import cycle with defaults.
        from dslighting.utils.defaults import WORKFLOW_RECOMMENDATIONS

        return TaskDetection(
            task_type="kaggle",  # MLE uses kaggle task type internally
            task_mode="standard_ml",
            data_dir=data_dir,
            description=description,
            io_instructions="Train a model and generate predictions for the test set.",
            recommended_workflow=WORKFLOW_RECOMMENDATIONS.get("kaggle_competition", {}).get("default", "aide"),
            confidence=1.0,  # High confidence since this is explicit user intent
            metadata={"structure": "mle_competition", "auto_detected": False}
        )

    def _extract_data_dir(
        self,
        source: Any,
        detection: "TaskDetection"
    ) -> Optional[Path]:
        """
        Extract data directory from source and detection.

        Args:
            source: Original data source
            detection: Task detection result

        Returns:
            Path to data directory or None
        """
        # Detection result wins when it already carries a data_dir.
        if detection.data_dir:
            return detection.data_dir

        # Fall back to the source path itself (file -> its parent dir).
        if isinstance(source, (str, Path)):
            path = Path(source)
            if path.is_file():
                return path.parent
            elif path.is_dir():
                return path

        # Non-path source with no detected directory.
        return None

    def _extract_task_id(
        self,
        source: Any,
        data_dir: Optional[Path]
    ) -> Optional[str]:
        """
        Extract task/competition ID from source path.

        Args:
            source: Original data source
            data_dir: Detected data directory

        Returns:
            Task ID (e.g., "bike-sharing-demand") or None
        """
        # Prefer the source path: the last directory name is the task id.
        if isinstance(source, (str, Path)):
            path = Path(source)

            # If it's a file, use parent directory name
            if path.is_file():
                return path.parent.name

            # If it's a directory, use its name
            if path.is_dir():
                return path.name

        # Otherwise fall back to the detected data directory's name.
        if data_dir:
            return data_dir.name

        # No task_id found
        return None