dslighting-1.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsat/__init__.py +3 -0
- dsat/benchmark/__init__.py +1 -0
- dsat/benchmark/benchmark.py +168 -0
- dsat/benchmark/datasci.py +291 -0
- dsat/benchmark/mle.py +777 -0
- dsat/benchmark/sciencebench.py +304 -0
- dsat/common/__init__.py +0 -0
- dsat/common/constants.py +11 -0
- dsat/common/exceptions.py +48 -0
- dsat/common/typing.py +19 -0
- dsat/config.py +79 -0
- dsat/models/__init__.py +3 -0
- dsat/models/candidates.py +16 -0
- dsat/models/formats.py +52 -0
- dsat/models/task.py +64 -0
- dsat/operators/__init__.py +0 -0
- dsat/operators/aflow_ops.py +90 -0
- dsat/operators/autokaggle_ops.py +170 -0
- dsat/operators/automind_ops.py +38 -0
- dsat/operators/base.py +22 -0
- dsat/operators/code.py +45 -0
- dsat/operators/dsagent_ops.py +123 -0
- dsat/operators/llm_basic.py +84 -0
- dsat/prompts/__init__.py +0 -0
- dsat/prompts/aflow_prompt.py +76 -0
- dsat/prompts/aide_prompt.py +52 -0
- dsat/prompts/autokaggle_prompt.py +290 -0
- dsat/prompts/automind_prompt.py +29 -0
- dsat/prompts/common.py +51 -0
- dsat/prompts/data_interpreter_prompt.py +82 -0
- dsat/prompts/dsagent_prompt.py +88 -0
- dsat/runner.py +554 -0
- dsat/services/__init__.py +0 -0
- dsat/services/data_analyzer.py +387 -0
- dsat/services/llm.py +486 -0
- dsat/services/llm_single.py +421 -0
- dsat/services/sandbox.py +386 -0
- dsat/services/states/__init__.py +0 -0
- dsat/services/states/autokaggle_state.py +43 -0
- dsat/services/states/base.py +14 -0
- dsat/services/states/dsa_log.py +13 -0
- dsat/services/states/experience.py +237 -0
- dsat/services/states/journal.py +153 -0
- dsat/services/states/operator_library.py +290 -0
- dsat/services/vdb.py +76 -0
- dsat/services/workspace.py +178 -0
- dsat/tasks/__init__.py +3 -0
- dsat/tasks/handlers.py +376 -0
- dsat/templates/open_ended/grade_template.py +107 -0
- dsat/tools/__init__.py +4 -0
- dsat/utils/__init__.py +0 -0
- dsat/utils/context.py +172 -0
- dsat/utils/dynamic_import.py +71 -0
- dsat/utils/parsing.py +33 -0
- dsat/workflows/__init__.py +12 -0
- dsat/workflows/base.py +53 -0
- dsat/workflows/factory.py +439 -0
- dsat/workflows/manual/__init__.py +0 -0
- dsat/workflows/manual/autokaggle_workflow.py +148 -0
- dsat/workflows/manual/data_interpreter_workflow.py +153 -0
- dsat/workflows/manual/deepanalyze_workflow.py +484 -0
- dsat/workflows/manual/dsagent_workflow.py +76 -0
- dsat/workflows/search/__init__.py +0 -0
- dsat/workflows/search/aflow_workflow.py +344 -0
- dsat/workflows/search/aide_workflow.py +283 -0
- dsat/workflows/search/automind_workflow.py +237 -0
- dsat/workflows/templates/__init__.py +0 -0
- dsat/workflows/templates/basic_kaggle_loop.py +71 -0
- dslighting/__init__.py +170 -0
- dslighting/core/__init__.py +13 -0
- dslighting/core/agent.py +646 -0
- dslighting/core/config_builder.py +318 -0
- dslighting/core/data_loader.py +422 -0
- dslighting/core/task_detector.py +422 -0
- dslighting/utils/__init__.py +19 -0
- dslighting/utils/defaults.py +151 -0
- dslighting-1.3.9.dist-info/METADATA +554 -0
- dslighting-1.3.9.dist-info/RECORD +80 -0
- dslighting-1.3.9.dist-info/WHEEL +5 -0
- dslighting-1.3.9.dist-info/top_level.txt +2 -0
@@ -0,0 +1,387 @@
# dsat/services/data_analyzer.py

import logging
import traceback
from pathlib import Path
import pandas as pd
from typing import Optional, List
from itertools import islice

from dsat.models.task import TaskType

logger = logging.getLogger(__name__)


def generate_file_tree(
    start_path: Path,
    max_depth: int = 5,
    max_files: int = 100,
    max_items_per_dir: int = 20,
    display_root_name: Optional[str] = None
) -> str:
    """
    Generates a textual representation of the file tree with intelligent truncation.

    This version prevents a single large directory from consuming the entire file limit,
    and it filters out common noise files.

    Args:
        start_path: The root directory to start the tree from.
        max_depth: The maximum depth to traverse into directories.
        max_files: The global limit for the total number of files to display.
        max_items_per_dir: The maximum number of items (files and dirs) to show per directory.
        display_root_name: An optional name to display for the root directory.
    """
    tree = []
    start_path = Path(start_path)
    if not start_path.exists():
        return f"Directory not found: {start_path}"


    base_name = display_root_name if display_root_name is not None else start_path.name
    file_count = 0
    global_limit_reached = False

    def _walk(path: Path, prefix: str, depth: int):
        nonlocal file_count, global_limit_reached

        if depth > max_depth or global_limit_reached:
            return

        try:
            # Avoid listing huge directories fully (e.g., image folders).
            # We only sample a small number of entries for display.
            sampled = list(islice(path.iterdir(), max_items_per_dir + 1))
        except OSError as e:
            logger.warning(f"Error reading directory {path}: {e}")
            tree.append(f"{prefix}└── [Error reading directory]")
            return

        truncated_in_dir = len(sampled) > max_items_per_dir
        if truncated_in_dir:
            display_items = sampled[: max(1, max_items_per_dir // 2)]
        else:
            display_items = sampled
        display_items = sorted(display_items, key=lambda p: p.name)

        pointers = ['├── '] * (len(display_items) - 1) + ['└── ']
        # If we truncated this directory, the last visible item is not the true last item
        if truncated_in_dir:
            pointers[-1] = '├── '

        for pointer, sub_path in zip(pointers, display_items):
            if global_limit_reached:
                return

            if not sub_path.is_dir():
                # Check global file limit *before* adding the next file
                if file_count >= max_files:
                    global_limit_reached = True
                    return
                file_count += 1

            display_name = sub_path.name + ('/' if sub_path.is_dir() else '')
            tree.append(f"{prefix}{pointer}{display_name}")

            if sub_path.is_dir():
                extension = '│   ' if pointer == '├── ' else '    '
                _walk(sub_path, prefix=prefix + extension, depth=depth + 1)

        if truncated_in_dir:
            tree.append(f"{prefix}└── [... more items truncated ...]")

    tree.append(f"{base_name}/")
    _walk(start_path, prefix="", depth=1)

    if global_limit_reached:
        tree.append(f"\n[... Truncated. Total file limit ({max_files}) reached ...]")

    return "\n".join(tree)


class DataAnalyzer:
    """
    A centralized service for analyzing input data directories and generating
    a comprehensive textual overview for the Agent.
    """
    def analyze(self, data_dir: Path, output_filename: str, task_type: Optional[TaskType] = None, optimization_context: bool = False) -> str:
        """
        Analyzes the data directory and returns a formatted overview string,
        including critical I/O instructions.
        """
        report = self.analyze_data(data_dir, task_type)
        report += self.generate_io_instructions(output_filename, optimization_context)
        return report

    def analyze_data(self, data_dir: Path, task_type: Optional[TaskType] = None) -> str:
        """
        Analyzes the data directory and returns ONLY the data report (structure, format, etc.).
        Does NOT include I/O instructions.
        """
        if not data_dir or not data_dir.exists() or not data_dir.is_dir():
            logger.error(f"Data directory issue during analysis: {data_dir}")
            return "Error: Input data directory not found, not provided, or is not a directory."

        report = "\n\n--- COMPREHENSIVE DATA REPORT ---\n\n"

        # 1. Analyze File Structure (Universal)
        report += self._analyze_structure(data_dir)

        # 2. Analyze Data Schema (tabular files)
        report += self._analyze_data_schema(data_dir)

        # 3. Task-Specific Analysis
        if task_type == "kaggle":
            submission_analysis = self._analyze_kaggle_submission_format(data_dir)
            if submission_analysis:
                report += f"## Submission Format Requirements\n{submission_analysis}\n\n"

        # I/O instructions are intentionally not added here; analyze() appends them.
        return report
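# Illustrative sketch, not part of the packaged module: calling analyze_data
# directly for a hypothetical Kaggle-style input directory.
#
#     analyzer = DataAnalyzer()
#     data_report = analyzer.analyze_data(Path("./input"), task_type="kaggle")
#
# The returned string opens with the "--- COMPREHENSIVE DATA REPORT ---" header,
# followed by the "## Directory Structure" tree, the "## Data Schema Analysis"
# tables, and (when a sample submission file is found) the
# "## Submission Format Requirements" section.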

    def generate_io_instructions(self, output_filename: str, optimization_context: bool = False) -> str:
        """
        Generate standardized I/O instructions reflecting that CWD is the input directory.
        """
        output_suffix = Path(output_filename).suffix.lower()

        input_instructions = (
            "1. **INPUT DATA:**\n"
            "   - All input files are located in the **current working directory** (./).\n"
            "   - Example: Use `pd.read_csv('train.csv')`."
        )

        if optimization_context:
            example_write = ""
            if output_suffix == ".csv":
                example_write = "   - **Example Write (Conceptual):** `final_df.to_csv(output_path.name, index=False)`"
            elif output_suffix == ".npy":
                example_write = "   - **Example Write (Conceptual):** `np.save(output_path.name, preds)`"

            output_instructions = (
                "2. **OUTPUT FILE (Dynamic Workflow Context):**\n"
                "   - Your workflow's `solve` method receives an `output_path` argument.\n"
                "   - You MUST save your final submission file using the filename derived from this argument (e.g., `output_path.name`).\n"
                "   - The file must be saved in the current working directory (./).\n"
                + (f"\n{example_write}" if example_write else "")
            )
        else:
            example_write = ""
            if output_suffix == ".csv":
                example_write = f"   - **Correct Example:** `submission_df.to_csv('{output_filename}', index=False)`"
            elif output_suffix == ".npy":
                example_write = f"   - **Correct Example:** `np.save('{output_filename}', preds)`"

            output_instructions = (
                f"2. **OUTPUT FILE:**\n"
                f"   - You MUST save your final submission file to the **current working directory** (./).\n"
                f"   - The required output filename is: `{output_filename}`\n"
                + (f"{example_write}\n" if example_write else "")
            )

        return f"""
--- CRITICAL I/O REQUIREMENTS ---

You MUST follow these file system rules precisely. Failure to do so will cause a fatal error.

{input_instructions}

{output_instructions}

**IMPORTANT:** These path requirements are non-negotiable and must be followed exactly.
"""

    def _analyze_structure(self, data_dir: Path) -> str:
        """Generates the file tree representation."""
        try:
            tree_output = generate_file_tree(data_dir, display_root_name=".")
            return f"## Directory Structure (Current Working Directory)\n```text\n{tree_output}\n```\n\n"
        except Exception as e:
            logger.error(f"Failed to generate file tree for {data_dir}: {traceback.format_exc()}")
            return f"## Directory Structure\nError analyzing structure: {traceback.format_exc()}\n\n"

    def _analyze_kaggle_submission_format(self, data_dir: Path) -> str:
        """
        Analyzes the sample submission file for Kaggle tasks and extracts a
        prescriptive schema.
        """
        sample_submission_file = self._find_sample_submission(data_dir)

        if not sample_submission_file:
            return ""

        try:
            # Read the sample submission file to get format details
            sample_df = pd.read_csv(sample_submission_file)

            # Get the first few rows and data types
            head_info = sample_df.head().to_string(index=False)
            dtypes_info = sample_df.dtypes.to_string()

            required_columns = sample_df.columns.tolist()
            columns_instruction = f"""
**Required Submission Columns:**
Your submission file MUST contain the following columns in this exact order:
```
{required_columns}
```
This is a strict requirement for the submission to be graded correctly. The grading system uses the non-prediction columns (like 'Comment' or an 'id') to match your predictions against the ground truth.
"""

            return f"""
**CRITICAL:** Your final submission file MUST EXACTLY match the format of the sample submission file provided (`{sample_submission_file.name}`).
This includes the column names, column order, and data types. Failure to adhere to this format will result in a score of zero.

{columns_instruction}

**Format Details:**
*First 5 rows:*
```text
{head_info}
```

*Data types:*
```text
{dtypes_info}
```

"""
        except Exception as e:
            # Fallback if pandas fails
            logger.warning(f"Could not read sample submission file '{sample_submission_file}' for detailed analysis: {traceback.format_exc()}")
            return f"""
**CRITICAL:** Your final submission file MUST match the format of the sample submission file: `{sample_submission_file.name}`.
(Note: Automatic format analysis failed, please inspect the file manually).
"""

    def _find_sample_submission(self, data_dir: Path) -> Optional[Path]:
        """Helper to locate the sample submission file."""
        try:
            for file in data_dir.iterdir():
                file_name_lower = file.name.lower()
                if "sample" in file_name_lower and "submission" in file_name_lower and file_name_lower.endswith(".csv"):
                    return file
        except Exception as e:
            logger.warning(f"Could not scan data directory '{data_dir}': {traceback.format_exc()}")
        return None
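# Illustrative sketch, not part of the packaged module: filenames the helper
# above would or would not match (the names are hypothetical examples).
#
#     "sample_submission.csv"      -> matched (has "sample" and "submission", ends in .csv)
#     "SampleSubmission.CSV"       -> matched (matching is case-insensitive)
#     "submission_format.csv"      -> not matched (no "sample" in the name)
#     "sample_submission.parquet"  -> not matched (not a .csv file)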

    def _analyze_data_schema(self, data_dir: Path) -> str:
        """
        Analyzes the schema of potential training and testing files to provide a
        structured overview of columns, data types, missing values, and cardinality.
        This helps the agent make better decisions about preprocessing.
        """
        report_parts = []
        # Define supported extensions and keywords for more robust discovery
        SUPPORTED_EXTENSIONS = ('.csv', '.tsv', '.parquet', '.xlsx')
        KEYWORDS = ('train', 'test', 'val', 'eval', 'sample', 'submission', 'sub', 'data')

        # Fast path: most prepared competitions keep train/test-like tables at the root.
        files_to_analyze: List[Path] = []
        try:
            root_files = [p for p in data_dir.iterdir() if p.is_file()]
        except OSError:
            root_files = []

        for p in root_files:
            if p.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue
            if any(keyword in p.stem.lower() for keyword in KEYWORDS):
                files_to_analyze.append(p)

        # Fallback: if still nothing, just take the first few supported files at root
        if not files_to_analyze and root_files:
            files_to_analyze = [p for p in root_files if p.suffix.lower() in SUPPORTED_EXTENSIONS][:3]

        # Deep-discovery fallback: bounded recursive search (avoid walking huge image folders).
        if not files_to_analyze:
            max_depth = 3
            max_dirs = 200
            max_files = 8
            per_dir_limit = 200

            queue: List[tuple[Path, int]] = [(data_dir, 0)]
            visited = 0
            while queue and visited < max_dirs and len(files_to_analyze) < max_files:
                current, depth = queue.pop(0)
                visited += 1
                if depth > max_depth:
                    continue
                try:
                    sampled = list(islice(current.iterdir(), per_dir_limit + 1))
                except OSError:
                    continue

                truncated = len(sampled) > per_dir_limit
                entries = sampled[:per_dir_limit] if truncated else sampled

                for entry in entries:
                    if entry.is_dir():
                        # If this directory is huge, avoid descending further.
                        if truncated and depth >= 1:
                            continue
                        queue.append((entry, depth + 1))
                        continue
                    if entry.suffix.lower() not in SUPPORTED_EXTENSIONS:
                        continue
                    if not any(keyword in entry.stem.lower() for keyword in KEYWORDS):
                        continue
                    files_to_analyze.append(entry)
                    if len(files_to_analyze) >= max_files:
                        break

        files_to_analyze = sorted(set(files_to_analyze))

        if not files_to_analyze:
            return ""

        max_rows = 5000
        for file_path in files_to_analyze:
            try:
                # Dynamically choose the reader based on file extension
                ext = file_path.suffix.lower()
                if ext in ['.csv', '.tsv']:
                    sep = "\t" if ext == ".tsv" else ","
                    # Try multiple encodings
                    df = None
                    for enc in ['utf-8', 'gbk', 'latin1', 'utf-8-sig']:
                        try:
                            df = pd.read_csv(file_path, sep=sep, nrows=max_rows, encoding=enc)
                            break
                        except Exception:
                            continue

                    if df is None:
                        raise Exception(f"Failed to read CSV with multiple encodings for {file_path.name}")
                elif ext == '.parquet':
                    # Note: This requires 'pyarrow' or 'fastparquet' to be installed
                    df = pd.read_parquet(file_path).head(max_rows)
                else:
                    # Skip unsupported but matched files
                    continue

                report_parts.append(f"### Analysis of `{file_path.relative_to(data_dir)}`")

                # Create a summary DataFrame
                summary = pd.DataFrame({
                    'Data Type': df.dtypes,
                    'Missing (%)': (df.isnull().sum() * 100 / len(df)).round(2),
                    'Cardinality': df.nunique(),
                })

                sample_values = [col.dropna().head(2).tolist() for _, col in df.items()]
                summary['Sample Values'] = sample_values

                # Truncate sample values for readability
                summary['Sample Values'] = summary['Sample Values'].apply(
                    lambda x: str(x) if len(str(x)) < 40 else str(x)[:37] + '...'
                )

                report_parts.append(f"```text\n{summary.to_string()}\n```")
            except Exception as e:
                logger.warning(f"Could not analyze schema for {file_path.name}: {traceback.format_exc()}")
                report_parts.append(f"### Analysis of `{file_path.relative_to(data_dir)}`\nCould not be analyzed due to error: {e}")

        if not report_parts:
            return ""

        return "## Data Schema Analysis\n" + "\n\n".join(report_parts) + "\n\n"
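# Illustrative end-to-end sketch, not part of the packaged module: driving the
# full analysis for a hypothetical Kaggle-style task. The "./input" directory,
# the "submission.csv" filename, and the plain "kaggle" string (mirroring the
# comparison in analyze_data; TaskType may be an enum) are example values.
#
#     from pathlib import Path
#     from dsat.services.data_analyzer import DataAnalyzer
#
#     analyzer = DataAnalyzer()
#     overview = analyzer.analyze(
#         data_dir=Path("./input"),
#         output_filename="submission.csv",
#         task_type="kaggle",
#     )
#
# `overview` combines the data report from analyze_data with the critical I/O
# instructions appended by generate_io_instructions.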