ai-data-science-team 0.0.0.9009__py3-none-any.whl → 0.0.0.9011__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +1 -0
- ai_data_science_team/agents/data_cleaning_agent.py +6 -6
- ai_data_science_team/agents/data_loader_tools_agent.py +272 -0
- ai_data_science_team/agents/data_visualization_agent.py +6 -7
- ai_data_science_team/agents/data_wrangling_agent.py +6 -6
- ai_data_science_team/agents/feature_engineering_agent.py +6 -6
- ai_data_science_team/agents/sql_database_agent.py +6 -6
- ai_data_science_team/ml_agents/__init__.py +1 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +206 -385
- ai_data_science_team/ml_agents/h2o_ml_tools_agent.py +0 -0
- ai_data_science_team/ml_agents/mlflow_tools_agent.py +350 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +3 -4
- ai_data_science_team/parsers/__init__.py +0 -0
- ai_data_science_team/{tools → parsers}/parsers.py +0 -1
- ai_data_science_team/templates/agent_templates.py +6 -6
- ai_data_science_team/tools/data_loader.py +448 -0
- ai_data_science_team/tools/dataframe.py +139 -0
- ai_data_science_team/tools/h2o.py +643 -0
- ai_data_science_team/tools/mlflow.py +961 -0
- ai_data_science_team/tools/{metadata.py → sql.py} +1 -137
- {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/METADATA +40 -19
- ai_data_science_team-0.0.0.9011.dist-info/RECORD +36 -0
- ai_data_science_team-0.0.0.9009.dist-info/RECORD +0 -28
- /ai_data_science_team/{tools → utils}/logging.py +0 -0
- /ai_data_science_team/{tools → utils}/regex.py +0 -0
- {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/ai_data_science_team/tools/data_loader.py
@@ -0,0 +1,448 @@
+
+from langchain.tools import tool
+from langgraph.prebuilt import InjectedState
+
+import pandas as pd
+import os
+
+from typing import Tuple, List, Dict, Optional, Annotated
+
+
+@tool(response_format='content_and_artifact')
+def load_directory(
+    directory_path: str = os.getcwd(),
+    file_type: Optional[str] = None
+) -> Tuple[str, Dict]:
+    """
+    Tool: load_directory
+    Description: Loads all recognized tabular files in a directory.
+                 If file_type is specified (e.g., 'csv'), only files
+                 with that extension are loaded.
+
+    Parameters:
+    ----------
+    directory_path : str
+        The path to the directory to load. Defaults to the current working directory.
+
+    file_type : str, optional
+        The extension of the file type you want to load exclusively
+        (e.g., 'csv', 'xlsx', 'parquet'). If None or not provided,
+        attempts to load all recognized tabular files.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of data frames.
+    """
+    print(f" * Tool: load_directory | {directory_path}")
+
+    import os
+    import pandas as pd
+
+    if directory_path is None:
+        return "No directory path provided.", {}
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", {}
+
+    data_frames = {}
+
+    for filename in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, filename)
+
+        # Skip directories
+        if os.path.isdir(file_path):
+            continue
+
+        # If file_type is specified, only process files that match.
+        if file_type:
+            # Make sure extension check is case-insensitive
+            if not filename.lower().endswith(f".{file_type.lower()}"):
+                continue
+
+        try:
+            # Attempt to auto-detect and load the file
+            data_frames[filename] = auto_load_file(file_path).to_dict()
+        except Exception as e:
+            # If loading fails, record the error message
+            data_frames[filename] = f"Error loading file: {e}"
+
+    return (
+        f"Returned the following data frames: {list(data_frames.keys())}",
+        data_frames
+    )
+
+
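Since each tool here is registered with `@tool(response_format='content_and_artifact')`, a call yields a human-readable message plus a machine-usable artifact. A minimal sketch of a direct call (the `data/` path is hypothetical; `.func` reaches the undecorated function so both tuple members come back; note also that the `os.getcwd()` default is evaluated once, at import time):

```python
from ai_data_science_team.tools.data_loader import load_directory

# Hypothetical path; each value in the artifact dict is a DataFrame
# serialized with .to_dict(), keyed by filename.
content, artifact = load_directory.func(directory_path="data", file_type="csv")
print(content)  # "Returned the following data frames: [...]"
```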
+@tool(response_format='content_and_artifact')
+def load_file(file_path: str) -> Tuple[str, Dict]:
+    """
+    Automatically loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of the data frame.
+    """
+    print(f" * Tool: load_file | {file_path}")
+    return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_contents(
+    directory_path: str = os.getcwd(),
+    show_hidden: bool = False
+) -> Tuple[List[str], List[Dict]]:
+    """
+    Tool: list_directory_contents
+    Description: Lists all files and folders in the specified directory.
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+    Returns:
+        tuple:
+            - content (list[str]): A list of filenames/folders (suitable for display)
+            - artifact (list[dict]): A list of dictionaries where each dict includes
+              the keys {"filename": <name>, "type": <'file' or 'directory'>}.
+              This structure can be easily converted to a pandas DataFrame.
+    """
+    print(f" * Tool: list_directory_contents | {directory_path}")
+    import os
+
+    if directory_path is None:
+        return "No directory path provided.", []
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", []
+
+    items = []
+    for item in os.listdir(directory_path):
+        # If show_hidden is False, skip items starting with '.'
+        if not show_hidden and item.startswith('.'):
+            continue
+        items.append(item)
+    items.reverse()
+
+    # content: just the raw list of item names (files/folders).
+    content = items.copy()
+
+    content.append(f"Total items: {len(items)}")
+    content.append(f"Directory: {directory_path}")
+
+    # artifact: list of dicts with both "filename" and "type" keys.
+    artifact = []
+    for item in items:
+        item_path = os.path.join(directory_path, item)
+        artifact.append({
+            "filename": item,
+            "type": "directory" if os.path.isdir(item_path) else "file"
+        })
+
+    return content, artifact
+
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_recursive(
+    directory_path: str = os.getcwd(),
+    show_hidden: bool = False
+) -> Tuple[str, List[Dict]]:
+    """
+    Tool: list_directory_recursive
+    Description:
+        Recursively lists all files and folders within the specified directory.
+        Returns a two-tuple:
+          (1) A human-readable tree representation of the directory (content).
+          (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+
+    Returns:
+        Tuple[str, List[dict]]:
+            content: A multiline string showing the directory tree.
+            artifact: A list of dictionaries, each with information about a file or directory.
+
+    Example:
+        content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+    """
+    print(f" * Tool: list_directory_recursive | {directory_path}")
+
+    # We'll store two things as we recurse:
+    # 1) lines for building the "tree" string
+    # 2) records in a list of dicts for easy DataFrame creation
+    import os
+
+    if directory_path is None:
+        return "No directory path provided.", {}
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", {}
+
+    lines = []
+    records = []
+
+    def recurse(path: str, indent_level: int = 0):
+        # List items in the current directory
+        try:
+            items = os.listdir(path)
+        except PermissionError:
+            # If we don't have permission to read the directory, just note it.
+            lines.append(" " * indent_level + "[Permission Denied]")
+            return
+
+        # Sort items for a consistent order (optional)
+        items.sort()
+
+        for item in items:
+            if not show_hidden and item.startswith('.'):
+                continue
+
+            full_path = os.path.join(path, item)
+            # Build an indented prefix for the tree
+            prefix = " " * indent_level
+
+            if os.path.isdir(full_path):
+                # Directory
+                lines.append(f"{prefix}{item}/")
+                records.append({
+                    "type": "directory",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+                # Recursively descend
+                recurse(full_path, indent_level + 1)
+            else:
+                # File
+                lines.append(f"{prefix}- {item}")
+                records.append({
+                    "type": "file",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+
+    # Kick off recursion
+    if os.path.isdir(directory_path):
+        # Add the top-level directory to lines/records if you like
+        dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+        lines.append(f"{dir_name}/")  # Show the root as well
+        records.append({
+            "type": "directory",
+            "name": dir_name,
+            "parent_path": os.path.dirname(directory_path),
+            "absolute_path": os.path.abspath(directory_path)
+        })
+        recurse(directory_path, indent_level=1)
+    else:
+        # If the given path is not a directory, just return a note
+        lines.append(f"{directory_path} is not a directory.")
+        records.append({
+            "type": "error",
+            "name": directory_path,
+            "parent_path": None,
+            "absolute_path": os.path.abspath(directory_path)
+        })
+
+    # content: multiline string with the entire tree
+    content = "\n".join(lines)
+    # artifact: list of dicts, easily converted into a DataFrame
+    artifact = records
+
+    return content, artifact
+
+
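The docstring's claim that the artifact "can be easily converted into a DataFrame" amounts to a single `pd.DataFrame` call over the records. A minimal sketch (the `my_project` path is hypothetical):

```python
import pandas as pd
from ai_data_science_team.tools.data_loader import list_directory_recursive

# .func bypasses the tool wrapper to get the raw (content, artifact) tuple.
content, artifact = list_directory_recursive.func("my_project", show_hidden=False)
print(content)               # indented tree, root shown as "my_project/"
df = pd.DataFrame(artifact)  # columns: type, name, parent_path, absolute_path
```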
+@tool(response_format='content_and_artifact')
+def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+    """
+    Tool: get_file_info
+    Description: Retrieves metadata (size, modification time, etc.) about a file.
+        Returns a tuple (content, artifact):
+          - content (str): A textual summary of the file info.
+          - artifact (List[Dict]): A list with a single dictionary of file metadata.
+            Useful for direct conversion into a DataFrame.
+    Args:
+        file_path (str): The path of the file to inspect.
+    Returns:
+        Tuple[str, List[dict]]:
+            content: Summary text
+            artifact: A list[dict] of file metadata
+    Example:
+        content, artifact = get_file_info("/path/to/mydata.csv")
+    """
+    print(f" * Tool: get_file_info | {file_path}")
+
+    # Ensure the file exists
+    import os
+    import time
+
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+    file_stats = os.stat(file_path)
+
+    # Construct the data dictionary
+    file_data = {
+        "file_name": os.path.basename(file_path),
+        "size_bytes": file_stats.st_size,
+        "modification_time": time.ctime(file_stats.st_mtime),
+        "absolute_path": os.path.abspath(file_path),
+    }
+
+    # Create a user-friendly summary (content)
+    content_str = (
+        f"File Name: {file_data['file_name']}\n"
+        f"Size (bytes): {file_data['size_bytes']}\n"
+        f"Last Modified: {file_data['modification_time']}\n"
+        f"Absolute Path: {file_data['absolute_path']}"
+    )
+
+    # Artifact should be a list of dict(s) to easily convert to DataFrame
+    artifact = [file_data]
+
+    return content_str, artifact
+
+
+@tool(response_format='content_and_artifact')
+def search_files_by_pattern(
+    directory_path: str = os.getcwd(),
+    pattern: str = "*.csv",
+    recursive: bool = False
+) -> Tuple[str, List[Dict]]:
+    """
+    Tool: search_files_by_pattern
+    Description:
+        Searches for files (optionally in subdirectories) that match a given
+        wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+          (1) content (str): A multiline summary of the matched files.
+          (2) artifact (List[Dict]): A list of dicts with file path info.
+
+    Args:
+        directory_path (str): Directory path to start searching from.
+        pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+        recursive (bool): Whether to search in subdirectories. Default is False.
+
+    Returns:
+        Tuple[str, List[Dict]]:
+            content: A user-friendly string showing matched file paths.
+            artifact: A list of dictionaries, each representing a matched file.
+
+    Example:
+        content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+    """
+    print(f" * Tool: search_files_by_pattern | {directory_path}")
+
+    import os
+    import fnmatch
+
+    matched_files = []
+    if recursive:
+        for root, dirs, files in os.walk(directory_path):
+            for filename in files:
+                if fnmatch.fnmatch(filename, pattern):
+                    matched_files.append(os.path.join(root, filename))
+    else:
+        # Non-recursive
+        for filename in os.listdir(directory_path):
+            full_path = os.path.join(directory_path, filename)
+            if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                matched_files.append(full_path)
+
+    # Create a human-readable summary (content)
+    if matched_files:
+        lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+        for f in matched_files:
+            lines.append(f" - {f}")
+        content = "\n".join(lines)
+    else:
+        content = f"No files found matching '{pattern}'."
+
+    # Create artifact as a list of dicts for DataFrame conversion
+    artifact = [{"file_path": path} for path in matched_files]
+
+    return content, artifact
+
+
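Because `search_files_by_pattern` returns its matches as `{"file_path": ...}` dicts, the artifact pairs naturally with `load_file` above. A minimal sketch, assuming a hypothetical `data/` folder:

```python
from ai_data_science_team.tools.data_loader import search_files_by_pattern, load_file

content, artifact = search_files_by_pattern.func("data", pattern="*.csv", recursive=True)
print(content)  # "Found N file(s) matching '*.csv': ..."

# Feed each match to load_file; each frame_dict converts back via pd.DataFrame.
for rec in artifact:
    msg, frame_dict = load_file.func(rec["file_path"])
    print(msg)
```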
+# Loaders
+
+def auto_load_file(file_path: str) -> pd.DataFrame:
+    """
+    Auto loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    pd.DataFrame
+    """
+    import pandas as pd
+    try:
+        ext = file_path.split(".")[-1].lower()
+        if ext == "csv":
+            return load_csv(file_path)
+        elif ext in ["xlsx", "xls"]:
+            return load_excel(file_path)
+        elif ext == "json":
+            return load_json(file_path)
+        elif ext == "parquet":
+            return load_parquet(file_path)
+        elif ext == "pkl":
+            return load_pickle(file_path)
+        else:
+            return f"Unsupported file extension: {ext}"
+    except Exception as e:
+        return f"Error loading file: {e}"
+
+def load_csv(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_csv
+    Description: Loads a CSV file into a pandas DataFrame.
+    Args:
+        file_path (str): Path to the CSV file.
+    Returns:
+        pd.DataFrame
+    """
+    import pandas as pd
+    return pd.read_csv(file_path)
+
+def load_excel(file_path: str, sheet_name=None) -> pd.DataFrame:
+    """
+    Tool: load_excel
+    Description: Loads an Excel file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_excel(file_path, sheet_name=sheet_name)
+
+def load_json(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_json
+    Description: Loads a JSON file or NDJSON into a pandas DataFrame.
+    """
+    import pandas as pd
+    # For simple JSON arrays
+    return pd.read_json(file_path, orient="records", lines=False)
+
+def load_parquet(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_parquet
+    Description: Loads a Parquet file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_parquet(file_path)
+
+def load_pickle(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_pickle
+    Description: Loads a Pickle file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_pickle(file_path)
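Per the file list above, these tools back the new `data_loader_tools_agent.py`. What follows is only a hedged sketch of one way they could be bound to a LangGraph ReAct agent; the model choice and the prompt are assumptions, not the package's actual wiring:

```python
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent

from ai_data_science_team.tools.data_loader import (
    load_directory,
    load_file,
    list_directory_contents,
    list_directory_recursive,
    get_file_info,
    search_files_by_pattern,
)

# Assumption: any tool-calling chat model will do; gpt-4o-mini is illustrative.
agent = create_react_agent(
    ChatOpenAI(model="gpt-4o-mini"),
    tools=[
        load_directory,
        load_file,
        list_directory_contents,
        list_directory_recursive,
        get_file_info,
        search_files_by_pattern,
    ],
)

result = agent.invoke(
    {"messages": [("user", "List the CSV files in the data folder and load them.")]}
)
```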
--- /dev/null
+++ b/ai_data_science_team/tools/dataframe.py
@@ -0,0 +1,139 @@
+import io
+import pandas as pd
+from typing import Union, List, Dict
+
+def get_dataframe_summary(
+    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+    n_sample: int = 30,
+    skip_stats: bool = False,
+) -> List[str]:
+    """
+    Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
+    or a dictionary mapping names to DataFrames.
+
+    Parameters
+    ----------
+    dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+        - Single DataFrame: produce a single summary (returned within a one-element list).
+        - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+        - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+    n_sample : int, default 30
+        Number of rows to display in the "Data (first 30 rows)" section.
+    skip_stats : bool, default False
+        If True, skip the descriptive statistics and DataFrame info sections.
+
+    Example:
+    --------
+    ``` python
+    import pandas as pd
+    from sklearn.datasets import load_iris
+    data = load_iris(as_frame=True)
+    dataframes = {
+        "iris": data.frame,
+        "iris_target": data.target,
+    }
+    summaries = get_dataframe_summary(dataframes)
+    print(summaries[0])
+    ```
+
+    Returns
+    -------
+    list of str
+        A list of summaries, one for each provided DataFrame. Each summary includes:
+        - Shape of the DataFrame (rows, columns)
+        - Column data types
+        - Missing value percentage
+        - Unique value counts
+        - First 30 rows
+        - Descriptive statistics
+        - DataFrame info output
+    """
+
+    summaries = []
+
+    # --- Dictionary Case ---
+    if isinstance(dataframes, dict):
+        for dataset_name, df in dataframes.items():
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    # --- Single DataFrame Case ---
+    elif isinstance(dataframes, pd.DataFrame):
+        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+    # --- List of DataFrames Case ---
+    elif isinstance(dataframes, list):
+        for idx, df in enumerate(dataframes):
+            dataset_name = f"Dataset_{idx}"
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    else:
+        raise TypeError(
+            "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+        )
+
+    return summaries
+
+
+def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+    """Generate a summary string for a single DataFrame."""
+    # 1. Convert dictionary-type cells to strings
+    #    This prevents unhashable dict errors during df.nunique().
+    df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+    # 2. Capture df.info() output
+    buffer = io.StringIO()
+    df.info(buf=buffer)
+    info_text = buffer.getvalue()
+
+    # 3. Calculate missing value stats
+    missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+    missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+    # 4. Get column data types
+    column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+    # 5. Get unique value counts
+    unique_counts = df.nunique()  # Will no longer fail on unhashable dict
+    unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+    # 6. Generate the summary text
+    if not skip_stats:
+        summary_text = f"""
+Dataset Name: {dataset_name}
+----------------------------
+Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+Column Data Types:
+{column_types}
+
+Missing Value Percentage:
+{missing_summary}
+
+Unique Value Counts:
+{unique_counts_summary}
+
+Data (first {n_sample} rows):
+{df.head(n_sample).to_string()}
+
+Data Description:
+{df.describe().to_string()}
+
+Data Info:
+{info_text}
+"""
+    else:
+        summary_text = f"""
+Dataset Name: {dataset_name}
+----------------------------
+Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+Column Data Types:
+{column_types}
+
+Data (first {n_sample} rows):
+{df.head(n_sample).to_string()}
+"""
+
+    return summary_text.strip()
+
+
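A quick sketch of the single-DataFrame path, which, as the docstring notes, comes back as a one-element list:

```python
import pandas as pd
from ai_data_science_team.tools.dataframe import get_dataframe_summary

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "b"]})

# skip_stats=True trims the summary to shape, dtypes, and the head sample.
print(get_dataframe_summary(df, n_sample=5, skip_stats=True)[0])
```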