ai-data-science-team 0.0.0.9009__py3-none-any.whl → 0.0.0.9011__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (29)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +1 -0
  3. ai_data_science_team/agents/data_cleaning_agent.py +6 -6
  4. ai_data_science_team/agents/data_loader_tools_agent.py +272 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +6 -7
  6. ai_data_science_team/agents/data_wrangling_agent.py +6 -6
  7. ai_data_science_team/agents/feature_engineering_agent.py +6 -6
  8. ai_data_science_team/agents/sql_database_agent.py +6 -6
  9. ai_data_science_team/ml_agents/__init__.py +1 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +206 -385
  11. ai_data_science_team/ml_agents/h2o_ml_tools_agent.py +0 -0
  12. ai_data_science_team/ml_agents/mlflow_tools_agent.py +350 -0
  13. ai_data_science_team/multiagents/sql_data_analyst.py +3 -4
  14. ai_data_science_team/parsers/__init__.py +0 -0
  15. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  16. ai_data_science_team/templates/agent_templates.py +6 -6
  17. ai_data_science_team/tools/data_loader.py +448 -0
  18. ai_data_science_team/tools/dataframe.py +139 -0
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/{metadata.py → sql.py} +1 -137
  22. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/METADATA +40 -19
  23. ai_data_science_team-0.0.0.9011.dist-info/RECORD +36 -0
  24. ai_data_science_team-0.0.0.9009.dist-info/RECORD +0 -28
  25. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  26. /ai_data_science_team/{tools → utils}/regex.py +0 -0
  27. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/top_level.txt +0 -0
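
Several modules were moved or renamed in this release (items 15, 21, 25, and 26 above). A minimal import-migration sketch follows; the module paths are taken from the rename list, but whether downstream code imported them this way, and what symbols they export, are assumptions not shown in this diff:

```python
# Hedged migration sketch for the module moves listed above.
# 0.0.0.9009                              0.0.0.9011
# ai_data_science_team.tools.parsers  ->  ai_data_science_team.parsers.parsers
# ai_data_science_team.tools.metadata ->  ai_data_science_team.tools.sql
# ai_data_science_team.tools.logging  ->  ai_data_science_team.utils.logging
# ai_data_science_team.tools.regex    ->  ai_data_science_team.utils.regex

from ai_data_science_team.parsers import parsers        # was: from ai_data_science_team.tools import parsers
from ai_data_science_team.tools import sql              # was: from ai_data_science_team.tools import metadata
from ai_data_science_team.utils import logging, regex   # was: from ai_data_science_team.tools import logging, regex
```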
ai_data_science_team/tools/data_loader.py (new file)
@@ -0,0 +1,448 @@
+
+ from langchain.tools import tool
+
+ import os
+ import time
+ import fnmatch
+
+ import pandas as pd
+
+ from typing import Tuple, List, Dict, Optional
+
+
+ @tool(response_format='content_and_artifact')
+ def load_directory(
+     directory_path: Optional[str] = None,
+     file_type: Optional[str] = None
+ ) -> Tuple[str, Dict]:
+     """
+     Tool: load_directory
+     Description: Loads all recognized tabular files in a directory.
+                  If file_type is specified (e.g., 'csv'), only files
+                  with that extension are loaded.
+
+     Parameters:
+     ----------
+     directory_path : str, optional
+         The path to the directory to load. Defaults to the current working directory.
+
+     file_type : str, optional
+         The extension of the file type you want to load exclusively
+         (e.g., 'csv', 'xlsx', 'parquet'). If None or not provided,
+         attempts to load all recognized tabular files.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of data frames.
+     """
+     # Resolve the default at call time, not at import time.
+     if directory_path is None:
+         directory_path = os.getcwd()
+
+     print(f" * Tool: load_directory | {directory_path}")
+
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", {}
+
+     data_frames = {}
+
+     for filename in os.listdir(directory_path):
+         file_path = os.path.join(directory_path, filename)
+
+         # Skip directories
+         if os.path.isdir(file_path):
+             continue
+
+         # If file_type is specified, only process files that match
+         # (the extension check is case-insensitive).
+         if file_type and not filename.lower().endswith(f".{file_type.lower()}"):
+             continue
+
+         try:
+             # Attempt to auto-detect and load the file
+             data_frames[filename] = auto_load_file(file_path).to_dict()
+         except Exception as e:
+             # If loading fails, record the error message
+             data_frames[filename] = f"Error loading file: {e}"
+
+     return (
+         f"Returned the following data frames: {list(data_frames.keys())}",
+         data_frames
+     )
+
+
+ @tool(response_format='content_and_artifact')
+ def load_file(file_path: str) -> Tuple[str, Dict]:
+     """
+     Tool: load_file
+     Description: Automatically loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of the data frame.
+     """
+     print(f" * Tool: load_file | {file_path}")
+     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_contents(
+     directory_path: Optional[str] = None,
+     show_hidden: bool = False
+ ) -> Tuple[List[str], List[Dict]]:
+     """
+     Tool: list_directory_contents
+     Description: Lists all files and folders in the specified directory.
+
+     Args:
+         directory_path (str): The path of the directory to list. Defaults to
+             the current working directory.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         tuple:
+             - content (list[str]): A list of filenames/folders (suitable for display)
+             - artifact (list[dict]): A list of dictionaries where each dict includes
+               the keys {"filename": <name>, "type": <'file' or 'directory'>}.
+               This structure can be easily converted to a pandas DataFrame.
+     """
+     # Resolve the default at call time, not at import time.
+     if directory_path is None:
+         directory_path = os.getcwd()
+
+     print(f" * Tool: list_directory_contents | {directory_path}")
+
+     if not os.path.isdir(directory_path):
+         return [f"Directory not found: {directory_path}"], []
+
+     items = []
+     for item in os.listdir(directory_path):
+         # If show_hidden is False, skip items starting with '.'
+         if not show_hidden and item.startswith('.'):
+             continue
+         items.append(item)
+     items.reverse()
+
+     # content: the raw list of item names (files/folders), plus a short footer.
+     content = items.copy()
+     content.append(f"Total items: {len(items)}")
+     content.append(f"Directory: {directory_path}")
+
+     # artifact: list of dicts with both "filename" and "type" keys.
+     artifact = []
+     for item in items:
+         item_path = os.path.join(directory_path, item)
+         artifact.append({
+             "filename": item,
+             "type": "directory" if os.path.isdir(item_path) else "file"
+         })
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_recursive(
+     directory_path: Optional[str] = None,
+     show_hidden: bool = False
+ ) -> Tuple[str, List[Dict]]:
+     """
+     Tool: list_directory_recursive
+     Description:
+         Recursively lists all files and folders within the specified directory.
+         Returns a two-tuple:
+         (1) A human-readable tree representation of the directory (content).
+         (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+     Args:
+         directory_path (str): The path of the directory to list. Defaults to
+             the current working directory.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: A multiline string showing the directory tree.
+             artifact: A list of dictionaries, each with information about a file or directory.
+
+     Example:
+         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+     """
+     # Resolve the default at call time, not at import time.
+     if directory_path is None:
+         directory_path = os.getcwd()
+
+     print(f" * Tool: list_directory_recursive | {directory_path}")
+
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", []
+
+     # We store two things as we recurse:
+     # 1) lines for building the "tree" string
+     # 2) records in a list of dicts for easy DataFrame creation
+     lines = []
+     records = []
+
+     def recurse(path: str, indent_level: int = 0):
+         # List items in the current directory
+         try:
+             items = os.listdir(path)
+         except PermissionError:
+             # If we don't have permission to read the directory, just note it.
+             lines.append(" " * indent_level + "[Permission Denied]")
+             return
+
+         # Sort items for a consistent order
+         items.sort()
+
+         for item in items:
+             if not show_hidden and item.startswith('.'):
+                 continue
+
+             full_path = os.path.join(path, item)
+             # Build an indented prefix for the tree
+             prefix = " " * indent_level
+
+             if os.path.isdir(full_path):
+                 # Directory
+                 lines.append(f"{prefix}{item}/")
+                 records.append({
+                     "type": "directory",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+                 # Recursively descend
+                 recurse(full_path, indent_level + 1)
+             else:
+                 # File
+                 lines.append(f"{prefix}- {item}")
+                 records.append({
+                     "type": "file",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+
+     # Add the top-level directory to lines/records, then kick off the recursion.
+     # (The not-a-directory case was already handled by the guard above.)
+     dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+     lines.append(f"{dir_name}/")  # Show the root as well
+     records.append({
+         "type": "directory",
+         "name": dir_name,
+         "parent_path": os.path.dirname(directory_path),
+         "absolute_path": os.path.abspath(directory_path)
+     })
+     recurse(directory_path, indent_level=1)
+
+     # content: multiline string with the entire tree
+     content = "\n".join(lines)
+     # artifact: list of dicts, easily converted into a DataFrame
+     artifact = records
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+     """
+     Tool: get_file_info
+     Description: Retrieves metadata (size, modification time, etc.) about a file.
+         Returns a tuple (content, artifact):
+         - content (str): A textual summary of the file info.
+         - artifact (List[Dict]): A list with a single dictionary of file metadata,
+           useful for direct conversion into a DataFrame.
+
+     Args:
+         file_path (str): The path of the file to inspect.
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: Summary text
+             artifact: A list[dict] of file metadata
+
+     Example:
+         content, artifact = get_file_info("/path/to/mydata.csv")
+     """
+     print(f" * Tool: get_file_info | {file_path}")
+
+     # Ensure the file exists
+     if not os.path.isfile(file_path):
+         raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+     file_stats = os.stat(file_path)
+
+     # Construct the data dictionary
+     file_data = {
+         "file_name": os.path.basename(file_path),
+         "size_bytes": file_stats.st_size,
+         "modification_time": time.ctime(file_stats.st_mtime),
+         "absolute_path": os.path.abspath(file_path),
+     }
+
+     # Create a user-friendly summary (content)
+     content_str = (
+         f"File Name: {file_data['file_name']}\n"
+         f"Size (bytes): {file_data['size_bytes']}\n"
+         f"Last Modified: {file_data['modification_time']}\n"
+         f"Absolute Path: {file_data['absolute_path']}"
+     )
+
+     # Artifact is a list of dict(s) for easy DataFrame conversion
+     artifact = [file_data]
+
+     return content_str, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def search_files_by_pattern(
+     directory_path: Optional[str] = None,
+     pattern: str = "*.csv",
+     recursive: bool = False
+ ) -> Tuple[str, List[Dict]]:
+     """
+     Tool: search_files_by_pattern
+     Description:
+         Searches for files (optionally in subdirectories) that match a given
+         wildcard pattern (e.g. "*.csv", "*.xlsx"), returning a tuple:
+         (1) content (str): A multiline summary of the matched files.
+         (2) artifact (List[Dict]): A list of dicts with file path info.
+
+     Args:
+         directory_path (str): Directory path to start searching from. Defaults
+             to the current working directory.
+         pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+         recursive (bool): Whether to search in subdirectories. Default is False.
+
+     Returns:
+         Tuple[str, List[Dict]]:
+             content: A user-friendly string showing matched file paths.
+             artifact: A list of dictionaries, each representing a matched file.
+
+     Example:
+         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+     """
+     # Resolve the default at call time, not at import time.
+     if directory_path is None:
+         directory_path = os.getcwd()
+
+     print(f" * Tool: search_files_by_pattern | {directory_path}")
+
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", []
+
+     matched_files = []
+     if recursive:
+         for root, _dirs, files in os.walk(directory_path):
+             for filename in files:
+                 if fnmatch.fnmatch(filename, pattern):
+                     matched_files.append(os.path.join(root, filename))
+     else:
+         # Non-recursive: check only the top-level directory
+         for filename in os.listdir(directory_path):
+             full_path = os.path.join(directory_path, filename)
+             if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                 matched_files.append(full_path)
+
+     # Create a human-readable summary (content)
+     if matched_files:
+         lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+         for f in matched_files:
+             lines.append(f"  - {f}")
+         content = "\n".join(lines)
+     else:
+         content = f"No files found matching '{pattern}'."
+
+     # Create artifact as a list of dicts for DataFrame conversion
+     artifact = [{"file_path": path} for path in matched_files]
+
+     return content, artifact
+
+
+ # Loaders
+
+ def auto_load_file(file_path: str) -> pd.DataFrame:
+     """
+     Auto-loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     pd.DataFrame
+
+     Raises:
+     ------
+     ValueError
+         If the file extension is not supported.
+     """
+     ext = file_path.split(".")[-1].lower()
+     if ext == "csv":
+         return load_csv(file_path)
+     elif ext in ["xlsx", "xls"]:
+         return load_excel(file_path)
+     elif ext == "json":
+         return load_json(file_path)
+     elif ext == "parquet":
+         return load_parquet(file_path)
+     elif ext == "pkl":
+         return load_pickle(file_path)
+     else:
+         # Raise instead of returning an error string, so callers such as
+         # load_directory can catch and report the failure uniformly.
+         raise ValueError(f"Unsupported file extension: {ext}")
+
+ def load_csv(file_path: str) -> pd.DataFrame:
+     """
+     Loader: load_csv
+     Description: Loads a CSV file into a pandas DataFrame.
+     Args:
+         file_path (str): Path to the CSV file.
+     Returns:
+         pd.DataFrame
+     """
+     return pd.read_csv(file_path)
+
+ def load_excel(file_path: str, sheet_name=0) -> pd.DataFrame:
+     """
+     Loader: load_excel
+     Description: Loads an Excel file into a pandas DataFrame.
+         sheet_name defaults to 0 (the first sheet) so a single DataFrame
+         is returned; sheet_name=None would return a dict of DataFrames.
+     """
+     return pd.read_excel(file_path, sheet_name=sheet_name)
+
+ def load_json(file_path: str) -> pd.DataFrame:
+     """
+     Loader: load_json
+     Description: Loads a JSON file or NDJSON into a pandas DataFrame.
+     """
+     try:
+         # For simple JSON arrays of records
+         return pd.read_json(file_path, orient="records", lines=False)
+     except ValueError:
+         # Fall back to newline-delimited JSON (NDJSON)
+         return pd.read_json(file_path, orient="records", lines=True)
+
+ def load_parquet(file_path: str) -> pd.DataFrame:
+     """
+     Loader: load_parquet
+     Description: Loads a Parquet file into a pandas DataFrame.
+     """
+     return pd.read_parquet(file_path)
+
+ def load_pickle(file_path: str) -> pd.DataFrame:
+     """
+     Loader: load_pickle
+     Description: Loads a Pickle file into a pandas DataFrame.
+     """
+     return pd.read_pickle(file_path)
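
The functions decorated with @tool above are LangChain StructuredTool objects rather than plain functions. A minimal usage sketch follows, assuming the package and langchain-core are installed; the paths are hypothetical, and the content/artifact behavior described is langchain-core's handling of response_format='content_and_artifact':

```python
# Usage sketch for the new data-loader tools; 'data/sales.csv' and 'data' are
# hypothetical paths invented for illustration.
from ai_data_science_team.tools.data_loader import load_file, search_files_by_pattern

# Invoking a content_and_artifact tool with plain arguments returns only the
# content string (the artifact is attached to a ToolMessage inside an agent loop).
content = load_file.invoke({"file_path": "data/sales.csv"})
print(content)  # "Returned the following data frame from this file: data/sales.csv"

# The undecorated callable (.func) returns the raw (content, artifact) tuple,
# which is convenient for testing outside an agent.
content, artifact = search_files_by_pattern.func(
    directory_path="data", pattern="*.csv", recursive=True
)
print(content)
```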
ai_data_science_team/tools/dataframe.py (new file)
@@ -0,0 +1,139 @@
+ import io
+ import pandas as pd
+ from typing import Union, List, Dict
+
+ def get_dataframe_summary(
+     dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+     n_sample: int = 30,
+     skip_stats: bool = False,
+ ) -> List[str]:
+     """
+     Generate a summary for one or more DataFrames. Accepts a single DataFrame,
+     a list of DataFrames, or a dictionary mapping names to DataFrames.
+
+     Parameters
+     ----------
+     dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+         - Single DataFrame: produce a single summary (returned within a one-element list).
+         - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+         - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+     n_sample : int, default 30
+         Number of rows to display in the "Data (first n_sample rows)" section.
+     skip_stats : bool, default False
+         If True, skip the descriptive statistics and DataFrame info sections.
+
+     Returns
+     -------
+     list of str
+         A list of summaries, one for each provided DataFrame. Each summary includes:
+         - Shape of the DataFrame (rows, columns)
+         - Column data types
+         - Missing value percentage
+         - Unique value counts
+         - First n_sample rows
+         - Descriptive statistics
+         - DataFrame info output
+
+     Example
+     -------
+     ``` python
+     import pandas as pd
+     from sklearn.datasets import load_iris
+     data = load_iris(as_frame=True)
+     dataframes = {
+         "iris": data.frame,
+         "iris_target": data.target,
+     }
+     summaries = get_dataframe_summary(dataframes)
+     print(summaries[0])
+     ```
+     """
+
+     summaries = []
+
+     # --- Dictionary Case ---
+     if isinstance(dataframes, dict):
+         for dataset_name, df in dataframes.items():
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     # --- Single DataFrame Case ---
+     elif isinstance(dataframes, pd.DataFrame):
+         summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+     # --- List of DataFrames Case ---
+     elif isinstance(dataframes, list):
+         for idx, df in enumerate(dataframes):
+             dataset_name = f"Dataset_{idx}"
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     else:
+         raise TypeError(
+             "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+         )
+
+     return summaries
+
+
+ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+     """Generate a summary string for a single DataFrame."""
+     # 1. Convert dictionary-type cells to strings.
+     #    This prevents unhashable-dict errors during df.nunique().
+     df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+     # 2. Capture df.info() output
+     buffer = io.StringIO()
+     df.info(buf=buffer)
+     info_text = buffer.getvalue()
+
+     # 3. Calculate missing value stats
+     missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+     missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+     # 4. Get column data types
+     column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+     # 5. Get unique value counts
+     unique_counts = df.nunique()  # Will no longer fail on unhashable dicts
+     unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+     # 6. Generate the summary text (flush-left so the output carries no indentation)
+     if not skip_stats:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+ Column Data Types:
+ {column_types}
+
+ Missing Value Percentage:
+ {missing_summary}
+
+ Unique Value Counts:
+ {unique_counts_summary}
+
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+
+ Data Description:
+ {df.describe().to_string()}
+
+ Data Info:
+ {info_text}
+ """
+     else:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+ Column Data Types:
+ {column_types}
+
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+ """
+
+     return summary_text.strip()
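
A short complementary example for get_dataframe_summary; this is a sketch, and the toy DataFrame is invented for illustration:

```python
# Minimal sketch: summarize a single ad-hoc DataFrame with stats skipped.
import pandas as pd
from ai_data_science_team.tools.dataframe import get_dataframe_summary

df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "y"]})

# A single DataFrame is named "Single_Dataset" and returned as a one-element list.
summary = get_dataframe_summary(df, n_sample=5, skip_stats=True)[0]
print(summary)
```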