ai-data-science-team 0.0.0.9009__py3-none-any.whl → 0.0.0.9011__py3-none-any.whl

Files changed (29)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +1 -0
  3. ai_data_science_team/agents/data_cleaning_agent.py +6 -6
  4. ai_data_science_team/agents/data_loader_tools_agent.py +272 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +6 -7
  6. ai_data_science_team/agents/data_wrangling_agent.py +6 -6
  7. ai_data_science_team/agents/feature_engineering_agent.py +6 -6
  8. ai_data_science_team/agents/sql_database_agent.py +6 -6
  9. ai_data_science_team/ml_agents/__init__.py +1 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +206 -385
  11. ai_data_science_team/ml_agents/h2o_ml_tools_agent.py +0 -0
  12. ai_data_science_team/ml_agents/mlflow_tools_agent.py +350 -0
  13. ai_data_science_team/multiagents/sql_data_analyst.py +3 -4
  14. ai_data_science_team/parsers/__init__.py +0 -0
  15. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  16. ai_data_science_team/templates/agent_templates.py +6 -6
  17. ai_data_science_team/tools/data_loader.py +448 -0
  18. ai_data_science_team/tools/dataframe.py +139 -0
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/{metadata.py → sql.py} +1 -137
  22. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/METADATA +40 -19
  23. ai_data_science_team-0.0.0.9011.dist-info/RECORD +36 -0
  24. ai_data_science_team-0.0.0.9009.dist-info/RECORD +0 -28
  25. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  26. /ai_data_science_team/{tools → utils}/regex.py +0 -0
  27. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9011.dist-info}/top_level.txt +0 -0
ai_data_science_team/tools/data_loader.py
@@ -0,0 +1,448 @@
+ from langchain.tools import tool
+ from langgraph.prebuilt import InjectedState
+ 
+ import pandas as pd
+ import os
+ 
+ from typing import Tuple, List, Dict, Optional, Union, Annotated
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def load_directory(
+     directory_path: str = os.getcwd(),
+     file_type: Optional[str] = None
+ ) -> Tuple[str, Dict]:
+     """
+     Tool: load_directory
+     Description: Loads all recognized tabular files in a directory.
+                  If file_type is specified (e.g., 'csv'), only files
+                  with that extension are loaded.
+ 
+     Parameters:
+     ----------
+     directory_path : str
+         The path to the directory to load. Defaults to the current working directory.
+ 
+     file_type : str, optional
+         The extension of the file type you want to load exclusively
+         (e.g., 'csv', 'xlsx', 'parquet'). If None or not provided,
+         attempts to load all recognized tabular files.
+ 
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of data frames.
+     """
+     print(f" * Tool: load_directory | {directory_path}")
+ 
+     if directory_path is None:
+         return "No directory path provided.", {}
+ 
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", {}
+ 
+     data_frames = {}
+ 
+     for filename in os.listdir(directory_path):
+         file_path = os.path.join(directory_path, filename)
+ 
+         # Skip directories
+         if os.path.isdir(file_path):
+             continue
+ 
+         # If file_type is specified, only process files that match
+         # (the extension check is case-insensitive).
+         if file_type and not filename.lower().endswith(f".{file_type.lower()}"):
+             continue
+ 
+         try:
+             # Attempt to auto-detect and load the file
+             data_frames[filename] = auto_load_file(file_path).to_dict()
+         except Exception as e:
+             # If loading fails, record the error message
+             data_frames[filename] = f"Error loading file: {e}"
+ 
+     return (
+         f"Returned the following data frames: {list(data_frames.keys())}",
+         data_frames
+     )
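A minimal usage sketch (not part of the diff): `@tool(response_format='content_and_artifact')` makes the decorated function a LangChain `StructuredTool` whose return tuple is split into a display string and a machine-readable artifact. Assuming langchain-core's `StructuredTool` API, the undecorated callable is reachable via `.func`, and each artifact value is a `DataFrame.to_dict()` payload that round-trips through `pd.DataFrame`. The folder and file names below are hypothetical.

```python
import pandas as pd

# Call the raw function behind the tool (.func skips the tool wrapper).
content, artifact = load_directory.func(directory_path="data/", file_type="csv")
print(content)  # "Returned the following data frames: ['sales.csv', ...]"

# Each artifact entry is a DataFrame serialized with .to_dict(),
# so it can be rebuilt directly:
sales_df = pd.DataFrame(artifact["sales.csv"])  # hypothetical file name
```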
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def load_file(file_path: str) -> Tuple[str, Dict]:
+     """
+     Automatically loads a file based on its extension.
+ 
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+ 
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of the data frame.
+     """
+     print(f" * Tool: load_file | {file_path}")
+     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def list_directory_contents(
+     directory_path: str = os.getcwd(),
+     show_hidden: bool = False
+ ) -> Tuple[Union[str, List[str]], List[Dict]]:
+     """
+     Tool: list_directory_contents
+     Description: Lists all files and folders in the specified directory.
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+     Returns:
+         tuple:
+             - content (list[str]): A list of filenames/folders (suitable for display),
+               or an error string if the directory is invalid.
+             - artifact (list[dict]): A list of dictionaries where each dict includes
+               the keys {"filename": <name>, "type": <'file' or 'directory'>}.
+               This structure can be easily converted to a pandas DataFrame.
+     """
+     print(f" * Tool: list_directory_contents | {directory_path}")
+ 
+     if directory_path is None:
+         return "No directory path provided.", []
+ 
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", []
+ 
+     items = []
+     for item in os.listdir(directory_path):
+         # If show_hidden is False, skip items starting with '.'
+         if not show_hidden and item.startswith('.'):
+             continue
+         items.append(item)
+     items.reverse()
+ 
+     # content: just the raw list of item names (files/folders).
+     content = items.copy()
+ 
+     content.append(f"Total items: {len(items)}")
+     content.append(f"Directory: {directory_path}")
+ 
+     # artifact: list of dicts with both "filename" and "type" keys.
+     artifact = []
+     for item in items:
+         item_path = os.path.join(directory_path, item)
+         artifact.append({
+             "filename": item,
+             "type": "directory" if os.path.isdir(item_path) else "file"
+         })
+ 
+     return content, artifact
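Because the artifact is a list of flat dicts, it converts straight into a DataFrame. A small sketch, again via the raw function (the directory path is illustrative):

```python
import pandas as pd

content, artifact = list_directory_contents.func("data/", show_hidden=False)
listing = pd.DataFrame(artifact)              # columns: filename, type
print(listing[listing["type"] == "file"])     # files only
```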
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def list_directory_recursive(
+     directory_path: str = os.getcwd(),
+     show_hidden: bool = False
+ ) -> Tuple[str, List[Dict]]:
+     """
+     Tool: list_directory_recursive
+     Description:
+         Recursively lists all files and folders within the specified directory.
+         Returns a two-tuple:
+         (1) A human-readable tree representation of the directory (content).
+         (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+ 
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+ 
+     Returns:
+         Tuple[str, List[dict]]:
+             content: A multiline string showing the directory tree.
+             artifact: A list of dictionaries, each with information about a file or directory.
+ 
+     Example:
+         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+     """
+     print(f" * Tool: list_directory_recursive | {directory_path}")
+ 
+     # We'll store two things as we recurse:
+     # 1) lines for building the "tree" string
+     # 2) records in a list of dicts for easy DataFrame creation
+ 
+     if directory_path is None:
+         return "No directory path provided.", []
+ 
+     if not os.path.isdir(directory_path):
+         return f"Directory not found: {directory_path}", []
+ 
+     lines = []
+     records = []
+ 
+     def recurse(path: str, indent_level: int = 0):
+         # List items in the current directory
+         try:
+             items = os.listdir(path)
+         except PermissionError:
+             # If we don't have permission to read the directory, just note it.
+             lines.append(" " * indent_level + "[Permission Denied]")
+             return
+ 
+         # Sort items for a consistent order
+         items.sort()
+ 
+         for item in items:
+             if not show_hidden and item.startswith('.'):
+                 continue
+ 
+             full_path = os.path.join(path, item)
+             # Build an indented prefix for the tree
+             prefix = " " * indent_level
+ 
+             if os.path.isdir(full_path):
+                 # Directory
+                 lines.append(f"{prefix}{item}/")
+                 records.append({
+                     "type": "directory",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+                 # Recursively descend
+                 recurse(full_path, indent_level + 1)
+             else:
+                 # File
+                 lines.append(f"{prefix}- {item}")
+                 records.append({
+                     "type": "file",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+ 
+     # Kick off recursion from the root directory, which was already
+     # validated by the isdir check above.
+     dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+     lines.append(f"{dir_name}/")  # Show the root as well
+     records.append({
+         "type": "directory",
+         "name": dir_name,
+         "parent_path": os.path.dirname(directory_path),
+         "absolute_path": os.path.abspath(directory_path)
+     })
+     recurse(directory_path, indent_level=1)
+ 
+     # content: multiline string with the entire tree
+     content = "\n".join(lines)
+     # artifact: list of dicts, easily converted into a DataFrame
+     artifact = records
+ 
+     return content, artifact
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+     """
+     Tool: get_file_info
+     Description: Retrieves metadata (size, modification time, etc.) about a file.
+         Returns a tuple (content, artifact):
+         - content (str): A textual summary of the file info.
+         - artifact (List[Dict]): A list with a single dictionary of file metadata.
+           Useful for direct conversion into a DataFrame.
+     Args:
+         file_path (str): The path of the file to inspect.
+     Returns:
+         Tuple[str, List[dict]]:
+             content: Summary text
+             artifact: A list[dict] of file metadata
+     Example:
+         content, artifact = get_file_info("/path/to/mydata.csv")
+     """
+     print(f" * Tool: get_file_info | {file_path}")
+ 
+     import time
+ 
+     # Ensure the file exists
+     if not os.path.isfile(file_path):
+         raise FileNotFoundError(f"{file_path} is not a valid file.")
+ 
+     file_stats = os.stat(file_path)
+ 
+     # Construct the data dictionary
+     file_data = {
+         "file_name": os.path.basename(file_path),
+         "size_bytes": file_stats.st_size,
+         "modification_time": time.ctime(file_stats.st_mtime),
+         "absolute_path": os.path.abspath(file_path),
+     }
+ 
+     # Create a user-friendly summary (content)
+     content_str = (
+         f"File Name: {file_data['file_name']}\n"
+         f"Size (bytes): {file_data['size_bytes']}\n"
+         f"Last Modified: {file_data['modification_time']}\n"
+         f"Absolute Path: {file_data['absolute_path']}"
+     )
+ 
+     # Artifact is a list of dict(s) to easily convert to a DataFrame
+     artifact = [file_data]
+ 
+     return content_str, artifact
+ 
+ 
+ @tool(response_format='content_and_artifact')
+ def search_files_by_pattern(
+     directory_path: str = os.getcwd(),
+     pattern: str = "*.csv",
+     recursive: bool = False
+ ) -> Tuple[str, List[Dict]]:
+     """
+     Tool: search_files_by_pattern
+     Description:
+         Searches for files (optionally in subdirectories) that match a given
+         wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+         (1) content (str): A multiline summary of the matched files.
+         (2) artifact (List[Dict]): A list of dicts with file path info.
+ 
+     Args:
+         directory_path (str): Directory path to start searching from.
+         pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+         recursive (bool): Whether to search in subdirectories. Default is False.
+ 
+     Returns:
+         Tuple[str, List[Dict]]:
+             content: A user-friendly string showing matched file paths.
+             artifact: A list of dictionaries, each representing a matched file.
+ 
+     Example:
+         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+     """
+     print(f" * Tool: search_files_by_pattern | {directory_path}")
+ 
+     import fnmatch
+ 
+     matched_files = []
+     if recursive:
+         for root, dirs, files in os.walk(directory_path):
+             for filename in files:
+                 if fnmatch.fnmatch(filename, pattern):
+                     matched_files.append(os.path.join(root, filename))
+     else:
+         # Non-recursive
+         for filename in os.listdir(directory_path):
+             full_path = os.path.join(directory_path, filename)
+             if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                 matched_files.append(full_path)
+ 
+     # Create a human-readable summary (content)
+     if matched_files:
+         lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+         for f in matched_files:
+             lines.append(f" - {f}")
+         content = "\n".join(lines)
+     else:
+         content = f"No files found matching '{pattern}'."
+ 
+     # Create artifact as a list of dicts for DataFrame conversion
+     artifact = [{"file_path": path} for path in matched_files]
+ 
+     return content, artifact
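These tools compose naturally: find matching files, then feed each path to `load_file`. A sketch with hypothetical paths, again calling the raw functions via `.func` (an assumption about the `StructuredTool` wrapper, as above):

```python
# Find every Parquet file under data/, then load each one.
content, artifact = search_files_by_pattern.func("data/", pattern="*.parquet", recursive=True)
for record in artifact:
    msg, frame_dict = load_file.func(record["file_path"])
    print(msg)
```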
+ 
+ 
+ # Loaders
+ 
+ def auto_load_file(file_path: str) -> pd.DataFrame:
+     """
+     Auto loads a file based on its extension.
+ 
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+ 
+     Returns:
+     -------
+     pd.DataFrame
+     """
+     ext = file_path.split(".")[-1].lower()
+     if ext == "csv":
+         return load_csv(file_path)
+     elif ext in ["xlsx", "xls"]:
+         return load_excel(file_path)
+     elif ext == "json":
+         return load_json(file_path)
+     elif ext == "parquet":
+         return load_parquet(file_path)
+     elif ext == "pkl":
+         return load_pickle(file_path)
+     # Raise instead of returning an error string, so the annotated return
+     # type holds and callers like load_file don't call .to_dict() on a str;
+     # load_directory already catches exceptions per file.
+     raise ValueError(f"Unsupported file extension: {ext}")
+ 
+ 
+ def load_csv(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_csv
+     Description: Loads a CSV file into a pandas DataFrame.
+     Args:
+         file_path (str): Path to the CSV file.
+     Returns:
+         pd.DataFrame
+     """
+     return pd.read_csv(file_path)
+ 
+ 
+ def load_excel(file_path: str, sheet_name=0) -> pd.DataFrame:
+     """
+     Tool: load_excel
+     Description: Loads an Excel file into a pandas DataFrame.
+     Note: sheet_name defaults to 0 (the first sheet); sheet_name=None would
+     make pd.read_excel return a dict of DataFrames, not a single DataFrame.
+     """
+     return pd.read_excel(file_path, sheet_name=sheet_name)
+ 
+ 
+ def load_json(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_json
+     Description: Loads a JSON file (array of records) into a pandas DataFrame.
+     For newline-delimited JSON (NDJSON), pass lines=True to pd.read_json instead.
+     """
+     # For simple JSON arrays
+     return pd.read_json(file_path, orient="records", lines=False)
+ 
+ 
+ def load_parquet(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_parquet
+     Description: Loads a Parquet file into a pandas DataFrame.
+     """
+     return pd.read_parquet(file_path)
+ 
+ 
+ def load_pickle(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_pickle
+     Description: Loads a Pickle file into a pandas DataFrame.
+     """
+     return pd.read_pickle(file_path)
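A quick self-contained check of the extension dispatch (the temp file is illustrative; only pandas and the standard library are needed):

```python
import os
import tempfile

import pandas as pd

# Write a tiny CSV to a temporary directory, then load it back
# through the extension-dispatch helper.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "demo.csv")
    pd.DataFrame({"a": [1, 2], "b": [3, 4]}).to_csv(path, index=False)
    df = auto_load_file(path)          # dispatches on ".csv"
    assert list(df.columns) == ["a", "b"]
```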
ai_data_science_team/tools/dataframe.py
@@ -0,0 +1,139 @@
+ import io
+ import pandas as pd
+ from typing import Union, List, Dict
+ 
+ 
+ def get_dataframe_summary(
+     dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+     n_sample: int = 30,
+     skip_stats: bool = False,
+ ) -> List[str]:
+     """
+     Generate a summary for one or more DataFrames. Accepts a single DataFrame,
+     a list of DataFrames, or a dictionary mapping names to DataFrames.
+ 
+     Parameters
+     ----------
+     dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+         - Single DataFrame: produce a single summary (returned within a one-element list).
+         - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+         - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+     n_sample : int, default 30
+         Number of rows to display in the "Data (first n_sample rows)" section.
+     skip_stats : bool, default False
+         If True, skip the descriptive statistics and DataFrame info sections.
+ 
+     Example:
+     --------
+     ``` python
+     import pandas as pd
+     from sklearn.datasets import load_iris
+     data = load_iris(as_frame=True)
+     dataframes = {
+         "iris": data.frame,
+         "iris_target": data.target,
+     }
+     summaries = get_dataframe_summary(dataframes)
+     print(summaries[0])
+     ```
+ 
+     Returns
+     -------
+     list of str
+         A list of summaries, one for each provided DataFrame. Each summary includes:
+         - Shape of the DataFrame (rows, columns)
+         - Column data types
+         - Missing value percentage
+         - Unique value counts
+         - First n_sample rows
+         - Descriptive statistics
+         - DataFrame info output
+     """
+ 
+     summaries = []
+ 
+     # --- Dictionary Case ---
+     if isinstance(dataframes, dict):
+         for dataset_name, df in dataframes.items():
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+ 
+     # --- Single DataFrame Case ---
+     elif isinstance(dataframes, pd.DataFrame):
+         summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+ 
+     # --- List of DataFrames Case ---
+     elif isinstance(dataframes, list):
+         for idx, df in enumerate(dataframes):
+             dataset_name = f"Dataset_{idx}"
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+ 
+     else:
+         raise TypeError(
+             "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+         )
+ 
+     return summaries
+ 
+ 
+ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+     """Generate a summary string for a single DataFrame."""
+     # 1. Convert dictionary-type cells to strings.
+     #    This prevents unhashable-dict errors during df.nunique().
+     df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+ 
+     # 2. Capture df.info() output
+     buffer = io.StringIO()
+     df.info(buf=buffer)
+     info_text = buffer.getvalue()
+ 
+     # 3. Calculate missing value stats
+     missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+     missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+ 
+     # 4. Get column data types
+     column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+ 
+     # 5. Get unique value counts
+     unique_counts = df.nunique()  # Will no longer fail on unhashable dicts
+     unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+ 
+     # 6. Generate the summary text (kept flush-left so .strip() returns
+     #    clean, unindented output)
+     if not skip_stats:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+ 
+ Column Data Types:
+ {column_types}
+ 
+ Missing Value Percentage:
+ {missing_summary}
+ 
+ Unique Value Counts:
+ {unique_counts_summary}
+ 
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+ 
+ Data Description:
+ {df.describe().to_string()}
+ 
+ Data Info:
+ {info_text}
+ """
+     else:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+ 
+ Column Data Types:
+ {column_types}
+ 
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+ """
+ 
+     return summary_text.strip()
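A small sketch of the list input path with `skip_stats=True` (the data is made up): each input DataFrame yields one summary string, and index-based names like `Dataset_0` are assigned automatically.

```python
import pandas as pd

df = pd.DataFrame({"x": [1, 2, None], "y": ["a", "b", "b"]})

# One summary per DataFrame; skip_stats drops the describe()/info() sections.
summaries = get_dataframe_summary([df], n_sample=3, skip_stats=True)
print(summaries[0])  # "Dataset Name: Dataset_0 ..." with shape, dtypes, head
```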