ai-data-science-team 0.0.0.9009__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/data_cleaning_agent.py +6 -6
  3. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  4. ai_data_science_team/agents/data_visualization_agent.py +6 -7
  5. ai_data_science_team/agents/data_wrangling_agent.py +6 -6
  6. ai_data_science_team/agents/feature_engineering_agent.py +6 -6
  7. ai_data_science_team/agents/sql_database_agent.py +6 -6
  8. ai_data_science_team/ml_agents/__init__.py +1 -0
  9. ai_data_science_team/ml_agents/h2o_ml_agent.py +205 -385
  10. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  11. ai_data_science_team/multiagents/sql_data_analyst.py +3 -4
  12. ai_data_science_team/parsers/__init__.py +0 -0
  13. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  14. ai_data_science_team/templates/agent_templates.py +6 -6
  15. ai_data_science_team/tools/data_loader.py +378 -0
  16. ai_data_science_team/tools/dataframe.py +139 -0
  17. ai_data_science_team/tools/h2o.py +643 -0
  18. ai_data_science_team/tools/mlflow.py +961 -0
  19. ai_data_science_team/tools/{metadata.py → sql.py} +1 -137
  20. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +34 -16
  21. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  22. ai_data_science_team-0.0.0.9009.dist-info/RECORD +0 -28
  23. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  24. /ai_data_science_team/{tools → utils}/regex.py +0 -0
  25. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  26. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  27. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
ai_data_science_team/tools/data_loader.py (new file)
@@ -0,0 +1,378 @@
+
+ from langchain.tools import tool
+
+ import pandas as pd
+
+ from typing import Tuple, List, Dict
+
+
+ @tool(response_format='content_and_artifact')
+ def load_directory(dir_path: str) -> Tuple[str, Dict]:
+     """
+     Tool: load_directory
+     Description: Loads all recognized tabular files in a directory.
+
+     Parameters:
+     ----------
+     dir_path : str
+         The path to the directory to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of data frames.
+     """
+     print(" * Tool: load_directory")
+     import os
+     import pandas as pd
+     data_frames = {}
+     for filename in os.listdir(dir_path):
+         file_path = os.path.join(dir_path, filename)
+         # Skip directories
+         if os.path.isdir(file_path):
+             continue
+         try:
+             data_frames[filename] = auto_load_file(file_path).to_dict()
+         except Exception as e:
+             data_frames[filename] = f"Error loading file: {e}"
+     return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+ @tool(response_format='content_and_artifact')
+ def load_file(file_path: str) -> Tuple[str, Dict]:
+     """
+     Tool: load_file
+     Description: Automatically loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of the data frame.
+     """
+     print(" * Tool: load_file")
+     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+     """
+     Tool: list_directory_contents
+     Description: Lists all files and folders in the specified directory.
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+     Returns:
+         tuple:
+             - content (list[str]): A list of filenames/folders (suitable for display)
+             - artifact (list[dict]): A list of dictionaries where each dict has keys like {"filename": <name>}.
+               This structure can be easily converted to a pandas DataFrame.
+     """
+     print(" * Tool: list_directory_contents")
+     import os
+
+     items = []
+     for item in os.listdir(directory_path):
+         # If show_hidden is False, skip items starting with '.'
+         if not show_hidden and item.startswith('.'):
+             continue
+         items.append(item)
+
+     # content: just the raw list of filenames
+     content = items
+
+     # artifact: list of dicts (each row is {"filename": ...}), easily turned into a DataFrame
+     artifact = [{"filename": item} for item in items]
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: list_directory_recursive
+     Description:
+         Recursively lists all files and folders within the specified directory.
+         Returns a two-tuple:
+           (1) A human-readable tree representation of the directory (content).
+           (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: A multiline string showing the directory tree.
+             artifact: A list of dictionaries, each with information about a file or directory.
+
+     Example:
+         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+     """
+     print(" * Tool: list_directory_recursive")
+
+     # We'll store two things as we recurse:
+     # 1) lines for building the "tree" string
+     # 2) records in a list of dicts for easy DataFrame creation
+     import os
+
+     lines = []
+     records = []
+
+     def recurse(path: str, indent_level: int = 0):
+         # List items in the current directory
+         try:
+             items = os.listdir(path)
+         except PermissionError:
+             # If we don't have permission to read the directory, just note it.
+             lines.append("    " * indent_level + "[Permission Denied]")
+             return
+
+         # Sort items for a consistent order (optional)
+         items.sort()
+
+         for item in items:
+             if not show_hidden and item.startswith('.'):
+                 continue
+
+             full_path = os.path.join(path, item)
+             # Build an indented prefix for the tree
+             prefix = "    " * indent_level
+
+             if os.path.isdir(full_path):
+                 # Directory
+                 lines.append(f"{prefix}{item}/")
+                 records.append({
+                     "type": "directory",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+                 # Recursively descend
+                 recurse(full_path, indent_level + 1)
+             else:
+                 # File
+                 lines.append(f"{prefix}- {item}")
+                 records.append({
+                     "type": "file",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+
+     # Kick off recursion
+     if os.path.isdir(directory_path):
+         # Add the top-level directory to lines/records if you like
+         dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+         lines.append(f"{dir_name}/")  # Show the root as well
+         records.append({
+             "type": "directory",
+             "name": dir_name,
+             "parent_path": os.path.dirname(directory_path),
+             "absolute_path": os.path.abspath(directory_path)
+         })
+         recurse(directory_path, indent_level=1)
+     else:
+         # If the given path is not a directory, just return a note
+         lines.append(f"{directory_path} is not a directory.")
+         records.append({
+             "type": "error",
+             "name": directory_path,
+             "parent_path": None,
+             "absolute_path": os.path.abspath(directory_path)
+         })
+
+     # content: multiline string with the entire tree
+     content = "\n".join(lines)
+     # artifact: list of dicts, easily converted into a DataFrame
+     artifact = records
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+     """
+     Tool: get_file_info
+     Description: Retrieves metadata (size, modification time, etc.) about a file.
+         Returns a tuple (content, artifact):
+           - content (str): A textual summary of the file info.
+           - artifact (List[Dict]): A list with a single dictionary of file metadata.
+             Useful for direct conversion into a DataFrame.
+     Args:
+         file_path (str): The path of the file to inspect.
+     Returns:
+         Tuple[str, List[dict]]:
+             content: Summary text
+             artifact: A list[dict] of file metadata
+     Example:
+         content, artifact = get_file_info("/path/to/mydata.csv")
+     """
+     print(" * Tool: get_file_info")
+
+     # Ensure the file exists
+     import os
+     import time
+
+     if not os.path.isfile(file_path):
+         raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+     file_stats = os.stat(file_path)
+
+     # Construct the data dictionary
+     file_data = {
+         "file_name": os.path.basename(file_path),
+         "size_bytes": file_stats.st_size,
+         "modification_time": time.ctime(file_stats.st_mtime),
+         "absolute_path": os.path.abspath(file_path),
+     }
+
+     # Create a user-friendly summary (content)
+     content_str = (
+         f"File Name: {file_data['file_name']}\n"
+         f"Size (bytes): {file_data['size_bytes']}\n"
+         f"Last Modified: {file_data['modification_time']}\n"
+         f"Absolute Path: {file_data['absolute_path']}"
+     )
+
+     # Artifact should be a list of dict(s) to easily convert to DataFrame
+     artifact = [file_data]
+
+     return content_str, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: search_files_by_pattern
+     Description:
+         Searches for files (optionally in subdirectories) that match a given
+         wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+           (1) content (str): A multiline summary of the matched files.
+           (2) artifact (List[Dict]): A list of dicts with file path info.
+
+     Args:
+         directory_path (str): Directory path to start searching from.
+         pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+         recursive (bool): Whether to search in subdirectories. Default is False.
+
+     Returns:
+         Tuple[str, List[Dict]]:
+             content: A user-friendly string showing matched file paths.
+             artifact: A list of dictionaries, each representing a matched file.
+
+     Example:
+         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+     """
+     print(" * Tool: search_files_by_pattern")
+
+     import os
+     import fnmatch
+
+     matched_files = []
+     if recursive:
+         for root, dirs, files in os.walk(directory_path):
+             for filename in files:
+                 if fnmatch.fnmatch(filename, pattern):
+                     matched_files.append(os.path.join(root, filename))
+     else:
+         # Non-recursive
+         for filename in os.listdir(directory_path):
+             full_path = os.path.join(directory_path, filename)
+             if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                 matched_files.append(full_path)
+
+     # Create a human-readable summary (content)
+     if matched_files:
+         lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+         for f in matched_files:
+             lines.append(f"  - {f}")
+         content = "\n".join(lines)
+     else:
+         content = f"No files found matching '{pattern}'."
+
+     # Create artifact as a list of dicts for DataFrame conversion
+     artifact = [{"file_path": path} for path in matched_files]
+
+     return content, artifact
+
+
+ # Loaders
+
+ def auto_load_file(file_path: str) -> pd.DataFrame:
+     """
+     Auto loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     pd.DataFrame
+     """
+     import pandas as pd
+     ext = file_path.split(".")[-1].lower()
+     if ext == "csv":
+         return load_csv(file_path)
+     elif ext in ["xlsx", "xls"]:
+         return load_excel(file_path)
+     elif ext == "json":
+         return load_json(file_path)
+     elif ext == "parquet":
+         return load_parquet(file_path)
+     elif ext == "pkl":
+         return load_pickle(file_path)
+     # Raise instead of returning an error string: callers immediately call
+     # .to_dict() on the result, so a string return would surface as a
+     # confusing AttributeError rather than the real problem.
+     raise ValueError(f"Unsupported file extension: {ext}")
+
+ def load_csv(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_csv
+     Description: Loads a CSV file into a pandas DataFrame.
+     Args:
+         file_path (str): Path to the CSV file.
+     Returns:
+         pd.DataFrame
+     """
+     import pandas as pd
+     return pd.read_csv(file_path)
+
+ def load_excel(file_path: str, sheet_name=0) -> pd.DataFrame:
+     """
+     Tool: load_excel
+     Description: Loads an Excel file into a pandas DataFrame.
+     """
+     import pandas as pd
+     # sheet_name=0 loads the first sheet; sheet_name=None would make
+     # pd.read_excel return a dict of DataFrames (one per sheet), not a DataFrame.
+     return pd.read_excel(file_path, sheet_name=sheet_name)
+
+ def load_json(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_json
+     Description: Loads a JSON file (array of records) into a pandas DataFrame.
+     """
+     import pandas as pd
+     # For simple JSON arrays; pass lines=True instead for NDJSON input.
+     return pd.read_json(file_path, orient="records", lines=False)
+
+ def load_parquet(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_parquet
+     Description: Loads a Parquet file into a pandas DataFrame.
+     """
+     import pandas as pd
+     return pd.read_parquet(file_path)
+
+ def load_pickle(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_pickle
+     Description: Loads a Pickle file into a pandas DataFrame.
+     """
+     import pandas as pd
+     return pd.read_pickle(file_path)
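
The loaders above are exposed two ways: as plain functions returning DataFrames, and as LangChain tools using the `content_and_artifact` response format, which splits each result into a human-readable message and a serialized DataFrame. Below is a minimal invocation sketch, not part of the diff itself; the file paths are hypothetical, and it assumes the 0.0.0.9010 wheel plus a recent `langchain-core` (where invoking a tool with a ToolCall-shaped dict returns a `ToolMessage` whose `.artifact` carries the second tuple element):

```python
import pandas as pd

from ai_data_science_team.tools.data_loader import load_file, search_files_by_pattern

# Plain-args invocation returns only the content string.
print(load_file.invoke({"file_path": "data/sales.csv"}))  # hypothetical path

# A ToolCall-shaped input returns a ToolMessage carrying the artifact,
# i.e. the DataFrame serialized via .to_dict().
msg = load_file.invoke({
    "name": "load_file",
    "args": {"file_path": "data/sales.csv"},  # hypothetical path
    "id": "call_1",
    "type": "tool_call",
})
df = pd.DataFrame(msg.artifact)  # rebuild the DataFrame from the dict payload
print(df.shape)

# Directory search works the same way; plain args give just the summary text.
print(search_files_by_pattern.invoke(
    {"directory_path": "data", "pattern": "*.csv", "recursive": True}
))
```

Invoking with plain arguments yields only the content half of the tuple, which is why recovering the artifact goes through a ToolCall-shaped input.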
ai_data_science_team/tools/dataframe.py (new file)
@@ -0,0 +1,139 @@
+ import io
+ import pandas as pd
+ from typing import Union, List, Dict
+
+ def get_dataframe_summary(
+     dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+     n_sample: int = 30,
+     skip_stats: bool = False,
+ ) -> List[str]:
+     """
+     Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
+     or a dictionary mapping names to DataFrames.
+
+     Parameters
+     ----------
+     dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+         - Single DataFrame: produce a single summary (returned within a one-element list).
+         - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+         - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+     n_sample : int, default 30
+         Number of rows to display in the "Data (first n_sample rows)" section.
+     skip_stats : bool, default False
+         If True, skip the descriptive statistics and DataFrame info sections.
+
+     Example:
+     --------
+     ``` python
+     import pandas as pd
+     from sklearn.datasets import load_iris
+     data = load_iris(as_frame=True)
+     dataframes = {
+         "iris": data.frame,
+         "iris_target": data.target,
+     }
+     summaries = get_dataframe_summary(dataframes)
+     print(summaries[0])
+     ```
+
+     Returns
+     -------
+     list of str
+         A list of summaries, one for each provided DataFrame. Each summary includes:
+         - Shape of the DataFrame (rows, columns)
+         - Column data types
+         - Missing value percentage
+         - Unique value counts
+         - First n_sample rows
+         - Descriptive statistics
+         - DataFrame info output
+     """
+
+     summaries = []
+
+     # --- Dictionary Case ---
+     if isinstance(dataframes, dict):
+         for dataset_name, df in dataframes.items():
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     # --- Single DataFrame Case ---
+     elif isinstance(dataframes, pd.DataFrame):
+         summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+     # --- List of DataFrames Case ---
+     elif isinstance(dataframes, list):
+         for idx, df in enumerate(dataframes):
+             dataset_name = f"Dataset_{idx}"
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     else:
+         raise TypeError(
+             "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+         )
+
+     return summaries
+
+
+ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+     """Generate a summary string for a single DataFrame."""
+     # 1. Convert dictionary-type cells to strings
+     #    This prevents unhashable dict errors during df.nunique().
+     df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+     # 2. Capture df.info() output
+     buffer = io.StringIO()
+     df.info(buf=buffer)
+     info_text = buffer.getvalue()
+
+     # 3. Calculate missing value stats
+     missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+     missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+     # 4. Get column data types
+     column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+     # 5. Get unique value counts
+     unique_counts = df.nunique()  # Will no longer fail on unhashable dict
+     unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+     # 6. Generate the summary text
+     if not skip_stats:
+         summary_text = f"""
+         Dataset Name: {dataset_name}
+         ----------------------------
+         Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+         Column Data Types:
+         {column_types}
+
+         Missing Value Percentage:
+         {missing_summary}
+
+         Unique Value Counts:
+         {unique_counts_summary}
+
+         Data (first {n_sample} rows):
+         {df.head(n_sample).to_string()}
+
+         Data Description:
+         {df.describe().to_string()}
+
+         Data Info:
+         {info_text}
+         """
+     else:
+         summary_text = f"""
+         Dataset Name: {dataset_name}
+         ----------------------------
+         Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+         Column Data Types:
+         {column_types}
+
+         Data (first {n_sample} rows):
+         {df.head(n_sample).to_string()}
+         """
+
+     return summary_text.strip()
+
+
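
`get_dataframe_summary` normalizes all three accepted input shapes into a list of summary strings (one per DataFrame), presumably for splicing into agent prompts. A short usage sketch with synthetic data, assuming the 0.0.0.9010 wheel is installed:

```python
import pandas as pd

from ai_data_science_team.tools.dataframe import get_dataframe_summary

df = pd.DataFrame({"id": [1, 2, 3], "amount": [10.5, None, 7.25]})

# Single DataFrame -> one-element list, labeled "Single_Dataset".
print(get_dataframe_summary(df, n_sample=3)[0])

# List inputs are labeled Dataset_0, Dataset_1, ...
summaries = get_dataframe_summary([df, df.copy()])

# Dict inputs use the keys as dataset names.
summaries = get_dataframe_summary({"orders": df}, skip_stats=True)
```

With `skip_stats=True` the summary omits the `describe()` and `info()` sections, which keeps the output compact for wide DataFrames.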