ai_data_science_team-0.0.0.9009-py3-none-any.whl → ai_data_science_team-0.0.0.9010-py3-none-any.whl

Files changed (27)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/data_cleaning_agent.py +6 -6
  3. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  4. ai_data_science_team/agents/data_visualization_agent.py +6 -7
  5. ai_data_science_team/agents/data_wrangling_agent.py +6 -6
  6. ai_data_science_team/agents/feature_engineering_agent.py +6 -6
  7. ai_data_science_team/agents/sql_database_agent.py +6 -6
  8. ai_data_science_team/ml_agents/__init__.py +1 -0
  9. ai_data_science_team/ml_agents/h2o_ml_agent.py +205 -385
  10. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  11. ai_data_science_team/multiagents/sql_data_analyst.py +3 -4
  12. ai_data_science_team/parsers/__init__.py +0 -0
  13. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  14. ai_data_science_team/templates/agent_templates.py +6 -6
  15. ai_data_science_team/tools/data_loader.py +378 -0
  16. ai_data_science_team/tools/dataframe.py +139 -0
  17. ai_data_science_team/tools/h2o.py +643 -0
  18. ai_data_science_team/tools/mlflow.py +961 -0
  19. ai_data_science_team/tools/{metadata.py → sql.py} +1 -137
  20. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +34 -16
  21. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  22. ai_data_science_team-0.0.0.9009.dist-info/RECORD +0 -28
  23. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  24. /ai_data_science_team/{tools → utils}/regex.py +0 -0
  25. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  26. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  27. {ai_data_science_team-0.0.0.9009.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
ai_data_science_team/tools/data_loader.py (new file)
@@ -0,0 +1,378 @@
+ from langchain.tools import tool
+ import pandas as pd
+ from typing import Tuple, List, Dict
+
+
+ @tool(response_format='content_and_artifact')
+ def load_directory(dir_path: str) -> Tuple[str, Dict]:
+     """
+     Tool: load_directory
+     Description: Loads all recognized tabular files in a directory.
+
+     Parameters:
+     ----------
+     dir_path : str
+         The path to the directory to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of data frames.
+     """
+     print(" * Tool: load_directory")
+     import os
+     data_frames = {}
+     for filename in os.listdir(dir_path):
+         file_path = os.path.join(dir_path, filename)
+         # Skip directories
+         if os.path.isdir(file_path):
+             continue
+         try:
+             # Store each table as a plain dict so it serializes as a tool artifact
+             data_frames[filename] = auto_load_file(file_path).to_dict()
+         except Exception as e:
+             data_frames[filename] = f"Error loading file: {e}"
+     return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+ @tool(response_format='content_and_artifact')
+ def load_file(file_path: str) -> Tuple[str, Dict]:
+     """
+     Automatically loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of the data frame.
+     """
+     print(" * Tool: load_file")
+     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+     """
+     Tool: list_directory_contents
+     Description: Lists all files and folders in the specified directory.
+
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         tuple:
+             - content (list[str]): A list of filenames/folders (suitable for display).
+             - artifact (list[dict]): A list of dictionaries where each dict has keys like {"filename": <name>}.
+               This structure can be easily converted to a pandas DataFrame.
+     """
+     print(" * Tool: list_directory_contents")
+     import os
+
+     items = []
+     for item in os.listdir(directory_path):
+         # If show_hidden is False, skip items starting with '.'
+         if not show_hidden and item.startswith('.'):
+             continue
+         items.append(item)
+
+     # content: just the raw list of filenames
+     content = items
+
+     # artifact: list of dicts (each row is {"filename": ...}), easily turned into a DataFrame
+     artifact = [{"filename": item} for item in items]
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: list_directory_recursive
+     Description:
+         Recursively lists all files and folders within the specified directory.
+         Returns a two-tuple:
+         (1) A human-readable tree representation of the directory (content).
+         (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: A multiline string showing the directory tree.
+             artifact: A list of dictionaries, each with information about a file or directory.
+
+     Example:
+         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+     """
+     print(" * Tool: list_directory_recursive")
+
+     # We'll store two things as we recurse:
+     # 1) lines for building the "tree" string
+     # 2) records in a list of dicts for easy DataFrame creation
+     import os
+
+     lines = []
+     records = []
+
+     def recurse(path: str, indent_level: int = 0):
+         # List items in the current directory
+         try:
+             items = os.listdir(path)
+         except PermissionError:
+             # If we don't have permission to read the directory, just note it.
+             lines.append(" " * indent_level + "[Permission Denied]")
+             return
+
+         # Sort items for a consistent order (optional)
+         items.sort()
+
+         for item in items:
+             if not show_hidden and item.startswith('.'):
+                 continue
+
+             full_path = os.path.join(path, item)
+             # Build an indented prefix for the tree
+             prefix = " " * indent_level
+
+             if os.path.isdir(full_path):
+                 # Directory
+                 lines.append(f"{prefix}{item}/")
+                 records.append({
+                     "type": "directory",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path,
+                 })
+                 # Recursively descend
+                 recurse(full_path, indent_level + 1)
+             else:
+                 # File
+                 lines.append(f"{prefix}- {item}")
+                 records.append({
+                     "type": "file",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path,
+                 })
+
+     # Kick off recursion
+     if os.path.isdir(directory_path):
+         # Add the top-level directory to lines/records as the tree root
+         dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+         lines.append(f"{dir_name}/")  # Show the root as well
+         records.append({
+             "type": "directory",
+             "name": dir_name,
+             "parent_path": os.path.dirname(directory_path),
+             "absolute_path": os.path.abspath(directory_path),
+         })
+         recurse(directory_path, indent_level=1)
+     else:
+         # If the given path is not a directory, just return a note
+         lines.append(f"{directory_path} is not a directory.")
+         records.append({
+             "type": "error",
+             "name": directory_path,
+             "parent_path": None,
+             "absolute_path": os.path.abspath(directory_path),
+         })
+
+     # content: multiline string with the entire tree
+     content = "\n".join(lines)
+     # artifact: list of dicts, easily converted into a DataFrame
+     artifact = records
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+     """
+     Tool: get_file_info
+     Description: Retrieves metadata (size, modification time, etc.) about a file.
+         Returns a tuple (content, artifact):
+         - content (str): A textual summary of the file info.
+         - artifact (List[Dict]): A list with a single dictionary of file metadata,
+           useful for direct conversion into a DataFrame.
+
+     Args:
+         file_path (str): The path of the file to inspect.
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: Summary text.
+             artifact: A list[dict] of file metadata.
+
+     Example:
+         content, artifact = get_file_info("/path/to/mydata.csv")
+     """
+     print(" * Tool: get_file_info")
+
+     import os
+     import time
+
+     # Ensure the file exists
+     if not os.path.isfile(file_path):
+         raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+     file_stats = os.stat(file_path)
+
+     # Construct the data dictionary
+     file_data = {
+         "file_name": os.path.basename(file_path),
+         "size_bytes": file_stats.st_size,
+         "modification_time": time.ctime(file_stats.st_mtime),
+         "absolute_path": os.path.abspath(file_path),
+     }
+
+     # Create a user-friendly summary (content)
+     content_str = (
+         f"File Name: {file_data['file_name']}\n"
+         f"Size (bytes): {file_data['size_bytes']}\n"
+         f"Last Modified: {file_data['modification_time']}\n"
+         f"Absolute Path: {file_data['absolute_path']}"
+     )
+
+     # Artifact should be a list of dict(s) to easily convert to a DataFrame
+     artifact = [file_data]
+
+     return content_str, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: search_files_by_pattern
+     Description:
+         Searches for files (optionally in subdirectories) that match a given
+         wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+         (1) content (str): A multiline summary of the matched files.
+         (2) artifact (List[Dict]): A list of dicts with file path info.
+
+     Args:
+         directory_path (str): Directory path to start searching from.
+         pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+         recursive (bool): Whether to search in subdirectories. Default is False.
+
+     Returns:
+         Tuple[str, List[Dict]]:
+             content: A user-friendly string showing matched file paths.
+             artifact: A list of dictionaries, each representing a matched file.
+
+     Example:
+         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+     """
+     print(" * Tool: search_files_by_pattern")
+
+     import os
+     import fnmatch
+
+     matched_files = []
+     if recursive:
+         for root, dirs, files in os.walk(directory_path):
+             for filename in files:
+                 if fnmatch.fnmatch(filename, pattern):
+                     matched_files.append(os.path.join(root, filename))
+     else:
+         # Non-recursive
+         for filename in os.listdir(directory_path):
+             full_path = os.path.join(directory_path, filename)
+             if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                 matched_files.append(full_path)
+
+     # Create a human-readable summary (content)
+     if matched_files:
+         lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+         for f in matched_files:
+             lines.append(f"  - {f}")
+         content = "\n".join(lines)
+     else:
+         content = f"No files found matching '{pattern}'."
+
+     # Create artifact as a list of dicts for DataFrame conversion
+     artifact = [{"file_path": path} for path in matched_files]
+
+     return content, artifact
+
+
+ # Loaders
+
+ def auto_load_file(file_path: str) -> pd.DataFrame:
+     """
+     Auto-loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     pd.DataFrame
+
+     Raises an exception if the extension is unsupported or the file cannot be
+     read, so callers (e.g. load_directory) can catch and report the error.
+     """
+     ext = file_path.split(".")[-1].lower()
+     if ext == "csv":
+         return load_csv(file_path)
+     elif ext in ["xlsx", "xls"]:
+         return load_excel(file_path)
+     elif ext == "json":
+         return load_json(file_path)
+     elif ext == "parquet":
+         return load_parquet(file_path)
+     elif ext == "pkl":
+         return load_pickle(file_path)
+     else:
+         # Raise instead of returning an error string: the function is annotated
+         # to return a DataFrame, and callers chain .to_dict() onto the result.
+         raise ValueError(f"Unsupported file extension: {ext}")
+
+ def load_csv(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_csv
+     Description: Loads a CSV file into a pandas DataFrame.
+     Args:
+         file_path (str): Path to the CSV file.
+     Returns:
+         pd.DataFrame
+     """
+     return pd.read_csv(file_path)
+
+ def load_excel(file_path: str, sheet_name=0) -> pd.DataFrame:
+     """
+     Tool: load_excel
+     Description: Loads an Excel file into a pandas DataFrame.
+     """
+     # Note: pandas returns a dict of DataFrames (one per sheet) when
+     # sheet_name=None; defaulting to the first sheet keeps the annotated
+     # return type honest.
+     return pd.read_excel(file_path, sheet_name=sheet_name)
+
+ def load_json(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_json
+     Description: Loads a JSON file (array of records) into a pandas DataFrame.
+     """
+     # For simple JSON arrays; NDJSON would require lines=True instead.
+     return pd.read_json(file_path, orient="records", lines=False)
+
+ def load_parquet(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_parquet
+     Description: Loads a Parquet file into a pandas DataFrame.
+     """
+     return pd.read_parquet(file_path)
+
+ def load_pickle(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_pickle
+     Description: Loads a Pickle file into a pandas DataFrame.
+     """
+     return pd.read_pickle(file_path)
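
The `content_and_artifact` tools above each return a user-facing string plus a structured payload. A minimal sketch of how they can be exercised, assuming the module path `ai_data_science_team.tools.data_loader` implied by the file list (the CSV path is hypothetical):

```python
import pandas as pd
from ai_data_science_team.tools.data_loader import load_file

# Direct invocation returns only the `content` half of the (content, artifact) tuple:
print(load_file.invoke({"file_path": "data/sales.csv"}))  # hypothetical file

# Invoking with a ToolCall-shaped dict yields a ToolMessage whose `.artifact`
# carries the structured payload (here, the DataFrame serialized via .to_dict()):
msg = load_file.invoke({
    "type": "tool_call",
    "id": "call_1",                       # any unique id
    "name": "load_file",
    "args": {"file_path": "data/sales.csv"},
})
df = pd.DataFrame(msg.artifact)           # rebuild the DataFrame from the artifact
```

This ToolCall/ToolMessage round trip is standard LangChain behavior for `response_format='content_and_artifact'` tools, and is presumably how the new `data_loader_tools_agent` hands loaded tables back to its graph state.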
ai_data_science_team/tools/dataframe.py (new file)
@@ -0,0 +1,139 @@
+ import io
+ import pandas as pd
+ from typing import Union, List, Dict
+
+ def get_dataframe_summary(
+     dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+     n_sample: int = 30,
+     skip_stats: bool = False,
+ ) -> List[str]:
+     """
+     Generate a summary for one or more DataFrames. Accepts a single DataFrame,
+     a list of DataFrames, or a dictionary mapping names to DataFrames.
+
+     Parameters
+     ----------
+     dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+         - Single DataFrame: produce a single summary (returned within a one-element list).
+         - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+         - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+     n_sample : int, default 30
+         Number of rows to display in the "Data (first n_sample rows)" section.
+     skip_stats : bool, default False
+         If True, skip the descriptive statistics and DataFrame info sections.
+
+     Example
+     -------
+     ``` python
+     import pandas as pd
+     from sklearn.datasets import load_iris
+     data = load_iris(as_frame=True)
+     dataframes = {
+         "iris": data.frame,
+         "iris_target": data.target,
+     }
+     summaries = get_dataframe_summary(dataframes)
+     print(summaries[0])
+     ```
+
+     Returns
+     -------
+     list of str
+         A list of summaries, one for each provided DataFrame. Each summary includes:
+         - Shape of the DataFrame (rows, columns)
+         - Column data types
+         - Missing value percentage
+         - Unique value counts
+         - First n_sample rows
+         - Descriptive statistics
+         - DataFrame info output
+     """
+
+     summaries = []
+
+     # --- Dictionary Case ---
+     if isinstance(dataframes, dict):
+         for dataset_name, df in dataframes.items():
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     # --- Single DataFrame Case ---
+     elif isinstance(dataframes, pd.DataFrame):
+         summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+     # --- List of DataFrames Case ---
+     elif isinstance(dataframes, list):
+         for idx, df in enumerate(dataframes):
+             dataset_name = f"Dataset_{idx}"
+             summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+     else:
+         raise TypeError(
+             "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+         )
+
+     return summaries
+
+
+ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+     """Generate a summary string for a single DataFrame."""
+     # 1. Convert dictionary-type cells to strings
+     #    (prevents unhashable-dict errors during df.nunique()).
+     df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+     # 2. Capture df.info() output
+     buffer = io.StringIO()
+     df.info(buf=buffer)
+     info_text = buffer.getvalue()
+
+     # 3. Calculate missing value stats
+     missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+     missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+     # 4. Get column data types
+     column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+     # 5. Get unique value counts
+     unique_counts = df.nunique()  # Will no longer fail on unhashable dicts
+     unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+     # 6. Generate the summary text
+     if not skip_stats:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+ Column Data Types:
+ {column_types}
+
+ Missing Value Percentage:
+ {missing_summary}
+
+ Unique Value Counts:
+ {unique_counts_summary}
+
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+
+ Data Description:
+ {df.describe().to_string()}
+
+ Data Info:
+ {info_text}
+ """
+     else:
+         summary_text = f"""
+ Dataset Name: {dataset_name}
+ ----------------------------
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+ Column Data Types:
+ {column_types}
+
+ Data (first {n_sample} rows):
+ {df.head(n_sample).to_string()}
+ """
+
+     return summary_text.strip()
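
For reference, a short usage sketch of `get_dataframe_summary` under the module path implied by the file list (`ai_data_science_team.tools.dataframe`); the sample frame is made up:

```python
import pandas as pd
from ai_data_science_team.tools.dataframe import get_dataframe_summary

orders = pd.DataFrame({
    "id": [1, 2, 3],
    "amount": [10.5, None, 7.25],
})

# A single DataFrame comes back as a one-element list of summary strings:
full = get_dataframe_summary(orders, n_sample=3)[0]
print(full)  # shape, dtypes, missing %, unique counts, head, describe(), info()

# skip_stats=True drops the describe()/info() sections, keeping the summary
# short enough to embed in an LLM prompt:
short = get_dataframe_summary({"orders": orders}, skip_stats=True)[0]
```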