ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

Files changed (29)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +0 -1
  3. ai_data_science_team/agents/data_cleaning_agent.py +50 -39
  4. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +45 -50
  6. ai_data_science_team/agents/data_wrangling_agent.py +50 -49
  7. ai_data_science_team/agents/feature_engineering_agent.py +48 -67
  8. ai_data_science_team/agents/sql_database_agent.py +130 -76
  9. ai_data_science_team/ml_agents/__init__.py +2 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
  11. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  12. ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
  13. ai_data_science_team/parsers/__init__.py +0 -0
  14. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  15. ai_data_science_team/templates/__init__.py +1 -0
  16. ai_data_science_team/templates/agent_templates.py +78 -7
  17. ai_data_science_team/tools/data_loader.py +378 -0
  18. ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/sql.py +126 -0
  22. ai_data_science_team/{tools → utils}/regex.py +59 -1
  23. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
  24. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  25. ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
  26. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  27. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
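
Two of these changes are shown in full below: the new ai_data_science_team/tools/data_loader.py module, a set of LangChain tools for discovering and loading tabular files, followed by the tools/metadata.py → tools/dataframe.py rename, which drops that module's SQLAlchemy import and its get_database_metadata helper.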
@@ -0,0 +1,378 @@
+
+ from langchain.tools import tool
+
+ import pandas as pd
+
+ from typing import Tuple, List, Dict
+
+
+ @tool(response_format='content_and_artifact')
+ def load_directory(dir_path: str) -> Tuple[str, Dict]:
+     """
+     Tool: load_directory
+     Description: Loads all recognized tabular files in a directory.
+
+     Parameters:
+     ----------
+     dir_path : str
+         The path to the directory to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of data frames.
+     """
+     print(" * Tool: load_directory")
+     import os
+     import pandas as pd
+     data_frames = {}
+     for filename in os.listdir(dir_path):
+         file_path = os.path.join(dir_path, filename)
+         # Skip directories
+         if os.path.isdir(file_path):
+             continue
+         try:
+             data_frames[filename] = auto_load_file(file_path).to_dict()
+         except Exception as e:
+             data_frames[filename] = f"Error loading file: {e}"
+     return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+ @tool(response_format='content_and_artifact')
+ def load_file(file_path: str) -> Tuple[str, Dict]:
+     """
+     Automatically loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     Tuple[str, Dict]
+         A tuple containing a message and a dictionary of the data frame.
+     """
+     print(" * Tool: load_file")
+     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
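
Each tool above is declared with response_format='content_and_artifact', so it hands back a (content, artifact) pair: the string goes to the model, the dictionary to downstream code. A minimal sketch of retrieving both through LangChain's tool-call path (the CSV path is a hypothetical example); since the artifact is produced by DataFrame.to_dict(), pandas round-trips it:

    import pandas as pd
    from ai_data_science_team.tools.data_loader import load_file

    # Plain invocation returns only the content string.
    content = load_file.invoke({"file_path": "data/churn.csv"})

    # Invoking with a ToolCall-shaped dict returns a ToolMessage whose
    # .artifact carries the dictionary produced by df.to_dict().
    tool_msg = load_file.invoke({
        "name": "load_file",
        "args": {"file_path": "data/churn.csv"},
        "id": "call_1",  # any unique id
        "type": "tool_call",
    })
    df = pd.DataFrame(tool_msg.artifact)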
+ @tool(response_format='content_and_artifact')
+ def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+     """
+     Tool: list_directory_contents
+     Description: Lists all files and folders in the specified directory.
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+     Returns:
+         tuple:
+             - content (list[str]): A list of filenames/folders (suitable for display)
+             - artifact (list[dict]): A list of dictionaries where each dict has keys like {"filename": <name>}.
+               This structure can be easily converted to a pandas DataFrame.
+     """
+     print(" * Tool: list_directory_contents")
+     import os
+
+     items = []
+     for item in os.listdir(directory_path):
+         # If show_hidden is False, skip items starting with '.'
+         if not show_hidden and item.startswith('.'):
+             continue
+         items.append(item)
+
+     # content: just the raw list of filenames
+     content = items
+
+     # artifact: list of dicts (each row is {"filename": ...}), easily turned into a DataFrame
+     artifact = [{"filename": item} for item in items]
+
+     return content, artifact
+
+
+ @tool(response_format='content_and_artifact')
+ def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: list_directory_recursive
+     Description:
+         Recursively lists all files and folders within the specified directory.
+         Returns a two-tuple:
+         (1) A human-readable tree representation of the directory (content).
+         (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+     Args:
+         directory_path (str): The path of the directory to list.
+         show_hidden (bool): Whether to include hidden files (default: False).
+
+     Returns:
+         Tuple[str, List[dict]]:
+             content: A multiline string showing the directory tree.
+             artifact: A list of dictionaries, each with information about a file or directory.
+
+     Example:
+         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+     """
+     print(" * Tool: list_directory_recursive")
+
+     # We'll store two things as we recurse:
+     # 1) lines for building the "tree" string
+     # 2) records in a list of dicts for easy DataFrame creation
+     import os
+
+     lines = []
+     records = []
+
+     def recurse(path: str, indent_level: int = 0):
+         # List items in the current directory
+         try:
+             items = os.listdir(path)
+         except PermissionError:
+             # If we don't have permission to read the directory, just note it.
+             lines.append(" " * indent_level + "[Permission Denied]")
+             return
+
+         # Sort items for a consistent order (optional)
+         items.sort()
+
+         for item in items:
+             if not show_hidden and item.startswith('.'):
+                 continue
+
+             full_path = os.path.join(path, item)
+             # Build an indented prefix for the tree
+             prefix = " " * indent_level
+
+             if os.path.isdir(full_path):
+                 # Directory
+                 lines.append(f"{prefix}{item}/")
+                 records.append({
+                     "type": "directory",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+                 # Recursively descend
+                 recurse(full_path, indent_level + 1)
+             else:
+                 # File
+                 lines.append(f"{prefix}- {item}")
+                 records.append({
+                     "type": "file",
+                     "name": item,
+                     "parent_path": path,
+                     "absolute_path": full_path
+                 })
+
+     # Kick off recursion
+     if os.path.isdir(directory_path):
+         # Add the top-level directory to lines/records if you like
+         dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+         lines.append(f"{dir_name}/")  # Show the root as well
+         records.append({
+             "type": "directory",
+             "name": dir_name,
+             "parent_path": os.path.dirname(directory_path),
+             "absolute_path": os.path.abspath(directory_path)
+         })
+         recurse(directory_path, indent_level=1)
+     else:
+         # If the given path is not a directory, just return a note
+         lines.append(f"{directory_path} is not a directory.")
+         records.append({
+             "type": "error",
+             "name": directory_path,
+             "parent_path": None,
+             "absolute_path": os.path.abspath(directory_path)
+         })
+
+     # content: multiline string with the entire tree
+     content = "\n".join(lines)
+     # artifact: list of dicts, easily converted into a DataFrame
+     artifact = records
+
+     return content, artifact
+
+
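
Both directory listers return an artifact that is one pd.DataFrame(...) call away from a table. A small usage sketch, assuming a hypothetical ./data folder and calling the wrapped function directly via .func to skip the tool plumbing:

    import pandas as pd
    from ai_data_science_team.tools.data_loader import list_directory_recursive

    # .func is the undecorated callable behind the StructuredTool.
    tree, records = list_directory_recursive.func("./data", show_hidden=False)
    print(tree)  # human-readable tree

    inventory = pd.DataFrame(records)
    files_only = inventory[inventory["type"] == "file"]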
+ @tool(response_format='content_and_artifact')
+ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+     """
+     Tool: get_file_info
+     Description: Retrieves metadata (size, modification time, etc.) about a file.
+         Returns a tuple (content, artifact):
+         - content (str): A textual summary of the file info.
+         - artifact (List[Dict]): A list with a single dictionary of file metadata.
+           Useful for direct conversion into a DataFrame.
+     Args:
+         file_path (str): The path of the file to inspect.
+     Returns:
+         Tuple[str, List[dict]]:
+             content: Summary text
+             artifact: A list[dict] of file metadata
+     Example:
+         content, artifact = get_file_info("/path/to/mydata.csv")
+     """
+     print(" * Tool: get_file_info")
+
+     # Ensure the file exists
+     import os
+     import time
+
+     if not os.path.isfile(file_path):
+         raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+     file_stats = os.stat(file_path)
+
+     # Construct the data dictionary
+     file_data = {
+         "file_name": os.path.basename(file_path),
+         "size_bytes": file_stats.st_size,
+         "modification_time": time.ctime(file_stats.st_mtime),
+         "absolute_path": os.path.abspath(file_path),
+     }
+
+     # Create a user-friendly summary (content)
+     content_str = (
+         f"File Name: {file_data['file_name']}\n"
+         f"Size (bytes): {file_data['size_bytes']}\n"
+         f"Last Modified: {file_data['modification_time']}\n"
+         f"Absolute Path: {file_data['absolute_path']}"
+     )
+
+     # Artifact should be a list of dict(s) to easily convert to DataFrame
+     artifact = [file_data]
+
+     return content_str, artifact
+
+
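
Since get_file_info returns its artifact as a one-row list of dicts, metadata for several files concatenates naturally into one frame. A sketch (paths hypothetical):

    import pandas as pd
    from ai_data_science_team.tools.data_loader import get_file_info

    rows = []
    for path in ["data/churn.csv", "data/orders.parquet"]:
        _, artifact = get_file_info.func(path)  # one-row list[dict]
        rows.extend(artifact)

    # Columns: file_name, size_bytes, modification_time, absolute_path
    file_table = pd.DataFrame(rows)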
+ @tool(response_format='content_and_artifact')
+ def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+     """
+     Tool: search_files_by_pattern
+     Description:
+         Searches for files (optionally in subdirectories) that match a given
+         wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+         (1) content (str): A multiline summary of the matched files.
+         (2) artifact (List[Dict]): A list of dicts with file path info.
+
+     Args:
+         directory_path (str): Directory path to start searching from.
+         pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+         recursive (bool): Whether to search in subdirectories. Default is False.
+
+     Returns:
+         Tuple[str, List[Dict]]:
+             content: A user-friendly string showing matched file paths.
+             artifact: A list of dictionaries, each representing a matched file.
+
+     Example:
+         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+     """
+     print(" * Tool: search_files_by_pattern")
+
+     import os
+     import fnmatch
+
+     matched_files = []
+     if recursive:
+         for root, dirs, files in os.walk(directory_path):
+             for filename in files:
+                 if fnmatch.fnmatch(filename, pattern):
+                     matched_files.append(os.path.join(root, filename))
+     else:
+         # Non-recursive
+         for filename in os.listdir(directory_path):
+             full_path = os.path.join(directory_path, filename)
+             if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                 matched_files.append(full_path)
+
+     # Create a human-readable summary (content)
+     if matched_files:
+         lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+         for f in matched_files:
+             lines.append(f" - {f}")
+         content = "\n".join(lines)
+     else:
+         content = f"No files found matching '{pattern}'."
+
+     # Create artifact as a list of dicts for DataFrame conversion
+     artifact = [{"file_path": path} for path in matched_files]
+
+     return content, artifact
+
+
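
A natural pairing is to search for matches and feed them straight to the loaders defined below. A sketch under the same assumptions (hypothetical ./data folder):

    from ai_data_science_team.tools.data_loader import (
        auto_load_file,
        search_files_by_pattern,
    )

    summary, matches = search_files_by_pattern.func("./data", pattern="*.csv", recursive=True)
    print(summary)

    # Each artifact row is {"file_path": ...}; load every match.
    frames = {m["file_path"]: auto_load_file(m["file_path"]) for m in matches}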
+ # Loaders
+
+ def auto_load_file(file_path: str) -> pd.DataFrame:
+     """
+     Auto loads a file based on its extension.
+
+     Parameters:
+     ----------
+     file_path : str
+         The path to the file to load.
+
+     Returns:
+     -------
+     pd.DataFrame
+
+     Raises:
+     ------
+     ValueError
+         If the file extension is not supported.
+     """
+     ext = file_path.split(".")[-1].lower()
+     if ext == "csv":
+         return load_csv(file_path)
+     elif ext in ["xlsx", "xls"]:
+         return load_excel(file_path)
+     elif ext == "json":
+         return load_json(file_path)
+     elif ext == "parquet":
+         return load_parquet(file_path)
+     elif ext == "pkl":
+         return load_pickle(file_path)
+     else:
+         # Raising keeps the return type honest (pd.DataFrame): returning an
+         # error string would make callers' .to_dict() fail with an unrelated
+         # AttributeError. load_directory already wraps this call in
+         # try/except, so failures surface cleanly there.
+         raise ValueError(f"Unsupported file extension: {ext}")
+
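
auto_load_file raises ValueError for anything outside the csv/xlsx/xls/json/parquet/pkl set, so callers can tell "unsupported format" apart from a parse failure. A quick sketch (file name hypothetical):

    from ai_data_science_team.tools.data_loader import auto_load_file

    try:
        df = auto_load_file("notes.txt")
    except ValueError as e:
        print(e)  # Unsupported file extension: txt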
+ def load_csv(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_csv
+     Description: Loads a CSV file into a pandas DataFrame.
+     Args:
+         file_path (str): Path to the CSV file.
+     Returns:
+         pd.DataFrame
+     """
+     import pandas as pd
+     return pd.read_csv(file_path)
+
+ def load_excel(file_path: str, sheet_name=0) -> pd.DataFrame:
+     """
+     Tool: load_excel
+     Description: Loads an Excel file into a pandas DataFrame.
+     """
+     import pandas as pd
+     # sheet_name=0 loads the first sheet; sheet_name=None would make pandas
+     # return a dict of all sheets rather than a single DataFrame.
+     return pd.read_excel(file_path, sheet_name=sheet_name)
+
+ def load_json(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_json
+     Description: Loads a JSON array file into a pandas DataFrame.
+         (With lines=False this reads a plain JSON array, not NDJSON;
+         pass lines=True to read newline-delimited JSON.)
+     """
+     import pandas as pd
+     # For simple JSON arrays
+     return pd.read_json(file_path, orient="records", lines=False)
+
+ def load_parquet(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_parquet
+     Description: Loads a Parquet file into a pandas DataFrame.
+     """
+     import pandas as pd
+     return pd.read_parquet(file_path)
+
+ def load_pickle(file_path: str) -> pd.DataFrame:
+     """
+     Tool: load_pickle
+     Description: Loads a Pickle file into a pandas DataFrame.
+     """
+     import pandas as pd
+     return pd.read_pickle(file_path)
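
The new data_loader_tools_agent.py (+69 in the file list) presumably wires these tools to an LLM; independent of that agent, a minimal sketch of binding them to any tool-calling chat model with stock LangChain (the model name is an arbitrary choice, not dictated by the package):

    from langchain_openai import ChatOpenAI
    from ai_data_science_team.tools.data_loader import (
        get_file_info,
        list_directory_contents,
        list_directory_recursive,
        load_directory,
        load_file,
        search_files_by_pattern,
    )

    tools = [load_directory, load_file, list_directory_contents,
             list_directory_recursive, get_file_info, search_files_by_pattern]

    llm = ChatOpenAI(model="gpt-4o-mini")  # assumed model; any tool-calling LLM works
    llm_with_tools = llm.bind_tools(tools)

    ai_msg = llm_with_tools.invoke("What CSV files are in the data/ folder?")
    print(ai_msg.tool_calls)  # the tool invocations the model requested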
@@ -1,6 +1,5 @@
  import io
  import pandas as pd
- import sqlalchemy as sql
  from typing import Union, List, Dict

  def get_dataframe_summary(
@@ -138,93 +137,3 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_
      return summary_text.strip()


-
- def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
-                           n_samples: int = 10) -> str:
-     """
-     Collects metadata and sample data from a database, with safe identifier quoting and
-     basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
-
-     Parameters
-     ----------
-     connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
-         An active SQLAlchemy connection or engine.
-     n_samples : int
-         Number of sample values to retrieve for each column.
-
-     Returns
-     -------
-     str
-         A formatted string with database metadata, including some sample data from each column.
-     """
-
-     # If a connection is passed, use it; if an engine is passed, connect to it
-     is_engine = isinstance(connection, sql.engine.base.Engine)
-     conn = connection.connect() if is_engine else connection
-
-     output = []
-     try:
-         # Grab the engine off the connection
-         sql_engine = conn.engine
-         dialect_name = sql_engine.dialect.name.lower()
-
-         output.append(f"Database Dialect: {sql_engine.dialect.name}")
-         output.append(f"Driver: {sql_engine.driver}")
-         output.append(f"Connection URL: {sql_engine.url}")
-
-         # Inspect the database
-         inspector = sql.inspect(sql_engine)
-         tables = inspector.get_table_names()
-         output.append(f"Tables: {tables}")
-         output.append(f"Schemas: {inspector.get_schema_names()}")
-
-         # Helper to build a dialect-specific limit clause
-         def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
-             """
-             Returns a SQL query string to select N rows from the given column/table
-             across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
-             """
-             if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
-                 # Common dialects supporting LIMIT
-                 return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-             elif "mssql" in dialect_name:
-                 # Microsoft SQL Server syntax
-                 return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
-             elif "oracle" in dialect_name:
-                 # Oracle syntax
-                 return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
-             else:
-                 # Fallback
-                 return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-
-         # Prepare for quoting
-         preparer = inspector.bind.dialect.identifier_preparer
-
-         # For each table, get columns and sample data
-         for table_name in tables:
-             output.append(f"\nTable: {table_name}")
-             # Properly quote the table name
-             table_name_quoted = preparer.quote_identifier(table_name)
-
-             for column in inspector.get_columns(table_name):
-                 col_name = column["name"]
-                 col_type = column["type"]
-                 output.append(f" Column: {col_name} Type: {col_type}")
-
-                 # Properly quote the column name
-                 col_name_quoted = preparer.quote_identifier(col_name)
-
-                 # Build a dialect-aware query with safe quoting
-                 query = build_query(col_name_quoted, table_name_quoted, n_samples)
-
-                 # Read a few sample values
-                 df = pd.read_sql(sql.text(query), conn)
-                 first_values = df[col_name].tolist()
-                 output.append(f" First {n_samples} Values: {first_values}")
-
-     finally:
-         # Close connection if created inside the function
-         if is_engine:
-             conn.close()
-
-     return "\n".join(output)
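
Note that the file list above also adds ai_data_science_team/tools/sql.py (+126), which is presumably the new home of this database-metadata logic; removing it here is what lets the renamed tools/dataframe.py drop its SQLAlchemy import.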