ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +0 -1
  3. ai_data_science_team/agents/data_cleaning_agent.py +50 -39
  4. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +45 -50
  6. ai_data_science_team/agents/data_wrangling_agent.py +50 -49
  7. ai_data_science_team/agents/feature_engineering_agent.py +48 -67
  8. ai_data_science_team/agents/sql_database_agent.py +130 -76
  9. ai_data_science_team/ml_agents/__init__.py +2 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
  11. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  12. ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
  13. ai_data_science_team/parsers/__init__.py +0 -0
  14. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  15. ai_data_science_team/templates/__init__.py +1 -0
  16. ai_data_science_team/templates/agent_templates.py +78 -7
  17. ai_data_science_team/tools/data_loader.py +378 -0
  18. ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/sql.py +126 -0
  22. ai_data_science_team/{tools → utils}/regex.py +59 -1
  23. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
  24. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  25. ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
  26. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  27. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
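At a glance, 0.0.0.9010 adds two ML agents (h2o_ml_agent, mlflow_tools_agent), a data_loader_tools_agent backed by the new tools/data_loader.py, and new h2o.py, mlflow.py, and sql.py tool modules, while reorganizing internals: parsers.py moves into a new parsers package, regex.py and logging.py move into utils, and tools/metadata.py is renamed tools/dataframe.py. Code importing from the old paths will need updating; the sketch below maps the path changes implied by the renames in the file list. These new paths are inferred from the listing, not verified against the wheel:

    # Inferred import-path updates for 0.0.0.9010 (old -> new); verify
    # against the installed package before relying on them.
    #   ai_data_science_team.tools.parsers   -> ai_data_science_team.parsers.parsers
    #   ai_data_science_team.tools.regex     -> ai_data_science_team.utils.regex
    #   ai_data_science_team.tools.logging   -> ai_data_science_team.utils.logging
    #   ai_data_science_team.tools.metadata  -> ai_data_science_team.tools.dataframe
    from ai_data_science_team.tools.dataframe import get_dataframe_summary  # was tools.metadata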
--- /dev/null
+++ b/ai_data_science_team/tools/data_loader.py
@@ -0,0 +1,378 @@
+
+from langchain.tools import tool
+
+import pandas as pd
+
+from typing import Tuple, List, Dict
+
+
+@tool(response_format='content_and_artifact')
+def load_directory(dir_path: str) -> Tuple[str, Dict]:
+    """
+    Tool: load_directory
+    Description: Loads all recognized tabular files in a directory.
+
+    Parameters:
+    ----------
+    dir_path : str
+        The path to the directory to load.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of data frames.
+    """
+    print(" * Tool: load_directory")
+    import os
+    import pandas as pd
+    data_frames = {}
+    for filename in os.listdir(dir_path):
+        file_path = os.path.join(dir_path, filename)
+        # Skip directories
+        if os.path.isdir(file_path):
+            continue
+        try:
+            data_frames[filename] = auto_load_file(file_path).to_dict()
+        except Exception as e:
+            data_frames[filename] = f"Error loading file: {e}"
+    return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+@tool(response_format='content_and_artifact')
+def load_file(file_path: str) -> Tuple[str, Dict]:
+    """
+    Automatically loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of the data frame.
+    """
+    print(" * Tool: load_file")
+    return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+    """
+    Tool: list_directory_contents
+    Description: Lists all files and folders in the specified directory.
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+    Returns:
+        tuple:
+            - content (list[str]): A list of filenames/folders (suitable for display)
+            - artifact (list[dict]): A list of dictionaries where each dict has keys like {"filename": <name>}.
+              This structure can be easily converted to a pandas DataFrame.
+    """
+    print(" * Tool: list_directory_contents")
+    import os
+
+    items = []
+    for item in os.listdir(directory_path):
+        # If show_hidden is False, skip items starting with '.'
+        if not show_hidden and item.startswith('.'):
+            continue
+        items.append(item)
+
+    # content: just the raw list of filenames
+    content = items
+
+    # artifact: list of dicts (each row is {"filename": ...}), easily turned into a DataFrame
+    artifact = [{"filename": item} for item in items]
+
+    return content, artifact
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+    """
+    Tool: list_directory_recursive
+    Description:
+        Recursively lists all files and folders within the specified directory.
+        Returns a two-tuple:
+        (1) A human-readable tree representation of the directory (content).
+        (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+
+    Returns:
+        Tuple[str, List[dict]]:
+            content: A multiline string showing the directory tree.
+            artifact: A list of dictionaries, each with information about a file or directory.
+
+    Example:
+        content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+    """
+    print(" * Tool: list_directory_recursive")
+
+    # We'll store two things as we recurse:
+    # 1) lines for building the "tree" string
+    # 2) records in a list of dicts for easy DataFrame creation
+    import os
+
+    lines = []
+    records = []
+
+    def recurse(path: str, indent_level: int = 0):
+        # List items in the current directory
+        try:
+            items = os.listdir(path)
+        except PermissionError:
+            # If we don't have permission to read the directory, just note it.
+            lines.append(" " * indent_level + "[Permission Denied]")
+            return
+
+        # Sort items for a consistent order (optional)
+        items.sort()
+
+        for item in items:
+            if not show_hidden and item.startswith('.'):
+                continue
+
+            full_path = os.path.join(path, item)
+            # Build an indented prefix for the tree
+            prefix = " " * indent_level
+
+            if os.path.isdir(full_path):
+                # Directory
+                lines.append(f"{prefix}{item}/")
+                records.append({
+                    "type": "directory",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+                # Recursively descend
+                recurse(full_path, indent_level + 1)
+            else:
+                # File
+                lines.append(f"{prefix}- {item}")
+                records.append({
+                    "type": "file",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+
+    # Kick off recursion
+    if os.path.isdir(directory_path):
+        # Add the top-level directory to lines/records if you like
+        dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+        lines.append(f"{dir_name}/")  # Show the root as well
+        records.append({
+            "type": "directory",
+            "name": dir_name,
+            "parent_path": os.path.dirname(directory_path),
+            "absolute_path": os.path.abspath(directory_path)
+        })
+        recurse(directory_path, indent_level=1)
+    else:
+        # If the given path is not a directory, just return a note
+        lines.append(f"{directory_path} is not a directory.")
+        records.append({
+            "type": "error",
+            "name": directory_path,
+            "parent_path": None,
+            "absolute_path": os.path.abspath(directory_path)
+        })
+
+    # content: multiline string with the entire tree
+    content = "\n".join(lines)
+    # artifact: list of dicts, easily converted into a DataFrame
+    artifact = records
+
+    return content, artifact
+
+
+@tool(response_format='content_and_artifact')
+def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+    """
+    Tool: get_file_info
+    Description: Retrieves metadata (size, modification time, etc.) about a file.
+        Returns a tuple (content, artifact):
+        - content (str): A textual summary of the file info.
+        - artifact (List[Dict]): A list with a single dictionary of file metadata.
+          Useful for direct conversion into a DataFrame.
+    Args:
+        file_path (str): The path of the file to inspect.
+    Returns:
+        Tuple[str, List[dict]]:
+            content: Summary text
+            artifact: A list[dict] of file metadata
+    Example:
+        content, artifact = get_file_info("/path/to/mydata.csv")
+    """
+    print(" * Tool: get_file_info")
+
+    # Ensure the file exists
+    import os
+    import time
+
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+    file_stats = os.stat(file_path)
+
+    # Construct the data dictionary
+    file_data = {
+        "file_name": os.path.basename(file_path),
+        "size_bytes": file_stats.st_size,
+        "modification_time": time.ctime(file_stats.st_mtime),
+        "absolute_path": os.path.abspath(file_path),
+    }
+
+    # Create a user-friendly summary (content)
+    content_str = (
+        f"File Name: {file_data['file_name']}\n"
+        f"Size (bytes): {file_data['size_bytes']}\n"
+        f"Last Modified: {file_data['modification_time']}\n"
+        f"Absolute Path: {file_data['absolute_path']}"
+    )
+
+    # Artifact should be a list of dict(s) to easily convert to DataFrame
+    artifact = [file_data]
+
+    return content_str, artifact
+
+
+@tool(response_format='content_and_artifact')
+def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+    """
+    Tool: search_files_by_pattern
+    Description:
+        Searches for files (optionally in subdirectories) that match a given
+        wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+        (1) content (str): A multiline summary of the matched files.
+        (2) artifact (List[Dict]): A list of dicts with file path info.
+
+    Args:
+        directory_path (str): Directory path to start searching from.
+        pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+        recursive (bool): Whether to search in subdirectories. Default is False.
+
+    Returns:
+        Tuple[str, List[Dict]]:
+            content: A user-friendly string showing matched file paths.
+            artifact: A list of dictionaries, each representing a matched file.
+
+    Example:
+        content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+    """
+    print(" * Tool: search_files_by_pattern")
+
+    import os
+    import fnmatch
+
+    matched_files = []
+    if recursive:
+        for root, dirs, files in os.walk(directory_path):
+            for filename in files:
+                if fnmatch.fnmatch(filename, pattern):
+                    matched_files.append(os.path.join(root, filename))
+    else:
+        # Non-recursive
+        for filename in os.listdir(directory_path):
+            full_path = os.path.join(directory_path, filename)
+            if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                matched_files.append(full_path)
+
+    # Create a human-readable summary (content)
+    if matched_files:
+        lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+        for f in matched_files:
+            lines.append(f" - {f}")
+        content = "\n".join(lines)
+    else:
+        content = f"No files found matching '{pattern}'."
+
+    # Create artifact as a list of dicts for DataFrame conversion
+    artifact = [{"file_path": path} for path in matched_files]
+
+    return content, artifact
+
+
+# Loaders
+
+def auto_load_file(file_path: str) -> pd.DataFrame:
+    """
+    Auto loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    pd.DataFrame
+    """
+    import pandas as pd
+    try:
+        ext = file_path.split(".")[-1].lower()
+        if ext == "csv":
+            return load_csv(file_path)
+        elif ext in ["xlsx", "xls"]:
+            return load_excel(file_path)
+        elif ext == "json":
+            return load_json(file_path)
+        elif ext == "parquet":
+            return load_parquet(file_path)
+        elif ext == "pkl":
+            return load_pickle(file_path)
+        else:
+            return f"Unsupported file extension: {ext}"
+    except Exception as e:
+        return f"Error loading file: {e}"
+
+def load_csv(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_csv
+    Description: Loads a CSV file into a pandas DataFrame.
+    Args:
+        file_path (str): Path to the CSV file.
+    Returns:
+        pd.DataFrame
+    """
+    import pandas as pd
+    return pd.read_csv(file_path)
+
+def load_excel(file_path: str, sheet_name=None) -> pd.DataFrame:
+    """
+    Tool: load_excel
+    Description: Loads an Excel file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_excel(file_path, sheet_name=sheet_name)
+
+def load_json(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_json
+    Description: Loads a JSON file or NDJSON into a pandas DataFrame.
+    """
+    import pandas as pd
+    # For simple JSON arrays
+    return pd.read_json(file_path, orient="records", lines=False)
+
+def load_parquet(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_parquet
+    Description: Loads a Parquet file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_parquet(file_path)
+
+def load_pickle(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_pickle
+    Description: Loads a Pickle file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_pickle(file_path)
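The tools above use LangChain's content_and_artifact response format, so each returns a (content, artifact) pair: a string for the model and a structured payload for downstream code. A minimal usage sketch (the file name "data.csv" is illustrative; the invoke behavior follows LangChain's documented handling of content_and_artifact tools, where plain args return only the content and a ToolCall dict returns a ToolMessage carrying the artifact):

    import pandas as pd
    from ai_data_science_team.tools.data_loader import load_file

    # Invoked with plain args, a content_and_artifact tool returns just the content string.
    print(load_file.invoke({"file_path": "data.csv"}))

    # Invoked with a ToolCall dict, it returns a ToolMessage whose .artifact holds
    # the DataFrame serialized via .to_dict() (as in the tool body above).
    msg = load_file.invoke({
        "name": "load_file",
        "args": {"file_path": "data.csv"},
        "id": "call-1",
        "type": "tool_call",
    })
    df = pd.DataFrame(msg.artifact)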
--- a/ai_data_science_team/tools/metadata.py
+++ b/ai_data_science_team/tools/dataframe.py
@@ -1,6 +1,5 @@
 import io
 import pandas as pd
-import sqlalchemy as sql
 from typing import Union, List, Dict
 
 def get_dataframe_summary(
@@ -138,93 +137,3 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_
     return summary_text.strip()
 
 
-
-def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
-                          n_samples: int = 10) -> str:
-    """
-    Collects metadata and sample data from a database, with safe identifier quoting and
-    basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
-
-    Parameters
-    ----------
-    connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
-        An active SQLAlchemy connection or engine.
-    n_samples : int
-        Number of sample values to retrieve for each column.
-
-    Returns
-    -------
-    str
-        A formatted string with database metadata, including some sample data from each column.
-    """
-
-    # If a connection is passed, use it; if an engine is passed, connect to it
-    is_engine = isinstance(connection, sql.engine.base.Engine)
-    conn = connection.connect() if is_engine else connection
-
-    output = []
-    try:
-        # Grab the engine off the connection
-        sql_engine = conn.engine
-        dialect_name = sql_engine.dialect.name.lower()
-
-        output.append(f"Database Dialect: {sql_engine.dialect.name}")
-        output.append(f"Driver: {sql_engine.driver}")
-        output.append(f"Connection URL: {sql_engine.url}")
-
-        # Inspect the database
-        inspector = sql.inspect(sql_engine)
-        tables = inspector.get_table_names()
-        output.append(f"Tables: {tables}")
-        output.append(f"Schemas: {inspector.get_schema_names()}")
-
-        # Helper to build a dialect-specific limit clause
-        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
-            """
-            Returns a SQL query string to select N rows from the given column/table
-            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
-            """
-            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
-                # Common dialects supporting LIMIT
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-            elif "mssql" in dialect_name:
-                # Microsoft SQL Server syntax
-                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
-            elif "oracle" in dialect_name:
-                # Oracle syntax
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
-            else:
-                # Fallback
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-
-        # Prepare for quoting
-        preparer = inspector.bind.dialect.identifier_preparer
-
-        # For each table, get columns and sample data
-        for table_name in tables:
-            output.append(f"\nTable: {table_name}")
-            # Properly quote the table name
-            table_name_quoted = preparer.quote_identifier(table_name)
-
-            for column in inspector.get_columns(table_name):
-                col_name = column["name"]
-                col_type = column["type"]
-                output.append(f" Column: {col_name} Type: {col_type}")
-
-                # Properly quote the column name
-                col_name_quoted = preparer.quote_identifier(col_name)
-
-                # Build a dialect-aware query with safe quoting
-                query = build_query(col_name_quoted, table_name_quoted, n_samples)
-
-                # Read a few sample values
-                df = pd.read_sql(sql.text(query), conn)
-                first_values = df[col_name].tolist()
-                output.append(f" First {n_samples} Values: {first_values}")
-
-    finally:
-        # Close connection if created inside the function
-        if is_engine:
-            conn.close()
-
-    return "\n".join(output)