ai-data-science-team 0.0.0.9010__py3-none-any.whl → 0.0.0.9012__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +1 -0
- ai_data_science_team/agents/data_loader_tools_agent.py +210 -7
- ai_data_science_team/ds_agents/__init__.py +1 -0
- ai_data_science_team/ds_agents/eda_tools_agent.py +245 -0
- ai_data_science_team/ds_agents/modeling_tools_agent.py +0 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +2 -1
- ai_data_science_team/ml_agents/h2o_ml_tools_agent.py +0 -0
- ai_data_science_team/ml_agents/mlflow_tools_agent.py +32 -9
- ai_data_science_team/tools/data_loader.py +95 -25
- ai_data_science_team/tools/eda.py +293 -0
- ai_data_science_team/utils/html.py +27 -0
- ai_data_science_team/utils/matplotlib.py +46 -0
- {ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/METADATA +26 -9
- {ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/RECORD +18 -11
- {ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/top_level.txt +0 -0
ai_data_science_team/tools/data_loader.py
CHANGED
@@ -1,41 +1,77 @@
 
 from langchain.tools import tool
+from langgraph.prebuilt import InjectedState
 
 import pandas as pd
+import os
 
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Optional, Annotated
 
 
 @tool(response_format='content_and_artifact')
-def load_directory(directory_path: str) -> Tuple[str, Dict]:
+def load_directory(
+    directory_path: str = os.getcwd(),
+    file_type: Optional[str] = None
+) -> Tuple[str, Dict]:
     """
     Tool: load_directory
-    Description: Loads all recognized tabular files in a directory.
+    Description: Loads all recognized tabular files in a directory.
+        If file_type is specified (e.g., 'csv'), only files
+        with that extension are loaded.
 
     Parameters:
     ----------
-    directory_path : str
-        The path to the directory to load.
+    directory_path : str
+        The path to the directory to load. Defaults to the current working directory.
+
+    file_type : str, optional
+        The extension of the file type you want to load exclusively
+        (e.g., 'csv', 'xlsx', 'parquet'). If None or not provided,
+        attempts to load all recognized tabular files.
 
     Returns:
     -------
     Tuple[str, Dict]
         A tuple containing a message and a dictionary of data frames.
     """
-    print(" * Tool: load_directory")
+    print(f" * Tool: load_directory | {directory_path}")
+
     import os
     import pandas as pd
+
+    if directory_path is None:
+        return "No directory path provided.", {}
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", {}
+
     data_frames = {}
-    for filename in os.listdir(directory_path):
-        file_path = os.path.join(directory_path, filename)
+
+    for filename in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, filename)
+
         # Skip directories
         if os.path.isdir(file_path):
             continue
+
+        # If file_type is specified, only process files that match.
+        if file_type:
+            # Make sure extension check is case-insensitive
+            if not filename.lower().endswith(f".{file_type.lower()}"):
+                continue
+
         try:
+            # Attempt to auto-detect and load the file
             data_frames[filename] = auto_load_file(file_path).to_dict()
         except Exception as e:
+            # If loading fails, record the error message
             data_frames[filename] = f"Error loading file: {e}"
-    return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+    return (
+        f"Returned the following data frames: {list(data_frames.keys())}",
+        data_frames
+    )
+
 
 @tool(response_format='content_and_artifact')
 def load_file(file_path: str) -> Tuple[str, Dict]:
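A minimal usage sketch for the updated `load_directory` tool (the `./data` path and `csv` filter are illustrative, and `.func` is used here to call the undecorated function directly rather than going through a LangGraph tool call):

```python
import pandas as pd
from ai_data_science_team.tools.data_loader import load_directory

# Hypothetical directory and filter; both arguments now have defaults.
content, artifact = load_directory.func(directory_path="./data", file_type="csv")
print(content)  # "Returned the following data frames: [...]"

# Each artifact value is a DataFrame serialized via .to_dict(), so it
# round-trips back into pandas. String values are per-file error messages.
frames = {name: pd.DataFrame(d) for name, d in artifact.items()
          if not isinstance(d, str)}
```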
@@ -52,12 +88,15 @@ def load_file(file_path: str) -> Tuple[str, Dict]:
     Tuple[str, Dict]
         A tuple containing a message and a dictionary of the data frame.
     """
-    print(" * Tool: load_file")
+    print(f" * Tool: load_file | {file_path}")
     return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
 
 
 @tool(response_format='content_and_artifact')
-def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+def list_directory_contents(
+    directory_path: str = os.getcwd(),
+    show_hidden: bool = False
+) -> Tuple[List[str], List[Dict]]:
     """
     Tool: list_directory_contents
     Description: Lists all files and folders in the specified directory.
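The `load_file` change is print-only, but its return contract matters for the tools below: the artifact is one frame serialized with `.to_dict()`. A sketch of the round trip (the file path is hypothetical):

```python
import pandas as pd
from ai_data_science_team.tools.data_loader import load_file

content, artifact = load_file.func(file_path="data/churn_data.csv")  # hypothetical file
df = pd.DataFrame(artifact)  # rebuild the DataFrame from the dict artifact
```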
@@ -67,30 +106,51 @@ def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
     Returns:
     tuple:
         - content (list[str]): A list of filenames/folders (suitable for display)
-        - artifact (list[dict]): A list of dictionaries where each dict
-
+        - artifact (list[dict]): A list of dictionaries where each dict includes
+          the keys {"filename": <name>, "type": <'file' or 'directory'>}.
+          This structure can be easily converted to a pandas DataFrame.
     """
-    print(" * Tool: list_directory_contents")
+    print(f" * Tool: list_directory_contents | {directory_path}")
     import os
-
+
+    if directory_path is None:
+        return "No directory path provided.", []
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", []
+
     items = []
     for item in os.listdir(directory_path):
         # If show_hidden is False, skip items starting with '.'
         if not show_hidden and item.startswith('.'):
             continue
         items.append(item)
+    items.reverse()
 
-    # content: just the raw list of
-    content = items
-
-
-
+    # content: just the raw list of item names (files/folders).
+    content = items.copy()
+
+    content.append(f"Total items: {len(items)}")
+    content.append(f"Directory: {directory_path}")
+
+    # artifact: list of dicts with both "filename" and "type" keys.
+    artifact = []
+    for item in items:
+        item_path = os.path.join(directory_path, item)
+        artifact.append({
+            "filename": item,
+            "type": "directory" if os.path.isdir(item_path) else "file"
+        })
 
     return content, artifact
 
 
+
 @tool(response_format='content_and_artifact')
-def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+def list_directory_recursive(
+    directory_path: str = os.getcwd(),
+    show_hidden: bool = False
+) -> Tuple[str, List[Dict]]:
     """
     Tool: list_directory_recursive
     Description:
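The docstring's claim that the artifact converts cleanly to pandas holds because each record is a flat dict. A sketch, again calling the tool outside a graph via `.func`:

```python
import pandas as pd
from ai_data_science_team.tools.data_loader import list_directory_contents

content, artifact = list_directory_contents.func(directory_path=".", show_hidden=False)
# content ends with two appended summary strings ("Total items: ...",
# "Directory: ..."); artifact is a list of {"filename", "type"} records:
df = pd.DataFrame(artifact)
```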
@@ -111,13 +171,19 @@ def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
     Example:
         content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
     """
-    print(" * Tool: list_directory_recursive")
+    print(f" * Tool: list_directory_recursive | {directory_path}")
 
     # We'll store two things as we recurse:
     # 1) lines for building the "tree" string
     # 2) records in a list of dicts for easy DataFrame creation
     import os
 
+    if directory_path is None:
+        return "No directory path provided.", {}
+
+    if not os.path.isdir(directory_path):
+        return f"Directory not found: {directory_path}", {}
+
     lines = []
     records = []
 
@@ -210,7 +276,7 @@ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
     Example:
         content, artifact = get_file_info("/path/to/mydata.csv")
     """
-    print(" * Tool: get_file_info")
+    print(f" * Tool: get_file_info | {file_path}")
 
     # Ensure the file exists
     import os
@@ -244,7 +310,11 @@ def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
 
 
 @tool(response_format='content_and_artifact')
-def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+def search_files_by_pattern(
+    directory_path: str = os.getcwd(),
+    pattern: str = "*.csv",
+    recursive: bool = False
+) -> Tuple[str, List[Dict]]:
     """
     Tool: search_files_by_pattern
     Description:
@@ -266,7 +336,7 @@ def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
     Example:
         content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
     """
-    print(" * Tool: search_files_by_pattern")
+    print(f" * Tool: search_files_by_pattern | {directory_path}")
 
     import os
     import fnmatch
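For reference, the `fnmatch` module the tool imports implements shell-style wildcard matching, which is what gives `pattern="*.csv"` its meaning. A standalone sketch of that matching rule:

```python
import fnmatch

names = ["sales.csv", "notes.txt", "archive.CSV"]
# fnmatch.filter applies os.path.normcase, so matching is case-insensitive
# on Windows but case-sensitive on POSIX:
print(fnmatch.filter(names, "*.csv"))  # ['sales.csv'] on POSIX
```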
ai_data_science_team/tools/eda.py
ADDED
@@ -0,0 +1,293 @@
+
+from typing import Annotated, Dict, Tuple, Union
+
+import os
+
+from langchain.tools import tool
+
+from langgraph.prebuilt import InjectedState
+
+
+@tool(response_format='content_and_artifact')
+def describe_dataset(
+    data_raw: Annotated[dict, InjectedState("data_raw")]
+) -> Tuple[str, Dict]:
+    """
+    Tool: describe_dataset
+    Description:
+        Describe the dataset by computing summary
+        statistics using the DataFrame's describe() method.
+
+    Returns:
+    -------
+    Tuple[str, Dict]:
+        content: A textual summary of the DataFrame's descriptive statistics.
+        artifact: A dictionary (from DataFrame.describe()) for further inspection.
+    """
+    print(" * Tool: describe_dataset")
+    import pandas as pd
+    df = pd.DataFrame(data_raw)
+    description_df = df.describe(include='all')
+    content = "Summary statistics computed using pandas describe()."
+    artifact = description_df.to_dict()
+    return content, artifact
+
+
+@tool(response_format='content_and_artifact')
+def visualize_missing(
+    data_raw: Annotated[dict, InjectedState("data_raw")],
+    n_sample: int = None
+) -> Tuple[str, Dict]:
+    """
+    Tool: visualize_missing
+    Description:
+        Missing value analysis using the missingno library. Generates a matrix plot, bar plot, and heatmap plot.
+
+    Parameters:
+    -----------
+    data_raw : dict
+        The raw data in dictionary format.
+    n_sample : int, optional (default: None)
+        The number of rows to sample from the dataset if it is large.
+
+    Returns:
+    -------
+    Tuple[str, Dict]:
+        content: A message describing the generated plots.
+        artifact: A dict with keys 'matrix_plot', 'bar_plot', and 'heatmap_plot' each containing the
+            corresponding base64 encoded PNG image.
+    """
+    print(" * Tool: visualize_missing")
+
+    try:
+        import missingno as msno  # Ensure missingno is installed
+    except ImportError:
+        raise ImportError("Please install the 'missingno' package to use this tool. pip install missingno")
+
+    import pandas as pd
+    import base64
+    from io import BytesIO
+    import matplotlib.pyplot as plt
+
+    # Create the DataFrame and sample if n_sample is provided.
+    df = pd.DataFrame(data_raw)
+    if n_sample is not None:
+        df = df.sample(n=n_sample, random_state=42)
+
+    # Dictionary to store the base64 encoded images for each plot.
+    encoded_plots = {}
+
+    # Define a helper function to create a plot, save it, and encode it.
+    def create_and_encode_plot(plot_func, plot_name: str):
+        plt.figure(figsize=(8, 6))
+        # Call the missingno plotting function.
+        plot_func(df)
+        plt.tight_layout()
+        buf = BytesIO()
+        plt.savefig(buf, format="png")
+        plt.close()
+        buf.seek(0)
+        return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+    # Create and encode the matrix plot.
+    encoded_plots["matrix_plot"] = create_and_encode_plot(msno.matrix, "matrix")
+
+    # Create and encode the bar plot.
+    encoded_plots["bar_plot"] = create_and_encode_plot(msno.bar, "bar")
+
+    # Create and encode the heatmap plot.
+    encoded_plots["heatmap_plot"] = create_and_encode_plot(msno.heatmap, "heatmap")
+
+    content = "Missing data visualizations (matrix, bar, and heatmap) have been generated."
+    artifact = encoded_plots
+    return content, artifact
+
+
+
+@tool(response_format='content_and_artifact')
+def correlation_funnel(
+    data_raw: Annotated[dict, InjectedState("data_raw")],
+    target: str,
+    target_bin_index: Union[int, str] = -1,
+    corr_method: str = "pearson",
+    n_bins: int = 4,
+    thresh_infreq: float = 0.01,
+    name_infreq: str = "-OTHER",
+) -> Tuple[str, Dict]:
+    """
+    Tool: correlation_funnel
+    Description:
+        Correlation analysis using the correlation funnel method. The tool binarizes the data and computes correlation versus a target column.
+
+    Parameters:
+    ----------
+    target : str
+        The base target column name (e.g., 'Member_Status'). The tool will look for columns that begin
+        with this string followed by '__' (e.g., 'Member_Status__Gold', 'Member_Status__Platinum').
+    target_bin_index : int or str, default -1
+        If an integer, selects the target level by position from the matching columns.
+        If a string (e.g., "Yes"), attempts to match to the suffix of a column name
+        (i.e., 'target__Yes').
+    corr_method : str
+        The correlation method ('pearson', 'kendall', or 'spearman'). Default is 'pearson'.
+    n_bins : int
+        The number of bins to use for binarization. Default is 4.
+    thresh_infreq : float
+        The threshold for infrequent levels. Default is 0.01.
+    name_infreq : str
+        The name to use for infrequent levels. Default is '-OTHER'.
+    """
+    print(" * Tool: correlation_funnel")
+    try:
+        import pytimetk as tk
+    except ImportError:
+        raise ImportError("Please install the 'pytimetk' package to use this tool. pip install pytimetk")
+    import pandas as pd
+    import base64
+    from io import BytesIO
+    import matplotlib.pyplot as plt
+    import json
+    import plotly.graph_objects as go
+    import plotly.io as pio
+    from typing import Union
+
+    # Convert the raw injected state into a DataFrame.
+    df = pd.DataFrame(data_raw)
+
+    # Apply the binarization method.
+    df_binarized = df.binarize(
+        n_bins=n_bins,
+        thresh_infreq=thresh_infreq,
+        name_infreq=name_infreq,
+        one_hot=True
+    )
+
+    # Determine the full target column name.
+    # Look for all columns that start with "target__"
+    matching_columns = [col for col in df_binarized.columns if col.startswith(f"{target}__")]
+    if not matching_columns:
+        # If no matching columns are found, warn and use the provided target as-is.
+        full_target = target
+    else:
+        # Determine the full target based on target_bin_index.
+        if isinstance(target_bin_index, str):
+            # Build the candidate column name
+            candidate = f"{target}__{target_bin_index}"
+            if candidate in matching_columns:
+                full_target = candidate
+            else:
+                # If no matching candidate is found, default to the last matching column.
+                full_target = matching_columns[-1]
+        else:
+            # target_bin_index is an integer.
+            try:
+                full_target = matching_columns[target_bin_index]
+            except IndexError:
+                # If index is out of bounds, use the last matching column.
+                full_target = matching_columns[-1]
+
+    # Compute correlation funnel using the full target column name.
+    df_correlated = df_binarized.correlate(target=full_target, method=corr_method)
+
+    # Attempt to generate a static plot.
+    try:
+        # Here we assume that your DataFrame has a method plot_correlation_funnel.
+        fig = df_correlated.plot_correlation_funnel(engine='plotnine', height=600)
+        buf = BytesIO()
+        # Use the appropriate save method for your figure object.
+        fig.save(buf, format="png")
+        plt.close()
+        buf.seek(0)
+        encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
+    except Exception as e:
+        encoded = {"error": str(e)}
+
+    # Attempt to generate a Plotly plot.
+    try:
+        fig = df_correlated.plot_correlation_funnel(engine='plotly')
+        fig_json = pio.to_json(fig)
+        fig_dict = json.loads(fig_json)
+    except Exception as e:
+        fig_dict = {"error": str(e)}
+
+    content = (f"Correlation funnel computed using method '{corr_method}' for target level '{full_target}'. "
+               f"Base target was '{target}' with target_bin_index '{target_bin_index}'.")
+    artifact = {
+        "correlation_data": df_correlated.to_dict(orient="list"),
+        "plot_image": encoded,
+        "plotly_figure": fig_dict,
+    }
+    return content, artifact
+
+
+
+@tool(response_format='content_and_artifact')
+def generate_sweetviz_report(
+    data_raw: Annotated[dict, InjectedState("data_raw")],
+    target: str = None,
+    report_name: str = "sweetviz_report.html",
+    report_directory: str = os.path.join(os.getcwd(), "reports"),
+    open_browser: bool = True,
+) -> Tuple[str, Dict]:
+    """
+    Tool: generate_sweetviz_report
+    Description:
+        Make an Exploratory Data Analysis (EDA) report using the Sweetviz library.
+
+    Parameters:
+    -----------
+    data_raw : dict
+        The raw data injected as a dictionary (converted from a DataFrame).
+    target : str, optional
+        The target feature to analyze. Default is None.
+    report_name : str, optional
+        The file name to save the Sweetviz HTML report. Default is "sweetviz_report.html".
+    report_directory : str, optional
+        The directory where the report should be saved. Defaults to a 'reports' directory in the current working directory.
+    open_browser : bool, optional
+        Whether to open the report in a web browser. Default is True.
+
+    Returns:
+    --------
+    Tuple[str, Dict]:
+        content: A summary message describing the generated report.
+        artifact: A dictionary with the report file path and optionally the report's HTML content.
+    """
+    print(" * Tool: generate_sweetviz_report")
+    try:
+        import sweetviz as sv
+    except ImportError:
+        raise ImportError("Please install the 'sweetviz' package to use this tool. Run: pip install sweetviz")
+    import pandas as pd
+    # Convert injected raw data to a DataFrame.
+    df = pd.DataFrame(data_raw)
+
+    # Create the Sweetviz report.
+    report = sv.analyze(df, target_feat=target)
+
+    # Ensure the directory exists; default is os.getcwd()/reports
+    if not os.path.exists(report_directory):
+        os.makedirs(report_directory)
+
+    # Determine the full path for the report.
+    full_report_path = os.path.join(report_directory, report_name)
+
+    # Save the report to the specified HTML file.
+    report.show_html(
+        filepath=full_report_path,
+        open_browser=True,
+    )
+
+    # Optionally, read the HTML content (if desired to pass along in the artifact).
+    try:
+        with open(full_report_path, "r", encoding="utf-8") as f:
+            html_content = f.read()
+    except Exception:
+        html_content = None
+
+    content = f"Sweetviz EDA report generated and saved as '{os.path.abspath(full_report_path)}'."
+    artifact = {
+        "report_file": os.path.abspath(full_report_path),
+        "report_html": html_content,
+    }
+    return content, artifact
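Since `visualize_missing` returns its three plots as base64-encoded PNG strings, consumers only need the standard library to materialize them. A sketch (the CSV path is hypothetical; calling `.func` with `data_raw` passed explicitly bypasses the `InjectedState` plumbing):

```python
import base64
import pandas as pd
from ai_data_science_team.tools.eda import visualize_missing

df = pd.read_csv("data/churn_data.csv")  # hypothetical dataset
content, artifact = visualize_missing.func(data_raw=df.to_dict())

# Write one of the encoded plots back out as a viewable PNG file.
with open("missing_matrix.png", "wb") as f:
    f.write(base64.b64decode(artifact["matrix_plot"]))
```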
ai_data_science_team/utils/html.py
ADDED
@@ -0,0 +1,27 @@
+
+
+import webbrowser
+import os
+
+def open_html_file_in_browser(file_path: str):
+    """
+    Opens an HTML file in the default web browser.
+
+    Parameters:
+    -----------
+    file_path : str
+        The file path or URL of the HTML file to open.
+
+    Returns:
+    --------
+    None
+    """
+    # Check if the file exists if a local path is provided.
+    if os.path.isfile(file_path):
+        # Convert file path to a file URL
+        file_url = 'file://' + os.path.abspath(file_path)
+    else:
+        # If the file doesn't exist locally, assume it's a URL
+        file_url = file_path
+
+    webbrowser.open(file_url)
ai_data_science_team/utils/matplotlib.py
ADDED
@@ -0,0 +1,46 @@
+import base64
+from io import BytesIO
+import matplotlib.pyplot as plt
+from PIL import Image
+
+def matplotlib_from_base64(encoded: str, title: str = None, figsize: tuple = (8, 6)):
+    """
+    Convert a base64-encoded image to a matplotlib plot and display it.
+
+    Parameters:
+    -----------
+    encoded : str
+        The base64-encoded image string.
+    title : str, optional
+        A title for the plot. Default is None.
+    figsize : tuple, optional
+        Figure size (width, height) for the plot. Default is (8, 6).
+
+    Returns:
+    --------
+    fig, ax : tuple
+        The matplotlib figure and axes objects.
+    """
+    # Decode the base64 string to bytes
+    img_data = base64.b64decode(encoded)
+
+    # Load the bytes data into a BytesIO buffer
+    buf = BytesIO(img_data)
+
+    # Open the image using Pillow
+    img = Image.open(buf)
+
+    # Create a matplotlib figure and axis
+    fig, ax = plt.subplots(figsize=figsize)
+
+    # Display the image
+    ax.imshow(img)
+    ax.axis('off')  # Hide the axis
+
+    if title:
+        ax.set_title(title)
+
+    # Show the plot
+    plt.show()
+
+    return fig, ax
{ai_data_science_team-0.0.0.9010.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: ai-data-science-team
-Version: 0.0.0.9010
+Version: 0.0.0.9012
 Summary: Build and run an AI-powered data science team.
 Home-page: https://github.com/business-science/ai-data-science-team
 Author: Matt Dancho
@@ -31,9 +31,16 @@ Requires-Dist: psutil
 Provides-Extra: machine-learning
 Requires-Dist: h2o; extra == "machine-learning"
 Requires-Dist: mlflow; extra == "machine-learning"
+Provides-Extra: data-science
+Requires-Dist: pytimetk; extra == "data-science"
+Requires-Dist: missingno; extra == "data-science"
+Requires-Dist: sweetviz; extra == "data-science"
 Provides-Extra: all
 Requires-Dist: h2o; extra == "all"
 Requires-Dist: mlflow; extra == "all"
+Requires-Dist: pytimetk; extra == "all"
+Requires-Dist: missingno; extra == "all"
+Requires-Dist: sweetviz; extra == "all"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
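Assuming standard pip extras syntax, the new optional dependency group installs with `pip install "ai-data-science-team[data-science]"` (or `[all]`, which now also pulls in pytimetk, missingno, and sweetviz).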
@@ -59,6 +66,8 @@ Dynamic: summary
 <a href="https://pypi.python.org/pypi/ai-data-science-team"><img src="https://img.shields.io/pypi/v/ai-data-science-team.svg?style=for-the-badge" alt="PyPI"></a>
 <a href="https://github.com/business-science/ai-data-science-team"><img src="https://img.shields.io/pypi/pyversions/ai-data-science-team.svg?style=for-the-badge" alt="versions"></a>
 <a href="https://github.com/business-science/ai-data-science-team/blob/main/LICENSE"><img src="https://img.shields.io/github/license/business-science/ai-data-science-team.svg?style=for-the-badge" alt="license"></a>
+<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/business-science/ai-data-science-team?style=for-the-badge">
+
 </div>
 
 
@@ -93,8 +102,9 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
 - [Apps Available Now](#apps-available-now)
 - [🔥 Agentic Applications](#-agentic-applications)
 - [Agents Available Now](#agents-available-now)
+- [Standard Agents](#standard-agents)
 - [🔥🔥 NEW! Machine Learning Agents](#-new-machine-learning-agents)
-- [Data Science Agents](#data-science-agents)
+- [🔥 NEW! Data Science Agents](#-new-data-science-agents)
 - [Multi-Agents](#multi-agents)
 - [Agents Coming Soon](#agents-coming-soon)
 - [Disclaimer](#disclaimer)
@@ -122,7 +132,7 @@ If you're an aspiring data scientist who wants to learn how to build AI Agents a
 
 This project is a work in progress. New data science agents will be released soon.
 
-![Data Science Team](/img/ai_data_science_team.jpg)
+![Data Science Team](/img/ai_data_science_team.jpg?raw=true)
 
 ### NEW: Multi-Agents
 
@@ -146,18 +156,25 @@ This is a top secret project I'm working on. It's a multi-agent data science app
 
 ### Agents Available Now
 
+#### Standard Agents
+
+1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_wrangling_agent.ipynb)
+2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_visualization_agent.ipynb)
+3. **🔥 Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_cleaning_agent.ipynb)
+4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/feature_engineering_agent.ipynb)
+5. **🔥 SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/sql_database_agent.ipynb)
+6. **🔥 Data Loader Tools Agent:** Loads data from various sources including CSV, Excel, Parquet, and Pickle files. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_loader_tools_agent.ipynb)
+
+
 #### 🔥🔥 NEW! Machine Learning Agents
 
 1. **🔥 H2O Machine Learning Agent:** Builds and logs 100's of high-performance machine learning models. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/h2o_machine_learning_agent.ipynb)
 2. **🔥 MLflow Tools Agent (MLOps):** This agent has 11+ tools for managing models, ML projects, and making production ML predictions with MLflow. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/mlflow_tools_agent.ipynb)
 
-#### Data Science Agents
+#### 🔥 NEW! Data Science Agents
+
+1. **🔥🔥 EDA Tools Agent:** Performs automated exploratory data analysis (EDA) with EDA Reporting, Missing Data Analysis, Correlation Analysis, and more. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ds_agents/eda_tools_agent.ipynb)
 
-1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_wrangling_agent.ipynb)
-2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_visualization_agent.ipynb)
-3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_cleaning_agent.ipynb)
-4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/feature_engineering_agent.ipynb)
-5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/sql_database_agent.ipynb)
 
 #### Multi-Agents
 