ai-data-science-team 0.0.0.9011__py3-none-any.whl → 0.0.0.9012__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/ds_agents/__init__.py +1 -0
- ai_data_science_team/ds_agents/eda_tools_agent.py +245 -0
- ai_data_science_team/ds_agents/modeling_tools_agent.py +0 -0
- ai_data_science_team/tools/eda.py +293 -0
- ai_data_science_team/utils/html.py +27 -0
- ai_data_science_team/utils/matplotlib.py +46 -0
- {ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/METADATA +19 -5
- {ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/RECORD +12 -6
- {ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.0.9011"
|
1
|
+
__version__ = "0.0.0.9012"
|
@@ -0,0 +1 @@
|
|
1
|
+
from ai_data_science_team.ds_agents.eda_tools_agent import EDAToolsAgent, make_eda_tools_agent
|
@@ -0,0 +1,245 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
from typing import Any, Optional, Annotated, Sequence, List, Dict, Tuple
|
4
|
+
import operator
|
5
|
+
import pandas as pd
|
6
|
+
import os
|
7
|
+
from io import StringIO, BytesIO
|
8
|
+
import base64
|
9
|
+
import matplotlib.pyplot as plt
|
10
|
+
|
11
|
+
from IPython.display import Markdown
|
12
|
+
|
13
|
+
from langchain_core.messages import BaseMessage, AIMessage
|
14
|
+
from langgraph.prebuilt import create_react_agent, ToolNode
|
15
|
+
from langgraph.prebuilt.chat_agent_executor import AgentState
|
16
|
+
from langgraph.graph import START, END, StateGraph
|
17
|
+
|
18
|
+
from ai_data_science_team.templates import BaseAgent
|
19
|
+
from ai_data_science_team.utils.regex import format_agent_name
|
20
|
+
|
21
|
+
from ai_data_science_team.tools.eda import (
|
22
|
+
describe_dataset,
|
23
|
+
visualize_missing,
|
24
|
+
correlation_funnel,
|
25
|
+
generate_sweetviz_report,
|
26
|
+
)
|
27
|
+
|
28
|
+
|
29
|
+
# Identifier passed to format_agent_name() and used as the role of the final AI message.
AGENT_NAME = "exploratory_data_analyst_agent"

# Tools handed to the ToolNode of the EDA agent, in registration order.
EDA_TOOLS = [describe_dataset, visualize_missing, correlation_funnel, generate_sweetviz_report]
|
38
|
+
|
39
|
+
class EDAToolsAgent(BaseAgent):
    """
    An Exploratory Data Analysis Tools Agent that interacts with EDA tools to generate summary statistics,
    missing data visualizations, correlation funnels, EDA reports, etc.

    Parameters:
    ----------
    model : langchain.llms.base.LLM
        The language model for generating the tool-calling agent.
    create_react_agent_kwargs : dict, optional
        Additional kwargs for create_react_agent. ``None`` (the default) is
        treated as an empty dict.
    invoke_react_agent_kwargs : dict, optional
        Additional kwargs for agent invocation. ``None`` (the default) is
        treated as an empty dict.
    """

    def __init__(
        self,
        model: Any,
        create_react_agent_kwargs: Optional[Dict] = None,
        invoke_react_agent_kwargs: Optional[Dict] = None,
    ):
        # Defaults are None rather than `{}` so that no dict object is shared
        # between instances; each agent stores its own copy.
        self._params = {
            "model": model,
            "create_react_agent_kwargs": dict(create_react_agent_kwargs or {}),
            "invoke_react_agent_kwargs": dict(invoke_react_agent_kwargs or {}),
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Creates the compiled state graph for the EDA agent.

        Any previously stored response is discarded because it was produced
        by a graph built from the old parameters.
        """
        self.response = None
        return make_eda_tools_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        user_instructions: str = None,
        data_raw: pd.DataFrame = None,
        **kwargs
    ):
        """
        Asynchronously runs the agent with user instructions and data.

        The result is stored on ``self.response``; nothing is returned.

        Parameters:
        ----------
        user_instructions : str, optional
            The instructions for the agent.
        data_raw : pd.DataFrame, optional
            The input data as a DataFrame.
        **kwargs
            Forwarded to the compiled graph's ``ainvoke``.
        """
        response = await self._compiled_graph.ainvoke(
            {
                "user_instructions": user_instructions,
                # The graph state carries the data as a plain dict.
                "data_raw": data_raw.to_dict() if data_raw is not None else None,
            },
            **kwargs
        )
        self.response = response
        return None

    def invoke_agent(
        self,
        user_instructions: str = None,
        data_raw: pd.DataFrame = None,
        **kwargs
    ):
        """
        Synchronously runs the agent with user instructions and data.

        The result is stored on ``self.response``; nothing is returned.

        Parameters:
        ----------
        user_instructions : str, optional
            The instructions for the agent.
        data_raw : pd.DataFrame, optional
            The input data as a DataFrame.
        **kwargs
            Forwarded to the compiled graph's ``invoke``.
        """
        response = self._compiled_graph.invoke(
            {
                "user_instructions": user_instructions,
                # The graph state carries the data as a plain dict.
                "data_raw": data_raw.to_dict() if data_raw is not None else None,
            },
            **kwargs
        )
        self.response = response
        return None

    def get_internal_messages(self, markdown: bool = False):
        """
        Returns internal messages from the agent response.

        With ``markdown=True`` the messages are rendered into a single
        IPython ``Markdown`` object; otherwise the raw message list is
        returned. Returns None when the agent has not produced a response yet.
        """
        if not self.response:
            return None
        internal_messages = self.response["internal_messages"]
        if not markdown:
            return internal_messages
        # Only build the formatted text when it is actually requested.
        pretty_print = "\n\n".join(
            f"### {msg.type.upper()}\n\nID: {msg.id}\n\nContent:\n\n{msg.content}"
            for msg in internal_messages
        )
        return Markdown(pretty_print)

    def get_artifacts(self, as_dataframe: bool = False):
        """
        Returns the EDA artifacts from the agent response.

        With ``as_dataframe=True`` the artifact is wrapped in a DataFrame.
        Returns None when the agent has not produced a response yet.
        """
        if not self.response:
            return None
        artifacts = self.response["eda_artifacts"]
        return pd.DataFrame(artifacts) if as_dataframe else artifacts

    def get_ai_message(self, markdown: bool = False):
        """
        Returns the AI message from the agent response.

        With ``markdown=True`` the content is wrapped in an IPython
        ``Markdown`` object. Returns None when the agent has not produced a
        response yet.
        """
        if not self.response:
            return None
        content = self.response["messages"][0].content
        return Markdown(content) if markdown else content
|
165
|
+
|
166
|
+
def make_eda_tools_agent(
    model: Any,
    create_react_agent_kwargs: Optional[Dict] = None,
    invoke_react_agent_kwargs: Optional[Dict] = None,
):
    """
    Creates an Exploratory Data Analyst Agent that can interact with EDA tools.

    Parameters:
    ----------
    model : Any
        The language model used for tool-calling.
    create_react_agent_kwargs : dict, optional
        Additional kwargs for create_react_agent. ``None`` (the default) is
        treated as an empty dict.
    invoke_react_agent_kwargs : dict, optional
        Additional kwargs for agent invocation. ``None`` (the default) is
        treated as an empty dict.

    Returns:
    -------
    app : langgraph.graph.CompiledStateGraph
        The compiled state graph for the EDA agent.
    """
    # Normalise once: the annotations allow None, and a `{}` default would be
    # one dict object shared by every call.
    create_react_agent_kwargs = create_react_agent_kwargs or {}
    invoke_react_agent_kwargs = invoke_react_agent_kwargs or {}

    class GraphState(AgentState):
        # operator.add is the reducer for this field, so updates are concatenated.
        internal_messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        data_raw: dict
        eda_artifacts: dict

    def exploratory_agent(state):
        """Run the ReAct tool-calling agent and condense its output into the graph state."""
        print(format_agent_name(AGENT_NAME))
        print(" * RUN REACT TOOL-CALLING AGENT FOR EDA")

        tool_node = ToolNode(
            tools=EDA_TOOLS
        )

        eda_agent = create_react_agent(
            model,
            tools=tool_node,
            state_schema=GraphState,
            **create_react_agent_kwargs,
        )

        response = eda_agent.invoke(
            {
                "messages": [("user", state["user_instructions"])],
                "data_raw": state["data_raw"],
            },
            # Passed positionally, i.e. as the second argument of invoke().
            invoke_react_agent_kwargs,
        )

        print(" * POST-PROCESSING EDA RESULTS")

        internal_messages = response['messages']
        if not internal_messages:
            return {"internal_messages": [], "eda_artifacts": None}

        # The final message is re-wrapped so it carries this agent's name as its role.
        last_ai_message = AIMessage(internal_messages[-1].content, role=AGENT_NAME)

        # The artifact, if any, is read from the message just before the final one.
        last_tool_artifact = None
        if len(internal_messages) > 1:
            last_message = internal_messages[-2]
            if hasattr(last_message, "artifact"):
                last_tool_artifact = last_message.artifact
            elif isinstance(last_message, dict) and "artifact" in last_message:
                last_tool_artifact = last_message["artifact"]

        return {
            "messages": [last_ai_message],
            "internal_messages": internal_messages,
            "eda_artifacts": last_tool_artifact,
        }

    # Single-node graph: START -> exploratory_agent -> END.
    workflow = StateGraph(GraphState)
    workflow.add_node("exploratory_agent", exploratory_agent)
    workflow.add_edge(START, "exploratory_agent")
    workflow.add_edge("exploratory_agent", END)

    app = workflow.compile()
    return app
|
File without changes
|
@@ -0,0 +1,293 @@
|
|
1
|
+
|
2
|
+
from typing import Annotated, Dict, Tuple, Union
|
3
|
+
|
4
|
+
import os
|
5
|
+
|
6
|
+
from langchain.tools import tool
|
7
|
+
|
8
|
+
from langgraph.prebuilt import InjectedState
|
9
|
+
|
10
|
+
|
11
|
+
@tool(response_format='content_and_artifact')
def describe_dataset(
    data_raw: Annotated[dict, InjectedState("data_raw")]
) -> Tuple[str, Dict]:
    """
    Tool: describe_dataset
    Description:
        Describe the dataset by computing summary
        statistics using the DataFrame's describe() method.

    Returns:
    -------
    Tuple[str, Dict]:
        content: A textual summary of the DataFrame's descriptive statistics.
        artifact: A dictionary (from DataFrame.describe()) for further inspection.
    """
    # NOTE(review): the docstring is presumably what langchain's @tool exposes
    # as the tool description, so its wording is kept unchanged.
    print(" * Tool: describe_dataset")
    import pandas as pd

    # include='all' asks describe() to cover every column, not only numeric ones.
    summary = pd.DataFrame(data_raw).describe(include='all')
    return "Summary statistics computed using pandas describe().", summary.to_dict()
|
34
|
+
|
35
|
+
|
36
|
+
@tool(response_format='content_and_artifact')
def visualize_missing(
    data_raw: Annotated[dict, InjectedState("data_raw")],
    n_sample: int = None
) -> Tuple[str, Dict]:
    """
    Tool: visualize_missing
    Description:
        Missing value analysis using the missingno library. Generates a matrix plot, bar plot, and heatmap plot.

    Parameters:
    -----------
    data_raw : dict
        The raw data in dictionary format.
    n_sample : int, optional (default: None)
        The number of rows to sample from the dataset if it is large.

    Returns:
    -------
    Tuple[str, Dict]:
        content: A message describing the generated plots.
        artifact: A dict with keys 'matrix_plot', 'bar_plot', and 'heatmap_plot' each containing the
                  corresponding base64 encoded PNG image.
    """
    # NOTE(review): the docstring is presumably what langchain's @tool exposes
    # as the tool description, so its wording is kept unchanged.
    print(" * Tool: visualize_missing")

    try:
        import missingno as msno  # Ensure missingno is installed
    except ImportError as exc:
        raise ImportError(
            "Please install the 'missingno' package to use this tool. pip install missingno"
        ) from exc

    import pandas as pd
    import base64
    from io import BytesIO
    import matplotlib.pyplot as plt

    # Create the DataFrame and sample if n_sample is provided.
    df = pd.DataFrame(data_raw)
    if n_sample is not None:
        # Clamp to the number of rows: sampling more rows than exist without
        # replacement raises a ValueError.
        df = df.sample(n=min(n_sample, len(df)), random_state=42)

    def create_and_encode_plot(plot_func):
        """Render ``plot_func(df)`` and return the PNG as a base64 string."""
        figures_before = set(plt.get_fignums())
        plt.figure(figsize=(8, 6))
        try:
            # Call the missingno plotting function.
            plot_func(df)
            plt.tight_layout()
            buf = BytesIO()
            plt.savefig(buf, format="png")
        finally:
            # plot_func may draw on a figure of its own instead of the one
            # opened above, so close every figure created during this call,
            # also when plotting fails.
            for fig_num in set(plt.get_fignums()) - figures_before:
                plt.close(fig_num)
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    # Base64 encoded images, one entry per plot type.
    encoded_plots = {
        "matrix_plot": create_and_encode_plot(msno.matrix),
        "bar_plot": create_and_encode_plot(msno.bar),
        "heatmap_plot": create_and_encode_plot(msno.heatmap),
    }

    content = "Missing data visualizations (matrix, bar, and heatmap) have been generated."
    artifact = encoded_plots
    return content, artifact
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
@tool(response_format='content_and_artifact')
def correlation_funnel(
    data_raw: Annotated[dict, InjectedState("data_raw")],
    target: str,
    target_bin_index: Union[int, str] = -1,
    corr_method: str = "pearson",
    n_bins: int = 4,
    thresh_infreq: float = 0.01,
    name_infreq: str = "-OTHER",
) -> Tuple[str, Dict]:
    """
    Tool: correlation_funnel
    Description:
        Correlation analysis using the correlation funnel method. The tool binarizes the data and computes correlation versus a target column.

    Parameters:
    ----------
    target : str
        The base target column name (e.g., 'Member_Status'). The tool will look for columns that begin
        with this string followed by '__' (e.g., 'Member_Status__Gold', 'Member_Status__Platinum').
    target_bin_index : int or str, default -1
        If an integer, selects the target level by position from the matching columns.
        If a string (e.g., "Yes"), attempts to match to the suffix of a column name
        (i.e., 'target__Yes').
    corr_method : str
        The correlation method ('pearson', 'kendall', or 'spearman'). Default is 'pearson'.
    n_bins : int
        The number of bins to use for binarization. Default is 4.
    thresh_infreq : float
        The threshold for infrequent levels. Default is 0.01.
    name_infreq : str
        The name to use for infrequent levels. Default is '-OTHER'.
    """
    # NOTE(review): the docstring is presumably what langchain's @tool exposes
    # as the tool description, so its wording is kept unchanged.
    print(" * Tool: correlation_funnel")
    try:
        # NOTE(review): `tk` is never referenced; presumably importing pytimetk
        # is what provides DataFrame.binarize/correlate - confirm.
        import pytimetk as tk
    except ImportError:
        raise ImportError("Please install the 'pytimetk' package to use this tool. pip install pytimetk")
    import pandas as pd
    import base64
    from io import BytesIO
    import matplotlib.pyplot as plt
    import json
    import plotly.graph_objects as go
    import plotly.io as pio

    # Rebuild the frame from the injected state and one-hot binarize it.
    binarized = pd.DataFrame(data_raw).binarize(
        n_bins=n_bins,
        thresh_infreq=thresh_infreq,
        name_infreq=name_infreq,
        one_hot=True
    )

    # Resolve which binarized column stands for the requested target level.
    prefix = f"{target}__"
    level_columns = [name for name in binarized.columns if name.startswith(prefix)]

    if not level_columns:
        # Nothing carries the prefix: use the provided target as-is.
        full_target = target
    elif isinstance(target_bin_index, str):
        # A string names the level; an unknown level falls back to the last column.
        wanted = f"{target}__{target_bin_index}"
        full_target = wanted if wanted in level_columns else level_columns[-1]
    else:
        # Otherwise treat it as a position; out of range falls back to the last column.
        try:
            full_target = level_columns[target_bin_index]
        except IndexError:
            full_target = level_columns[-1]

    df_correlated = binarized.correlate(target=full_target, method=corr_method)

    # Static (plotnine) rendering; on failure the error text is stored instead.
    try:
        static_fig = df_correlated.plot_correlation_funnel(engine='plotnine', height=600)
        png_buffer = BytesIO()
        static_fig.save(png_buffer, format="png")
        plt.close()
        encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    except Exception as e:
        encoded = {"error": str(e)}

    # Interactive (Plotly) rendering, round-tripped through JSON into a plain dict.
    try:
        plotly_fig = df_correlated.plot_correlation_funnel(engine='plotly')
        fig_dict = json.loads(pio.to_json(plotly_fig))
    except Exception as e:
        fig_dict = {"error": str(e)}

    content = (f"Correlation funnel computed using method '{corr_method}' for target level '{full_target}'. "
               f"Base target was '{target}' with target_bin_index '{target_bin_index}'.")
    artifact = {
        "correlation_data": df_correlated.to_dict(orient="list"),
        "plot_image": encoded,
        "plotly_figure": fig_dict,
    }
    return content, artifact
|
221
|
+
|
222
|
+
|
223
|
+
|
224
|
+
@tool(response_format='content_and_artifact')
def generate_sweetviz_report(
    data_raw: Annotated[dict, InjectedState("data_raw")],
    target: str = None,
    report_name: str = "sweetviz_report.html",
    # NOTE: this default is evaluated once, when the function is defined, so it
    # reflects the working directory at that moment.
    report_directory: str = os.path.join(os.getcwd(), "reports"),
    open_browser: bool = True,
) -> Tuple[str, Dict]:
    """
    Tool: generate_sweetviz_report
    Description:
        Make an Exploratory Data Analysis (EDA) report using the Sweetviz library.

    Parameters:
    -----------
    data_raw : dict
        The raw data injected as a dictionary (converted from a DataFrame).
    target : str, optional
        The target feature to analyze. Default is None.
    report_name : str, optional
        The file name to save the Sweetviz HTML report. Default is "sweetviz_report.html".
    report_directory : str, optional
        The directory where the report should be saved. Defaults to a 'reports' directory in the current working directory.
    open_browser : bool, optional
        Whether to open the report in a web browser. Default is True.

    Returns:
    --------
    Tuple[str, Dict]:
        content: A summary message describing the generated report.
        artifact: A dictionary with the report file path and optionally the report's HTML content.
    """
    # NOTE(review): the docstring is presumably what langchain's @tool exposes
    # as the tool description, so its wording is kept unchanged.
    print(" * Tool: generate_sweetviz_report")
    try:
        import sweetviz as sv
    except ImportError as exc:
        raise ImportError(
            "Please install the 'sweetviz' package to use this tool. Run: pip install sweetviz"
        ) from exc
    import pandas as pd

    # Convert injected raw data to a DataFrame.
    df = pd.DataFrame(data_raw)

    # Create the Sweetviz report.
    report = sv.analyze(df, target_feat=target)

    # Ensure the output directory exists.
    os.makedirs(report_directory, exist_ok=True)

    # Determine the full path for the report.
    full_report_path = os.path.join(report_directory, report_name)

    # Save the report, honouring the caller's open_browser choice (this was
    # previously hard-coded to True, so the argument had no effect).
    report.show_html(
        filepath=full_report_path,
        open_browser=open_browser,
    )

    # Best effort: read the HTML back so it can travel in the artifact.
    try:
        with open(full_report_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    except (OSError, UnicodeDecodeError):
        html_content = None

    absolute_path = os.path.abspath(full_report_path)
    content = f"Sweetviz EDA report generated and saved as '{absolute_path}'."
    artifact = {
        "report_file": absolute_path,
        "report_html": html_content,
    }
    return content, artifact
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
import webbrowser
|
4
|
+
import os
|
5
|
+
|
6
|
+
def open_html_file_in_browser(file_path: str):
    """
    Opens an HTML file in the default web browser.

    Parameters:
    -----------
    file_path : str
        The file path or URL of the HTML file to open.

    Returns:
    --------
    None
    """
    # Imported locally so the function does not depend on a module-level import.
    from pathlib import Path

    if os.path.isfile(file_path):
        # Build a proper file URI: as_uri() handles the platform path syntax
        # and percent-encodes characters such as spaces, which plain
        # 'file://' + path concatenation does not.
        file_url = Path(os.path.abspath(file_path)).as_uri()
    else:
        # If the file doesn't exist locally, assume it's a URL
        file_url = file_path

    webbrowser.open(file_url)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import base64
|
2
|
+
from io import BytesIO
|
3
|
+
import matplotlib.pyplot as plt
|
4
|
+
from PIL import Image
|
5
|
+
|
6
|
+
def matplotlib_from_base64(encoded: str, title: str = None, figsize: tuple = (8, 6)):
    """
    Render a base64-encoded image on a matplotlib figure and show it.

    Parameters:
    -----------
    encoded : str
        The base64-encoded image string.
    title : str, optional
        A title for the plot. Default is None.
    figsize : tuple, optional
        Figure size (width, height) for the plot. Default is (8, 6).

    Returns:
    --------
    fig, ax : tuple
        The matplotlib figure and axes objects.
    """
    # base64 text -> raw bytes -> in-memory file -> Pillow image
    image = Image.open(BytesIO(base64.b64decode(encoded)))

    fig, ax = plt.subplots(figsize=figsize)

    # Draw the picture without axis decorations.
    ax.imshow(image)
    ax.axis('off')

    # A missing or empty title leaves the axes untitled.
    if title:
        ax.set_title(title)

    plt.show()

    return fig, ax
|
{ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: ai-data-science-team
|
3
|
-
Version: 0.0.0.9011
|
3
|
+
Version: 0.0.0.9012
|
4
4
|
Summary: Build and run an AI-powered data science team.
|
5
5
|
Home-page: https://github.com/business-science/ai-data-science-team
|
6
6
|
Author: Matt Dancho
|
@@ -31,9 +31,16 @@ Requires-Dist: psutil
|
|
31
31
|
Provides-Extra: machine-learning
|
32
32
|
Requires-Dist: h2o; extra == "machine-learning"
|
33
33
|
Requires-Dist: mlflow; extra == "machine-learning"
|
34
|
+
Provides-Extra: data-science
|
35
|
+
Requires-Dist: pytimetk; extra == "data-science"
|
36
|
+
Requires-Dist: missingno; extra == "data-science"
|
37
|
+
Requires-Dist: sweetviz; extra == "data-science"
|
34
38
|
Provides-Extra: all
|
35
39
|
Requires-Dist: h2o; extra == "all"
|
36
40
|
Requires-Dist: mlflow; extra == "all"
|
41
|
+
Requires-Dist: pytimetk; extra == "all"
|
42
|
+
Requires-Dist: missingno; extra == "all"
|
43
|
+
Requires-Dist: sweetviz; extra == "all"
|
37
44
|
Dynamic: author
|
38
45
|
Dynamic: author-email
|
39
46
|
Dynamic: classifier
|
@@ -59,6 +66,8 @@ Dynamic: summary
|
|
59
66
|
<a href="https://pypi.python.org/pypi/ai-data-science-team"><img src="https://img.shields.io/pypi/v/ai-data-science-team.svg?style=for-the-badge" alt="PyPI"></a>
|
60
67
|
<a href="https://github.com/business-science/ai-data-science-team"><img src="https://img.shields.io/pypi/pyversions/ai-data-science-team.svg?style=for-the-badge" alt="versions"></a>
|
61
68
|
<a href="https://github.com/business-science/ai-data-science-team/blob/main/LICENSE"><img src="https://img.shields.io/github/license/business-science/ai-data-science-team.svg?style=for-the-badge" alt="license"></a>
|
69
|
+
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/business-science/ai-data-science-team?style=for-the-badge">
|
70
|
+
|
62
71
|
</div>
|
63
72
|
|
64
73
|
|
@@ -93,8 +102,9 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
93
102
|
- [Apps Available Now](#apps-available-now)
|
94
103
|
- [🔥 Agentic Applications](#-agentic-applications)
|
95
104
|
- [Agents Available Now](#agents-available-now)
|
96
|
-
- [Agents](#agents)
|
105
|
+
- [Standard Agents](#standard-agents)
|
97
106
|
- [🔥🔥 NEW! Machine Learning Agents](#-new-machine-learning-agents)
|
107
|
+
- [🔥 NEW! Data Science Agents](#-new-data-science-agents)
|
98
108
|
- [Multi-Agents](#multi-agents)
|
99
109
|
- [Agents Coming Soon](#agents-coming-soon)
|
100
110
|
- [Disclaimer](#disclaimer)
|
@@ -122,7 +132,7 @@ If you're an aspiring data scientist who wants to learn how to build AI Agents a
|
|
122
132
|
|
123
133
|
This project is a work in progress. New data science agents will be released soon.
|
124
134
|
|
125
|
-

|
126
136
|
|
127
137
|
### NEW: Multi-Agents
|
128
138
|
|
@@ -146,14 +156,14 @@ This is a top secret project I'm working on. It's a multi-agent data science app
|
|
146
156
|
|
147
157
|
### Agents Available Now
|
148
158
|
|
149
|
-
#### Agents
|
159
|
+
#### Standard Agents
|
150
160
|
|
151
161
|
1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_wrangling_agent.ipynb)
|
152
162
|
2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_visualization_agent.ipynb)
|
153
163
|
3. **🔥 Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_cleaning_agent.ipynb)
|
154
164
|
4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/feature_engineering_agent.ipynb)
|
155
165
|
5. **🔥 SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/sql_database_agent.ipynb)
|
156
|
-
6.
|
166
|
+
6. **🔥 Data Loader Tools Agent:** Loads data from various sources including CSV, Excel, Parquet, and Pickle files. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/data_loader_tools_agent.ipynb)
|
157
167
|
|
158
168
|
|
159
169
|
#### 🔥🔥 NEW! Machine Learning Agents
|
@@ -161,6 +171,10 @@ This is a top secret project I'm working on. It's a multi-agent data science app
|
|
161
171
|
1. **🔥 H2O Machine Learning Agent:** Builds and logs 100's of high-performance machine learning models. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/h2o_machine_learning_agent.ipynb)
|
162
172
|
2. **🔥 MLflow Tools Agent (MLOps):** This agent has 11+ tools for managing models, ML projects, and making production ML predictions with MLflow. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/mlflow_tools_agent.ipynb)
|
163
173
|
|
174
|
+
#### 🔥 NEW! Data Science Agents
|
175
|
+
|
176
|
+
1. **🔥🔥 EDA Tools Agent:** Performs automated exploratory data analysis (EDA) with EDA Reporting, Missing Data Analysis, Correlation Analysis, and more. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/ds_agents/eda_tools_agent.ipynb)
|
177
|
+
|
164
178
|
|
165
179
|
#### Multi-Agents
|
166
180
|
|
{ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
|
|
1
1
|
ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
ai_data_science_team/_version.py,sha256=
|
2
|
+
ai_data_science_team/_version.py,sha256=BybGt-zGNDZsdJxDMV3xmjghiRF8jmwG3ov_dt_rM7E,26
|
3
3
|
ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
|
4
4
|
ai_data_science_team/agents/__init__.py,sha256=Gnotza9SKr_0IxuaX8k1nsZK48wXkkeZcGcrR1EqNks,668
|
5
5
|
ai_data_science_team/agents/data_cleaning_agent.py,sha256=V5tJMwGJK0JwrF_H-7r3S0E8UkAY6ci4BGxqjhZiGBI,27352
|
@@ -8,6 +8,9 @@ ai_data_science_team/agents/data_visualization_agent.py,sha256=tJy9Ehnh9mvAu6H--
|
|
8
8
|
ai_data_science_team/agents/data_wrangling_agent.py,sha256=LxzphH-TmrFG0GjejGOjulhPq4SsWFo5Y9tk4WEuN4M,32347
|
9
9
|
ai_data_science_team/agents/feature_engineering_agent.py,sha256=KmPBkj7WUBz6LFUlDDfQHMi7ujXwsH5P9LWRS-F4tdM,31026
|
10
10
|
ai_data_science_team/agents/sql_database_agent.py,sha256=1K2o3NiuKgGKdbMz_Tq9IeQ8xhXjpfGOxx9lArZh1yE,31173
|
11
|
+
ai_data_science_team/ds_agents/__init__.py,sha256=dnuagUTebTDHhGXbCt-hZIilzXMSUwyHaEI7sOxhvoE,95
|
12
|
+
ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=y65lsBXhQNOGwWealEho6uFxGSTW7FNfvTUZnW8_XNY,7609
|
13
|
+
ai_data_science_team/ds_agents/modeling_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
14
|
ai_data_science_team/ml_agents/__init__.py,sha256=qq3UlDCRV_z4FHQ1jj3YR6zPbA6kuCvYCisj_bHYfO4,190
|
12
15
|
ai_data_science_team/ml_agents/h2o_ml_agent.py,sha256=DamR72agrTKfdcdhablmP2mpbj0CqtMonP-QU8p7o9w,33394
|
13
16
|
ai_data_science_team/ml_agents/h2o_ml_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -22,15 +25,18 @@ ai_data_science_team/templates/agent_templates.py,sha256=Lezp0ugtIP3m5WUOmjLwghN
|
|
22
25
|
ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
26
|
ai_data_science_team/tools/data_loader.py,sha256=ITs_6UAJ0m9h68R9_LruiaJSElv9l7SxTQYryI7YZPY,14702
|
24
27
|
ai_data_science_team/tools/dataframe.py,sha256=qSflGDByqqCXv4TjuvOFvGPZmegzeOesb0Y4i4Y0gdQ,4551
|
28
|
+
ai_data_science_team/tools/eda.py,sha256=UGD6PC12RsB_UmStvR4TmSqv0noxjM4DkzY-kHjI0-E,10591
|
25
29
|
ai_data_science_team/tools/h2o.py,sha256=gSK0f2FULfAfipFTTjDMUS6DjHwFFvvl4jxshr6QpS0,38997
|
26
30
|
ai_data_science_team/tools/mlflow.py,sha256=8NTkSOvbTk01GOmwFaMkLBRse80w9Kk7Ypi6Fv4kTII,29475
|
27
31
|
ai_data_science_team/tools/sql.py,sha256=vvz_CiOg6GqXo2_mlF4kq5IS6if79dpaizAgLR9sRyg,4784
|
28
32
|
ai_data_science_team/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
|
+
ai_data_science_team/utils/html.py,sha256=1MBcjNyATi3FPOyVdqf6-_QYCJmDVQWmVPIInUr50dk,628
|
29
34
|
ai_data_science_team/utils/logging.py,sha256=7wFOv6GGhXR_RPbh-8p0GyrS608XOnZtiaGK2IbDl_s,2081
|
35
|
+
ai_data_science_team/utils/matplotlib.py,sha256=d6DZfCXvZ5Kocxtsp92etIymKW2cRBcUG9GmCOMtgJo,1145
|
30
36
|
ai_data_science_team/utils/plotly.py,sha256=nST-NG0oizKVHhH6HsjHUpTUumq9bCccBdxjuaJWnVQ,504
|
31
37
|
ai_data_science_team/utils/regex.py,sha256=lwarbLqTA2VfNQSyqKCl-PBlH_0WH3zXZvYGBYGUiu4,5144
|
32
|
-
ai_data_science_team-0.0.0.
|
33
|
-
ai_data_science_team-0.0.0.
|
34
|
-
ai_data_science_team-0.0.0.
|
35
|
-
ai_data_science_team-0.0.0.
|
36
|
-
ai_data_science_team-0.0.0.
|
38
|
+
ai_data_science_team-0.0.0.9012.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
|
39
|
+
ai_data_science_team-0.0.0.9012.dist-info/METADATA,sha256=geRCFLG3YO9uprp_CGKiqCTSThg06L2U6WxVqYKzyM8,12704
|
40
|
+
ai_data_science_team-0.0.0.9012.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
41
|
+
ai_data_science_team-0.0.0.9012.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
|
42
|
+
ai_data_science_team-0.0.0.9012.dist-info/RECORD,,
|
{ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/LICENSE
RENAMED
File without changes
|
{ai_data_science_team-0.0.0.9011.dist-info → ai_data_science_team-0.0.0.9012.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|