ai-data-science-team 0.0.0.9015__py3-none-any.whl → 0.0.0.9016__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/ds_agents/eda_tools_agent.py +46 -50
- ai_data_science_team/tools/eda.py +121 -60
- {ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/METADATA +62 -56
- {ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/RECORD +8 -8
- {ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/WHEEL +1 -1
- {ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info/licenses}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.0.9015"
|
1
|
+
__version__ = "0.0.0.9016"
|
@@ -1,5 +1,3 @@
|
|
1
|
-
|
2
|
-
|
3
1
|
from typing import Any, Optional, Annotated, Sequence, Dict
|
4
2
|
import operator
|
5
3
|
import pandas as pd
|
@@ -17,10 +15,11 @@ from ai_data_science_team.utils.regex import format_agent_name
|
|
17
15
|
|
18
16
|
from ai_data_science_team.tools.eda import (
|
19
17
|
explain_data,
|
20
|
-
describe_dataset,
|
21
|
-
visualize_missing,
|
22
|
-
|
18
|
+
describe_dataset,
|
19
|
+
visualize_missing,
|
20
|
+
generate_correlation_funnel,
|
23
21
|
generate_sweetviz_report,
|
22
|
+
generate_dtale_report,
|
24
23
|
)
|
25
24
|
from ai_data_science_team.utils.messages import get_tool_call_names
|
26
25
|
|
@@ -32,15 +31,17 @@ EDA_TOOLS = [
|
|
32
31
|
explain_data,
|
33
32
|
describe_dataset,
|
34
33
|
visualize_missing,
|
35
|
-
|
34
|
+
generate_correlation_funnel,
|
36
35
|
generate_sweetviz_report,
|
36
|
+
generate_dtale_report,
|
37
37
|
]
|
38
38
|
|
39
|
+
|
39
40
|
class EDAToolsAgent(BaseAgent):
|
40
41
|
"""
|
41
42
|
An Exploratory Data Analysis Tools Agent that interacts with EDA tools to generate summary statistics,
|
42
43
|
missing data visualizations, correlation funnels, EDA reports, etc.
|
43
|
-
|
44
|
+
|
44
45
|
Parameters:
|
45
46
|
----------
|
46
47
|
model : langchain.llms.base.LLM
|
@@ -52,9 +53,9 @@ class EDAToolsAgent(BaseAgent):
|
|
52
53
|
checkpointer : Checkpointer, optional
|
53
54
|
The checkpointer for the agent.
|
54
55
|
"""
|
55
|
-
|
56
|
+
|
56
57
|
def __init__(
|
57
|
-
self,
|
58
|
+
self,
|
58
59
|
model: Any,
|
59
60
|
create_react_agent_kwargs: Optional[Dict] = {},
|
60
61
|
invoke_react_agent_kwargs: Optional[Dict] = {},
|
@@ -64,18 +65,18 @@ class EDAToolsAgent(BaseAgent):
|
|
64
65
|
"model": model,
|
65
66
|
"create_react_agent_kwargs": create_react_agent_kwargs,
|
66
67
|
"invoke_react_agent_kwargs": invoke_react_agent_kwargs,
|
67
|
-
"checkpointer": checkpointer
|
68
|
+
"checkpointer": checkpointer,
|
68
69
|
}
|
69
70
|
self._compiled_graph = self._make_compiled_graph()
|
70
71
|
self.response = None
|
71
|
-
|
72
|
+
|
72
73
|
def _make_compiled_graph(self):
|
73
74
|
"""
|
74
75
|
Creates the compiled state graph for the EDA agent.
|
75
76
|
"""
|
76
77
|
self.response = None
|
77
78
|
return make_eda_tools_agent(**self._params)
|
78
|
-
|
79
|
+
|
79
80
|
def update_params(self, **kwargs):
|
80
81
|
"""
|
81
82
|
Updates the agent's parameters and rebuilds the compiled graph.
|
@@ -83,16 +84,13 @@ class EDAToolsAgent(BaseAgent):
|
|
83
84
|
for k, v in kwargs.items():
|
84
85
|
self._params[k] = v
|
85
86
|
self._compiled_graph = self._make_compiled_graph()
|
86
|
-
|
87
|
+
|
87
88
|
async def ainvoke_agent(
|
88
|
-
self,
|
89
|
-
user_instructions: str = None,
|
90
|
-
data_raw: pd.DataFrame = None,
|
91
|
-
**kwargs
|
89
|
+
self, user_instructions: str = None, data_raw: pd.DataFrame = None, **kwargs
|
92
90
|
):
|
93
91
|
"""
|
94
92
|
Asynchronously runs the agent with user instructions and data.
|
95
|
-
|
93
|
+
|
96
94
|
Parameters:
|
97
95
|
----------
|
98
96
|
user_instructions : str, optional
|
@@ -105,20 +103,17 @@ class EDAToolsAgent(BaseAgent):
|
|
105
103
|
"user_instructions": user_instructions,
|
106
104
|
"data_raw": data_raw.to_dict() if data_raw is not None else None,
|
107
105
|
},
|
108
|
-
**kwargs
|
106
|
+
**kwargs,
|
109
107
|
)
|
110
108
|
self.response = response
|
111
109
|
return None
|
112
|
-
|
110
|
+
|
113
111
|
def invoke_agent(
|
114
|
-
self,
|
115
|
-
user_instructions: str = None,
|
116
|
-
data_raw: pd.DataFrame = None,
|
117
|
-
**kwargs
|
112
|
+
self, user_instructions: str = None, data_raw: pd.DataFrame = None, **kwargs
|
118
113
|
):
|
119
114
|
"""
|
120
115
|
Synchronously runs the agent with user instructions and data.
|
121
|
-
|
116
|
+
|
122
117
|
Parameters:
|
123
118
|
----------
|
124
119
|
user_instructions : str, optional
|
@@ -131,24 +126,26 @@ class EDAToolsAgent(BaseAgent):
|
|
131
126
|
"user_instructions": user_instructions,
|
132
127
|
"data_raw": data_raw.to_dict() if data_raw is not None else None,
|
133
128
|
},
|
134
|
-
**kwargs
|
129
|
+
**kwargs,
|
135
130
|
)
|
136
131
|
self.response = response
|
137
132
|
return None
|
138
|
-
|
133
|
+
|
139
134
|
def get_internal_messages(self, markdown: bool = False):
|
140
135
|
"""
|
141
136
|
Returns internal messages from the agent response.
|
142
137
|
"""
|
143
138
|
pretty_print = "\n\n".join(
|
144
|
-
[
|
145
|
-
|
139
|
+
[
|
140
|
+
f"### {msg.type.upper()}\n\nID: {msg.id}\n\nContent:\n\n{msg.content}"
|
141
|
+
for msg in self.response["internal_messages"]
|
142
|
+
]
|
146
143
|
)
|
147
144
|
if markdown:
|
148
145
|
return Markdown(pretty_print)
|
149
146
|
else:
|
150
147
|
return self.response["internal_messages"]
|
151
|
-
|
148
|
+
|
152
149
|
def get_artifacts(self, as_dataframe: bool = False):
|
153
150
|
"""
|
154
151
|
Returns the EDA artifacts from the agent response.
|
@@ -157,7 +154,7 @@ class EDAToolsAgent(BaseAgent):
|
|
157
154
|
return pd.DataFrame(self.response["eda_artifacts"])
|
158
155
|
else:
|
159
156
|
return self.response["eda_artifacts"]
|
160
|
-
|
157
|
+
|
161
158
|
def get_ai_message(self, markdown: bool = False):
|
162
159
|
"""
|
163
160
|
Returns the AI message from the agent response.
|
@@ -166,13 +163,14 @@ class EDAToolsAgent(BaseAgent):
|
|
166
163
|
return Markdown(self.response["messages"][0].content)
|
167
164
|
else:
|
168
165
|
return self.response["messages"][0].content
|
169
|
-
|
166
|
+
|
170
167
|
def get_tool_calls(self):
|
171
168
|
"""
|
172
169
|
Returns the tool calls made by the agent.
|
173
170
|
"""
|
174
171
|
return self.response["tool_calls"]
|
175
172
|
|
173
|
+
|
176
174
|
def make_eda_tools_agent(
|
177
175
|
model: Any,
|
178
176
|
create_react_agent_kwargs: Optional[Dict] = {},
|
@@ -181,7 +179,7 @@ def make_eda_tools_agent(
|
|
181
179
|
):
|
182
180
|
"""
|
183
181
|
Creates an Exploratory Data Analyst Agent that can interact with EDA tools.
|
184
|
-
|
182
|
+
|
185
183
|
Parameters:
|
186
184
|
----------
|
187
185
|
model : Any
|
@@ -192,13 +190,13 @@ def make_eda_tools_agent(
|
|
192
190
|
Additional kwargs for agent invocation.
|
193
191
|
checkpointer : Checkpointer, optional
|
194
192
|
The checkpointer for the agent.
|
195
|
-
|
193
|
+
|
196
194
|
Returns:
|
197
195
|
-------
|
198
196
|
app : langgraph.graph.CompiledStateGraph
|
199
197
|
The compiled state graph for the EDA agent.
|
200
198
|
"""
|
201
|
-
|
199
|
+
|
202
200
|
class GraphState(AgentState):
|
203
201
|
internal_messages: Annotated[Sequence[BaseMessage], operator.add]
|
204
202
|
user_instructions: str
|
@@ -209,11 +207,9 @@ def make_eda_tools_agent(
|
|
209
207
|
def exploratory_agent(state):
|
210
208
|
print(format_agent_name(AGENT_NAME))
|
211
209
|
print(" * RUN REACT TOOL-CALLING AGENT FOR EDA")
|
212
|
-
|
213
|
-
tool_node = ToolNode(
|
214
|
-
|
215
|
-
)
|
216
|
-
|
210
|
+
|
211
|
+
tool_node = ToolNode(tools=EDA_TOOLS)
|
212
|
+
|
217
213
|
eda_agent = create_react_agent(
|
218
214
|
model,
|
219
215
|
tools=tool_node,
|
@@ -221,7 +217,7 @@ def make_eda_tools_agent(
|
|
221
217
|
**create_react_agent_kwargs,
|
222
218
|
checkpointer=checkpointer,
|
223
219
|
)
|
224
|
-
|
220
|
+
|
225
221
|
response = eda_agent.invoke(
|
226
222
|
{
|
227
223
|
"messages": [("user", state["user_instructions"])],
|
@@ -229,13 +225,13 @@ def make_eda_tools_agent(
|
|
229
225
|
},
|
230
226
|
invoke_react_agent_kwargs,
|
231
227
|
)
|
232
|
-
|
228
|
+
|
233
229
|
print(" * POST-PROCESSING EDA RESULTS")
|
234
|
-
|
235
|
-
internal_messages = response[
|
230
|
+
|
231
|
+
internal_messages = response["messages"]
|
236
232
|
if not internal_messages:
|
237
233
|
return {"internal_messages": [], "eda_artifacts": None}
|
238
|
-
|
234
|
+
|
239
235
|
last_ai_message = AIMessage(internal_messages[-1].content, role=AGENT_NAME)
|
240
236
|
last_tool_artifact = None
|
241
237
|
if len(internal_messages) > 1:
|
@@ -244,24 +240,24 @@ def make_eda_tools_agent(
|
|
244
240
|
last_tool_artifact = last_message.artifact
|
245
241
|
elif isinstance(last_message, dict) and "artifact" in last_message:
|
246
242
|
last_tool_artifact = last_message["artifact"]
|
247
|
-
|
243
|
+
|
248
244
|
tool_calls = get_tool_call_names(internal_messages)
|
249
|
-
|
245
|
+
|
250
246
|
return {
|
251
247
|
"messages": [last_ai_message],
|
252
248
|
"internal_messages": internal_messages,
|
253
249
|
"eda_artifacts": last_tool_artifact,
|
254
250
|
"tool_calls": tool_calls,
|
255
251
|
}
|
256
|
-
|
252
|
+
|
257
253
|
workflow = StateGraph(GraphState)
|
258
254
|
workflow.add_node("exploratory_agent", exploratory_agent)
|
259
255
|
workflow.add_edge(START, "exploratory_agent")
|
260
256
|
workflow.add_edge("exploratory_agent", END)
|
261
|
-
|
257
|
+
|
262
258
|
app = workflow.compile(
|
263
259
|
checkpointer=checkpointer,
|
264
260
|
name=AGENT_NAME,
|
265
261
|
)
|
266
|
-
|
262
|
+
|
267
263
|
return app
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
from typing import Annotated, Dict, Tuple, Union
|
3
2
|
|
4
3
|
import os
|
@@ -6,12 +5,12 @@ import tempfile
|
|
6
5
|
|
7
6
|
from langchain.tools import tool
|
8
7
|
|
9
|
-
from langgraph.prebuilt import InjectedState
|
8
|
+
from langgraph.prebuilt import InjectedState
|
10
9
|
|
11
10
|
from ai_data_science_team.tools.dataframe import get_dataframe_summary
|
12
11
|
|
13
12
|
|
14
|
-
@tool(response_format=
|
13
|
+
@tool(response_format="content")
|
15
14
|
def explain_data(
|
16
15
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
17
16
|
n_sample: int = 30,
|
@@ -36,14 +35,17 @@ def explain_data(
|
|
36
35
|
"""
|
37
36
|
print(" * Tool: explain_data")
|
38
37
|
import pandas as pd
|
39
|
-
|
40
|
-
result = get_dataframe_summary(
|
41
|
-
|
38
|
+
|
39
|
+
result = get_dataframe_summary(
|
40
|
+
pd.DataFrame(data_raw), n_sample=n_sample, skip_stats=skip_stats
|
41
|
+
)
|
42
|
+
|
42
43
|
return result
|
43
44
|
|
44
|
-
|
45
|
+
|
46
|
+
@tool(response_format="content_and_artifact")
|
45
47
|
def describe_dataset(
|
46
|
-
data_raw: Annotated[dict, InjectedState("data_raw")]
|
48
|
+
data_raw: Annotated[dict, InjectedState("data_raw")],
|
47
49
|
) -> Tuple[str, Dict]:
|
48
50
|
"""
|
49
51
|
Tool: describe_dataset
|
@@ -71,30 +73,30 @@ def describe_dataset(
|
|
71
73
|
"""
|
72
74
|
print(" * Tool: describe_dataset")
|
73
75
|
import pandas as pd
|
76
|
+
|
74
77
|
df = pd.DataFrame(data_raw)
|
75
|
-
description_df = df.describe(include=
|
78
|
+
description_df = df.describe(include="all")
|
76
79
|
content = "Summary statistics computed using pandas describe()."
|
77
|
-
artifact = {
|
80
|
+
artifact = {"describe_df": description_df.to_dict()}
|
78
81
|
return content, artifact
|
79
82
|
|
80
83
|
|
81
|
-
@tool(response_format=
|
84
|
+
@tool(response_format="content_and_artifact")
|
82
85
|
def visualize_missing(
|
83
|
-
data_raw: Annotated[dict, InjectedState("data_raw")],
|
84
|
-
n_sample: int = None
|
86
|
+
data_raw: Annotated[dict, InjectedState("data_raw")], n_sample: int = None
|
85
87
|
) -> Tuple[str, Dict]:
|
86
88
|
"""
|
87
89
|
Tool: visualize_missing
|
88
90
|
Description:
|
89
91
|
Missing value analysis using the missingno library. Generates a matrix plot, bar plot, and heatmap plot.
|
90
|
-
|
92
|
+
|
91
93
|
Parameters:
|
92
94
|
-----------
|
93
95
|
data_raw : dict
|
94
96
|
The raw data in dictionary format.
|
95
97
|
n_sample : int, optional (default: None)
|
96
98
|
The number of rows to sample from the dataset if it is large.
|
97
|
-
|
99
|
+
|
98
100
|
Returns:
|
99
101
|
-------
|
100
102
|
Tuple[str, Dict]:
|
@@ -103,12 +105,14 @@ def visualize_missing(
|
|
103
105
|
corresponding base64 encoded PNG image.
|
104
106
|
"""
|
105
107
|
print(" * Tool: visualize_missing")
|
106
|
-
|
108
|
+
|
107
109
|
try:
|
108
110
|
import missingno as msno # Ensure missingno is installed
|
109
111
|
except ImportError:
|
110
|
-
raise ImportError(
|
111
|
-
|
112
|
+
raise ImportError(
|
113
|
+
"Please install the 'missingno' package to use this tool. pip install missingno"
|
114
|
+
)
|
115
|
+
|
112
116
|
import pandas as pd
|
113
117
|
import base64
|
114
118
|
from io import BytesIO
|
@@ -136,21 +140,22 @@ def visualize_missing(
|
|
136
140
|
|
137
141
|
# Create and encode the matrix plot.
|
138
142
|
encoded_plots["matrix_plot"] = create_and_encode_plot(msno.matrix, "matrix")
|
139
|
-
|
143
|
+
|
140
144
|
# Create and encode the bar plot.
|
141
145
|
encoded_plots["bar_plot"] = create_and_encode_plot(msno.bar, "bar")
|
142
|
-
|
146
|
+
|
143
147
|
# Create and encode the heatmap plot.
|
144
148
|
encoded_plots["heatmap_plot"] = create_and_encode_plot(msno.heatmap, "heatmap")
|
145
149
|
|
146
|
-
content =
|
150
|
+
content = (
|
151
|
+
"Missing data visualizations (matrix, bar, and heatmap) have been generated."
|
152
|
+
)
|
147
153
|
artifact = encoded_plots
|
148
154
|
return content, artifact
|
149
155
|
|
150
156
|
|
151
|
-
|
152
|
-
|
153
|
-
def correlation_funnel(
|
157
|
+
@tool(response_format="content_and_artifact")
|
158
|
+
def generate_correlation_funnel(
|
154
159
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
155
160
|
target: str,
|
156
161
|
target_bin_index: Union[int, str] = -1,
|
@@ -160,10 +165,10 @@ def correlation_funnel(
|
|
160
165
|
name_infreq: str = "-OTHER",
|
161
166
|
) -> Tuple[str, Dict]:
|
162
167
|
"""
|
163
|
-
Tool:
|
168
|
+
Tool: generate_correlation_funnel
|
164
169
|
Description:
|
165
170
|
Correlation analysis using the correlation funnel method. The tool binarizes the data and computes correlation versus a target column.
|
166
|
-
|
171
|
+
|
167
172
|
Parameters:
|
168
173
|
----------
|
169
174
|
target : str
|
@@ -171,8 +176,8 @@ def correlation_funnel(
|
|
171
176
|
with this string followed by '__' (e.g., 'Member_Status__Gold', 'Member_Status__Platinum').
|
172
177
|
target_bin_index : int or str, default -1
|
173
178
|
If an integer, selects the target level by position from the matching columns.
|
174
|
-
If a string (e.g., "Yes"), attempts to match to the suffix of a column name
|
175
|
-
(i.e., 'target__Yes').
|
179
|
+
If a string (e.g., "Yes"), attempts to match to the suffix of a column name
|
180
|
+
(i.e., 'target__Yes').
|
176
181
|
corr_method : str
|
177
182
|
The correlation method ('pearson', 'kendall', or 'spearman'). Default is 'pearson'.
|
178
183
|
n_bins : int
|
@@ -182,34 +187,36 @@ def correlation_funnel(
|
|
182
187
|
name_infreq : str
|
183
188
|
The name to use for infrequent levels. Default is '-OTHER'.
|
184
189
|
"""
|
185
|
-
print(" * Tool:
|
190
|
+
print(" * Tool: generate_correlation_funnel")
|
186
191
|
try:
|
187
192
|
import pytimetk as tk
|
188
193
|
except ImportError:
|
189
|
-
raise ImportError(
|
194
|
+
raise ImportError(
|
195
|
+
"Please install the 'pytimetk' package to use this tool. pip install pytimetk"
|
196
|
+
)
|
190
197
|
import pandas as pd
|
191
198
|
import base64
|
192
199
|
from io import BytesIO
|
193
200
|
import matplotlib.pyplot as plt
|
194
201
|
import json
|
195
|
-
import plotly.graph_objects as go
|
196
202
|
import plotly.io as pio
|
197
|
-
from typing import Union
|
198
203
|
|
199
204
|
# Convert the raw injected state into a DataFrame.
|
200
205
|
df = pd.DataFrame(data_raw)
|
201
|
-
|
206
|
+
|
202
207
|
# Apply the binarization method.
|
203
208
|
df_binarized = df.binarize(
|
204
|
-
n_bins=n_bins,
|
205
|
-
thresh_infreq=thresh_infreq,
|
206
|
-
name_infreq=name_infreq,
|
207
|
-
one_hot=True
|
209
|
+
n_bins=n_bins,
|
210
|
+
thresh_infreq=thresh_infreq,
|
211
|
+
name_infreq=name_infreq,
|
212
|
+
one_hot=True,
|
208
213
|
)
|
209
|
-
|
214
|
+
|
210
215
|
# Determine the full target column name.
|
211
216
|
# Look for all columns that start with "target__"
|
212
|
-
matching_columns = [
|
217
|
+
matching_columns = [
|
218
|
+
col for col in df_binarized.columns if col.startswith(f"{target}__")
|
219
|
+
]
|
213
220
|
if not matching_columns:
|
214
221
|
# If no matching columns are found, warn and use the provided target as-is.
|
215
222
|
full_target = target
|
@@ -230,15 +237,15 @@ def correlation_funnel(
|
|
230
237
|
except IndexError:
|
231
238
|
# If index is out of bounds, use the last matching column.
|
232
239
|
full_target = matching_columns[-1]
|
233
|
-
|
240
|
+
|
234
241
|
# Compute correlation funnel using the full target column name.
|
235
242
|
df_correlated = df_binarized.correlate(target=full_target, method=corr_method)
|
236
|
-
|
243
|
+
|
237
244
|
# Attempt to generate a static plot.
|
238
245
|
encoded = None
|
239
246
|
try:
|
240
247
|
# Here we assume that your DataFrame has a method plot_correlation_funnel.
|
241
|
-
fig = df_correlated.plot_correlation_funnel(engine=
|
248
|
+
fig = df_correlated.plot_correlation_funnel(engine="plotnine", height=600)
|
242
249
|
buf = BytesIO()
|
243
250
|
# Use the appropriate save method for your figure object.
|
244
251
|
fig.save(buf, format="png")
|
@@ -247,18 +254,21 @@ def correlation_funnel(
|
|
247
254
|
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
|
248
255
|
except Exception as e:
|
249
256
|
encoded = {"error": str(e)}
|
250
|
-
|
257
|
+
|
251
258
|
# Attempt to generate a Plotly plot.
|
252
259
|
fig_dict = None
|
253
260
|
try:
|
254
|
-
fig = df_correlated.plot_correlation_funnel(engine=
|
261
|
+
fig = df_correlated.plot_correlation_funnel(engine="plotly", base_size=14)
|
262
|
+
|
255
263
|
fig_json = pio.to_json(fig)
|
256
264
|
fig_dict = json.loads(fig_json)
|
257
265
|
except Exception as e:
|
258
266
|
fig_dict = {"error": str(e)}
|
259
267
|
|
260
|
-
content = (
|
261
|
-
|
268
|
+
content = (
|
269
|
+
f"Correlation funnel computed using method '{corr_method}' for target level '{full_target}'. "
|
270
|
+
f"Base target was '{target}' with target_bin_index '{target_bin_index}'."
|
271
|
+
)
|
262
272
|
artifact = {
|
263
273
|
"correlation_data": df_correlated.to_dict(orient="list"),
|
264
274
|
"plot_image": encoded,
|
@@ -267,8 +277,7 @@ def correlation_funnel(
|
|
267
277
|
return content, artifact
|
268
278
|
|
269
279
|
|
270
|
-
|
271
|
-
@tool(response_format='content_and_artifact')
|
280
|
+
@tool(response_format="content_and_artifact")
|
272
281
|
def generate_sweetviz_report(
|
273
282
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
274
283
|
target: str = None,
|
@@ -280,7 +289,7 @@ def generate_sweetviz_report(
|
|
280
289
|
Tool: generate_sweetviz_report
|
281
290
|
Description:
|
282
291
|
Make an Exploratory Data Analysis (EDA) report using the Sweetviz library.
|
283
|
-
|
292
|
+
|
284
293
|
Parameters:
|
285
294
|
-----------
|
286
295
|
data_raw : dict
|
@@ -290,11 +299,11 @@ def generate_sweetviz_report(
|
|
290
299
|
report_name : str, optional
|
291
300
|
The file name to save the Sweetviz HTML report. Default is "sweetviz_report.html".
|
292
301
|
report_directory : str, optional
|
293
|
-
The directory where the report should be saved.
|
302
|
+
The directory where the report should be saved.
|
294
303
|
If None, a temporary directory is created and used.
|
295
304
|
open_browser : bool, optional
|
296
305
|
Whether to open the report in a web browser. Default is False.
|
297
|
-
|
306
|
+
|
298
307
|
Returns:
|
299
308
|
--------
|
300
309
|
Tuple[str, Dict]:
|
@@ -307,13 +316,15 @@ def generate_sweetviz_report(
|
|
307
316
|
try:
|
308
317
|
import sweetviz as sv
|
309
318
|
except ImportError:
|
310
|
-
raise ImportError(
|
311
|
-
|
319
|
+
raise ImportError(
|
320
|
+
"Please install the 'sweetviz' package to use this tool. Run: pip install sweetviz"
|
321
|
+
)
|
322
|
+
|
312
323
|
import pandas as pd
|
313
|
-
|
324
|
+
|
314
325
|
# Convert injected raw data to a DataFrame.
|
315
326
|
df = pd.DataFrame(data_raw)
|
316
|
-
|
327
|
+
|
317
328
|
# If no directory is specified, use a temporary directory.
|
318
329
|
if not report_directory:
|
319
330
|
report_directory = tempfile.mkdtemp()
|
@@ -322,26 +333,26 @@ def generate_sweetviz_report(
|
|
322
333
|
# Ensure user-specified directory exists.
|
323
334
|
if not os.path.exists(report_directory):
|
324
335
|
os.makedirs(report_directory)
|
325
|
-
|
336
|
+
|
326
337
|
# Create the Sweetviz report.
|
327
338
|
report = sv.analyze(df, target_feat=target)
|
328
|
-
|
339
|
+
|
329
340
|
# Determine the full path for the report.
|
330
341
|
full_report_path = os.path.join(report_directory, report_name)
|
331
|
-
|
342
|
+
|
332
343
|
# Save the report to the specified HTML file.
|
333
344
|
report.show_html(
|
334
345
|
filepath=full_report_path,
|
335
346
|
open_browser=open_browser,
|
336
347
|
)
|
337
|
-
|
348
|
+
|
338
349
|
# Optionally, read the HTML content (if desired to pass along in the artifact).
|
339
350
|
try:
|
340
351
|
with open(full_report_path, "r", encoding="utf-8") as f:
|
341
352
|
html_content = f.read()
|
342
353
|
except Exception:
|
343
354
|
html_content = None
|
344
|
-
|
355
|
+
|
345
356
|
content = (
|
346
357
|
f"Sweetviz EDA report generated and saved as '{os.path.abspath(full_report_path)}'. "
|
347
358
|
f"{'This was saved in a temporary directory.' if 'tmp' in report_directory else ''}"
|
@@ -352,3 +363,53 @@ def generate_sweetviz_report(
|
|
352
363
|
}
|
353
364
|
return content, artifact
|
354
365
|
|
366
|
+
|
367
|
+
@tool(response_format="content_and_artifact")
|
368
|
+
def generate_dtale_report(
|
369
|
+
data_raw: Annotated[dict, InjectedState("data_raw")],
|
370
|
+
host: str = "localhost",
|
371
|
+
port: int = 40000,
|
372
|
+
open_browser: bool = False,
|
373
|
+
) -> Tuple[str, Dict]:
|
374
|
+
"""
|
375
|
+
Tool: generate_dtale_report
|
376
|
+
Description:
|
377
|
+
Creates an interactive data exploration report using the dtale library.
|
378
|
+
|
379
|
+
Parameters:
|
380
|
+
-----------
|
381
|
+
data_raw : dict
|
382
|
+
The raw data in dictionary format.
|
383
|
+
host : str, optional
|
384
|
+
The host IP address to serve the dtale app. Default is "localhost".
|
385
|
+
port : int, optional
|
386
|
+
The port number to serve the dtale app. Default is 40000.
|
387
|
+
open_browser : bool, optional
|
388
|
+
Whether to open the report in a web browser. Default is False.
|
389
|
+
|
390
|
+
Returns:
|
391
|
+
--------
|
392
|
+
Tuple[str, Dict]:
|
393
|
+
content: A summary message describing the dtale report.
|
394
|
+
artifact: A dictionary containing the URL of the dtale report.
|
395
|
+
"""
|
396
|
+
print(" * Tool: generate_dtale_report")
|
397
|
+
|
398
|
+
try:
|
399
|
+
import dtale
|
400
|
+
except ImportError:
|
401
|
+
raise ImportError(
|
402
|
+
"Please install the 'dtale' package to use this tool. Run: pip install dtale"
|
403
|
+
)
|
404
|
+
|
405
|
+
import pandas as pd
|
406
|
+
|
407
|
+
df = pd.DataFrame(data_raw)
|
408
|
+
|
409
|
+
# Create the dtale report
|
410
|
+
d = dtale.show(df, host=host, port=port, open_browser=open_browser)
|
411
|
+
|
412
|
+
content = f"Dtale report generated and available at: {d.main_url()}"
|
413
|
+
artifact = {"dtale_url": d.main_url()}
|
414
|
+
|
415
|
+
return content, artifact
|
{ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: ai-data-science-team
|
3
|
-
Version: 0.0.0.9015
|
3
|
+
Version: 0.0.0.9016
|
4
4
|
Summary: Build and run an AI-powered data science team.
|
5
5
|
Home-page: https://github.com/business-science/ai-data-science-team
|
6
6
|
Author: Matt Dancho
|
@@ -47,6 +47,7 @@ Dynamic: classifier
|
|
47
47
|
Dynamic: description
|
48
48
|
Dynamic: description-content-type
|
49
49
|
Dynamic: home-page
|
50
|
+
Dynamic: license-file
|
50
51
|
Dynamic: provides-extra
|
51
52
|
Dynamic: requires-dist
|
52
53
|
Dynamic: requires-python
|
@@ -97,9 +98,8 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
97
98
|
- [Companies That Want A Custom AI Data Science Team (And AI Apps)](#companies-that-want-a-custom-ai-data-science-team-and-ai-apps)
|
98
99
|
- [Generative AI for Data Scientists Workshop](#generative-ai-for-data-scientists-workshop)
|
99
100
|
- [Data Science Agents](#data-science-agents)
|
101
|
+
- [🔥 NEW: Data Science Apps](#-new-data-science-apps)
|
100
102
|
- [NEW: Multi-Agents](#new-multi-agents)
|
101
|
-
- [Data Science Apps](#data-science-apps)
|
102
|
-
- [Apps Available Now](#apps-available-now)
|
103
103
|
- [🔥 Agentic Applications](#-agentic-applications)
|
104
104
|
- [Agents Available Now](#agents-available-now)
|
105
105
|
- [Standard Agents](#standard-agents)
|
@@ -110,11 +110,11 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
110
110
|
- [Disclaimer](#disclaimer)
|
111
111
|
- [Installation](#installation)
|
112
112
|
- [Usage](#usage)
|
113
|
-
- [Example
|
114
|
-
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
113
|
+
- [Example: H2O Machine Learning Agent](#example-h2o-machine-learning-agent)
|
115
114
|
- [Contributing](#contributing)
|
116
115
|
- [License](#license)
|
117
116
|
- [Want To Become A Full-Stack Generative AI Data Scientist?](#want-to-become-a-full-stack-generative-ai-data-scientist)
|
117
|
+
- [⭐️ Star History](#️-star-history)
|
118
118
|
|
119
119
|
## Companies That Want A Custom AI Data Science Team (And AI Apps)
|
120
120
|
|
@@ -134,21 +134,24 @@ This project is a work in progress. New data science agents will be released soo
|
|
134
134
|
|
135
135
|

|
136
136
|
|
137
|
-
### NEW:
|
137
|
+
### 🔥 NEW: Data Science Apps
|
138
138
|
|
139
|
-
**🔥 Pandas Data Analyst
|
139
|
+
**🔥 Open Pandas AI Data Analyst:** Load an Excel or CSV file and ask it questions. Get data and charts back.
|
140
|
+
|
141
|
+

|
142
|
+
|
143
|
+
**🔥 SQL Database Agent:** Connects any SQL Database, generates SQL queries from natural language, and returns data as a downloadable table.
|
140
144
|
|
141
|
-
|
145
|
+
**🔥 Exploratory Data Copilot:** An AI-powered data science app that performs automated exploratory data analysis (EDA) with EDA Reporting, Missing Data Analysis, Correlation Analysis, and more.
|
142
146
|
|
143
|
-
|
147
|
+
[See all available apps here](/apps)
|
144
148
|
|
145
|
-
|
149
|
+
### NEW: Multi-Agents
|
146
150
|
|
147
|
-
|
151
|
+
**🔥 Pandas Data Analyst Agent:** Combines the ability to wrangle, transform, and analyze data with an optional data visualization agent that can create interactive plots.
|
148
152
|
|
149
|
-
|
153
|
+

|
150
154
|
|
151
|
-
[See all available apps here](/apps)
|
152
155
|
|
153
156
|
#### 🔥 Agentic Applications
|
154
157
|
|
@@ -205,6 +208,14 @@ By using this software, you agree to use it solely for learning purposes.
|
|
205
208
|
|
206
209
|
## Installation
|
207
210
|
|
211
|
+
You can install via PyPI (note that this is a beta version and breaking changes may occur until 0.1.0):
|
212
|
+
|
213
|
+
``` bash
|
214
|
+
pip install ai-data-science-team
|
215
|
+
```
|
216
|
+
|
217
|
+
Or, if you want the latest version from GitHub:
|
218
|
+
|
208
219
|
``` bash
|
209
220
|
pip install git+https://github.com/business-science/ai-data-science-team.git --upgrade
|
210
221
|
```
|
@@ -213,55 +224,46 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
|
|
213
224
|
|
214
225
|
[See all examples here.](/examples)
|
215
226
|
|
216
|
-
### Example
|
227
|
+
### Example: H2O Machine Learning Agent
|
217
228
|
|
218
|
-
[See the full example here.](/examples/
|
229
|
+
[See the full example here.](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/h2o_machine_learning_agent.ipynb)
|
219
230
|
|
220
231
|
``` python
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
232
|
+
# Import libraries
|
233
|
+
from langchain_openai import ChatOpenAI
|
234
|
+
import pandas as pd
|
235
|
+
import h2o
|
236
|
+
import os
|
237
|
+
from ai_data_science_team.ml_agents import H2OMLAgent
|
238
|
+
|
239
|
+
# Load the data
|
240
|
+
df = pd.read_csv("data/churn_data.csv")
|
241
|
+
df
|
242
|
+
|
243
|
+
# Initialize the language model
|
244
|
+
os.environ['OPENAI_API_KEY'] = "YOUR_OPENAI_API_KEY"
|
245
|
+
llm = ChatOpenAI(model=MODEL)
|
246
|
+
llm
|
247
|
+
|
248
|
+
# Initialize the H2O ML Agent
|
249
|
+
ml_agent = H2OMLAgent(
|
250
|
+
model=llm,
|
251
|
+
log=True,
|
252
|
+
log_path="logs/",
|
253
|
+
model_directory="h2o_models/",
|
254
|
+
enable_mlflow=True, # Use this if you wish to log models to MLflow
|
228
255
|
)
|
229
|
-
|
230
|
-
|
231
|
-
``` bash
|
232
|
-
---FEATURE ENGINEERING AGENT----
|
233
|
-
* CREATE FEATURE ENGINEER CODE
|
234
|
-
* EXECUTING AGENT CODE
|
235
|
-
* EXPLAIN AGENT CODE
|
236
|
-
```
|
237
|
-
|
238
|
-
``` python
|
239
|
-
feature_engineering_agent.get_data_engineered()
|
240
|
-
```
|
241
|
-
|
242
|
-
### Example 2: Cleaning Data with the Data Cleaning Agent
|
243
|
-
|
244
|
-
[See the full example here.](/examples/data_cleaning_agent.ipynb)
|
245
|
-
|
246
|
-
``` python
|
247
|
-
data_cleaning_agent = DataCleaningAgent(model = llm)
|
256
|
+
ml_agent
|
248
257
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
258
|
+
# Run the agent
|
259
|
+
ml_agent.invoke_agent(
|
260
|
+
data_raw=df.drop(columns=["customerID"]),
|
261
|
+
user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
|
262
|
+
target_variable="Churn"
|
253
263
|
)
|
254
|
-
```
|
255
|
-
|
256
|
-
``` bash
|
257
|
-
---DATA CLEANING AGENT----
|
258
|
-
* CREATE DATA CLEANER CODE
|
259
|
-
* EXECUTING AGENT CODE
|
260
|
-
* EXPLAIN AGENT CODE
|
261
|
-
```
|
262
264
|
|
263
|
-
|
264
|
-
|
265
|
+
# Retrieve and display the leaderboard of models
|
266
|
+
ml_agent.get_leaderboard()
|
265
267
|
```
|
266
268
|
|
267
269
|
## Contributing
|
@@ -282,4 +284,8 @@ This project is licensed under the MIT License. See LICENSE file for details.
|
|
282
284
|
|
283
285
|
I teach Generative AI Data Science to help you build AI-powered data science apps. [**Register for my next Generative AI for Data Scientists workshop here.**](https://learn.business-science.io/ai-register)
|
284
286
|
|
287
|
+
# ⭐️ Star History
|
285
288
|
|
289
|
+
[](https://star-history.com/#)
|
290
|
+
|
291
|
+
[**Please ⭐ us on GitHub (it takes 2 seconds and means a lot).**](https://github.com/business-science/ai-data-science-team)
|
{ai_data_science_team-0.0.0.9015.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
|
|
1
1
|
ai_data_science_team/__init__.py,sha256=LmogkhGnxvvVe1ukJM6I6lXy4B7SuCr5eXZpwjyDMKQ,444
|
2
|
-
ai_data_science_team/_version.py,sha256=
|
2
|
+
ai_data_science_team/_version.py,sha256=CuRBSRSns8bxBgkn7Hp4BqQhLmZGuLWdyc2Xq7zO6ww,27
|
3
3
|
ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
|
4
4
|
ai_data_science_team/agents/__init__.py,sha256=Gnotza9SKr_0IxuaX8k1nsZK48wXkkeZcGcrR1EqNks,668
|
5
5
|
ai_data_science_team/agents/data_cleaning_agent.py,sha256=aZLhnN2EBlY_hmAg_r73dwi1w5utSFNEgEs8aWl8Cho,27991
|
@@ -9,7 +9,7 @@ ai_data_science_team/agents/data_wrangling_agent.py,sha256=jyBrEfLsgIqSF6xcmRgnk
|
|
9
9
|
ai_data_science_team/agents/feature_engineering_agent.py,sha256=xZGDFnmM6wx4bi3e4c_dNOZzGcxBmX8k0iveL7dlA-k,31608
|
10
10
|
ai_data_science_team/agents/sql_database_agent.py,sha256=fln8unefn5Jd2exeyGs-9PljyLXAK60HI81tJACYeCY,31726
|
11
11
|
ai_data_science_team/ds_agents/__init__.py,sha256=dnuagUTebTDHhGXbCt-hZIilzXMSUwyHaEI7sOxhvoE,95
|
12
|
-
ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=
|
12
|
+
ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=RiwpAp2dIZyN1kRNk7WBUI5KsiP14dLuHm8fhOCsKCk,8228
|
13
13
|
ai_data_science_team/ds_agents/modeling_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
14
|
ai_data_science_team/ml_agents/__init__.py,sha256=qq3UlDCRV_z4FHQ1jj3YR6zPbA6kuCvYCisj_bHYfO4,190
|
15
15
|
ai_data_science_team/ml_agents/h2o_ml_agent.py,sha256=S0uayngaVwVUyA4zy05QYlq5NXrNHb723NeF2rns0Y0,33934
|
@@ -26,7 +26,7 @@ ai_data_science_team/templates/agent_templates.py,sha256=QHRNZVmIfeClEef2Fr2Wb9J
|
|
26
26
|
ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
27
|
ai_data_science_team/tools/data_loader.py,sha256=ITs_6UAJ0m9h68R9_LruiaJSElv9l7SxTQYryI7YZPY,14702
|
28
28
|
ai_data_science_team/tools/dataframe.py,sha256=cckplDWu9SsA_PRo89pYsyVCmBE0PoDIwMv6tuLunT4,4572
|
29
|
-
ai_data_science_team/tools/eda.py,sha256=
|
29
|
+
ai_data_science_team/tools/eda.py,sha256=ycE_VAgeDoJyZpt6jjprID-D3ocseYTdzlry-qiSc5w,14201
|
30
30
|
ai_data_science_team/tools/h2o.py,sha256=gSK0f2FULfAfipFTTjDMUS6DjHwFFvvl4jxshr6QpS0,38997
|
31
31
|
ai_data_science_team/tools/mlflow.py,sha256=8NTkSOvbTk01GOmwFaMkLBRse80w9Kk7Ypi6Fv4kTII,29475
|
32
32
|
ai_data_science_team/tools/sql.py,sha256=vvz_CiOg6GqXo2_mlF4kq5IS6if79dpaizAgLR9sRyg,4784
|
@@ -37,8 +37,8 @@ ai_data_science_team/utils/matplotlib.py,sha256=d6DZfCXvZ5Kocxtsp92etIymKW2cRBcU
|
|
37
37
|
ai_data_science_team/utils/messages.py,sha256=feWIPGsv8ly9jpNnS97SoPsn1feaY1Km0VCbHTbRpI8,549
|
38
38
|
ai_data_science_team/utils/plotly.py,sha256=nST-NG0oizKVHhH6HsjHUpTUumq9bCccBdxjuaJWnVQ,504
|
39
39
|
ai_data_science_team/utils/regex.py,sha256=lwarbLqTA2VfNQSyqKCl-PBlH_0WH3zXZvYGBYGUiu4,5144
|
40
|
-
ai_data_science_team-0.0.0.
|
41
|
-
ai_data_science_team-0.0.0.
|
42
|
-
ai_data_science_team-0.0.0.
|
43
|
-
ai_data_science_team-0.0.0.
|
44
|
-
ai_data_science_team-0.0.0.
|
40
|
+
ai_data_science_team-0.0.0.9016.dist-info/licenses/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
|
41
|
+
ai_data_science_team-0.0.0.9016.dist-info/METADATA,sha256=Fxmv56STouZdBJurMyf98VgpATeLYajJlmIDtgsbPXg,13746
|
42
|
+
ai_data_science_team-0.0.0.9016.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
43
|
+
ai_data_science_team-0.0.0.9016.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
|
44
|
+
ai_data_science_team-0.0.0.9016.dist-info/RECORD,,
|
File without changes
|
File without changes
|