ai-data-science-team 0.0.0.9014__py3-none-any.whl → 0.0.0.9016__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/data_visualization_agent.py +172 -129
- ai_data_science_team/agents/data_wrangling_agent.py +1 -0
- ai_data_science_team/ds_agents/eda_tools_agent.py +46 -50
- ai_data_science_team/multiagents/pandas_data_analyst.py +5 -5
- ai_data_science_team/multiagents/sql_data_analyst.py +7 -18
- ai_data_science_team/tools/eda.py +123 -60
- {ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/METADATA +64 -57
- {ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/RECORD +12 -12
- {ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/WHEEL +1 -1
- {ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info/licenses}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
from typing import Annotated, Dict, Tuple, Union
|
3
2
|
|
4
3
|
import os
|
@@ -6,12 +5,12 @@ import tempfile
|
|
6
5
|
|
7
6
|
from langchain.tools import tool
|
8
7
|
|
9
|
-
from langgraph.prebuilt import InjectedState
|
8
|
+
from langgraph.prebuilt import InjectedState
|
10
9
|
|
11
10
|
from ai_data_science_team.tools.dataframe import get_dataframe_summary
|
12
11
|
|
13
12
|
|
14
|
-
@tool(response_format=
|
13
|
+
@tool(response_format="content")
|
15
14
|
def explain_data(
|
16
15
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
17
16
|
n_sample: int = 30,
|
@@ -36,14 +35,17 @@ def explain_data(
|
|
36
35
|
"""
|
37
36
|
print(" * Tool: explain_data")
|
38
37
|
import pandas as pd
|
39
|
-
|
40
|
-
result = get_dataframe_summary(
|
41
|
-
|
38
|
+
|
39
|
+
result = get_dataframe_summary(
|
40
|
+
pd.DataFrame(data_raw), n_sample=n_sample, skip_stats=skip_stats
|
41
|
+
)
|
42
|
+
|
42
43
|
return result
|
43
44
|
|
44
|
-
|
45
|
+
|
46
|
+
@tool(response_format="content_and_artifact")
|
45
47
|
def describe_dataset(
|
46
|
-
data_raw: Annotated[dict, InjectedState("data_raw")]
|
48
|
+
data_raw: Annotated[dict, InjectedState("data_raw")],
|
47
49
|
) -> Tuple[str, Dict]:
|
48
50
|
"""
|
49
51
|
Tool: describe_dataset
|
@@ -71,30 +73,30 @@ def describe_dataset(
|
|
71
73
|
"""
|
72
74
|
print(" * Tool: describe_dataset")
|
73
75
|
import pandas as pd
|
76
|
+
|
74
77
|
df = pd.DataFrame(data_raw)
|
75
|
-
description_df = df.describe(include=
|
78
|
+
description_df = df.describe(include="all")
|
76
79
|
content = "Summary statistics computed using pandas describe()."
|
77
|
-
artifact = {
|
80
|
+
artifact = {"describe_df": description_df.to_dict()}
|
78
81
|
return content, artifact
|
79
82
|
|
80
83
|
|
81
|
-
@tool(response_format=
|
84
|
+
@tool(response_format="content_and_artifact")
|
82
85
|
def visualize_missing(
|
83
|
-
data_raw: Annotated[dict, InjectedState("data_raw")],
|
84
|
-
n_sample: int = None
|
86
|
+
data_raw: Annotated[dict, InjectedState("data_raw")], n_sample: int = None
|
85
87
|
) -> Tuple[str, Dict]:
|
86
88
|
"""
|
87
89
|
Tool: visualize_missing
|
88
90
|
Description:
|
89
91
|
Missing value analysis using the missingno library. Generates a matrix plot, bar plot, and heatmap plot.
|
90
|
-
|
92
|
+
|
91
93
|
Parameters:
|
92
94
|
-----------
|
93
95
|
data_raw : dict
|
94
96
|
The raw data in dictionary format.
|
95
97
|
n_sample : int, optional (default: None)
|
96
98
|
The number of rows to sample from the dataset if it is large.
|
97
|
-
|
99
|
+
|
98
100
|
Returns:
|
99
101
|
-------
|
100
102
|
Tuple[str, Dict]:
|
@@ -103,12 +105,14 @@ def visualize_missing(
|
|
103
105
|
corresponding base64 encoded PNG image.
|
104
106
|
"""
|
105
107
|
print(" * Tool: visualize_missing")
|
106
|
-
|
108
|
+
|
107
109
|
try:
|
108
110
|
import missingno as msno # Ensure missingno is installed
|
109
111
|
except ImportError:
|
110
|
-
raise ImportError(
|
111
|
-
|
112
|
+
raise ImportError(
|
113
|
+
"Please install the 'missingno' package to use this tool. pip install missingno"
|
114
|
+
)
|
115
|
+
|
112
116
|
import pandas as pd
|
113
117
|
import base64
|
114
118
|
from io import BytesIO
|
@@ -136,21 +140,22 @@ def visualize_missing(
|
|
136
140
|
|
137
141
|
# Create and encode the matrix plot.
|
138
142
|
encoded_plots["matrix_plot"] = create_and_encode_plot(msno.matrix, "matrix")
|
139
|
-
|
143
|
+
|
140
144
|
# Create and encode the bar plot.
|
141
145
|
encoded_plots["bar_plot"] = create_and_encode_plot(msno.bar, "bar")
|
142
|
-
|
146
|
+
|
143
147
|
# Create and encode the heatmap plot.
|
144
148
|
encoded_plots["heatmap_plot"] = create_and_encode_plot(msno.heatmap, "heatmap")
|
145
149
|
|
146
|
-
content =
|
150
|
+
content = (
|
151
|
+
"Missing data visualizations (matrix, bar, and heatmap) have been generated."
|
152
|
+
)
|
147
153
|
artifact = encoded_plots
|
148
154
|
return content, artifact
|
149
155
|
|
150
156
|
|
151
|
-
|
152
|
-
|
153
|
-
def correlation_funnel(
|
157
|
+
@tool(response_format="content_and_artifact")
|
158
|
+
def generate_correlation_funnel(
|
154
159
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
155
160
|
target: str,
|
156
161
|
target_bin_index: Union[int, str] = -1,
|
@@ -160,10 +165,10 @@ def correlation_funnel(
|
|
160
165
|
name_infreq: str = "-OTHER",
|
161
166
|
) -> Tuple[str, Dict]:
|
162
167
|
"""
|
163
|
-
Tool:
|
168
|
+
Tool: generate_correlation_funnel
|
164
169
|
Description:
|
165
170
|
Correlation analysis using the correlation funnel method. The tool binarizes the data and computes correlation versus a target column.
|
166
|
-
|
171
|
+
|
167
172
|
Parameters:
|
168
173
|
----------
|
169
174
|
target : str
|
@@ -171,8 +176,8 @@ def correlation_funnel(
|
|
171
176
|
with this string followed by '__' (e.g., 'Member_Status__Gold', 'Member_Status__Platinum').
|
172
177
|
target_bin_index : int or str, default -1
|
173
178
|
If an integer, selects the target level by position from the matching columns.
|
174
|
-
If a string (e.g., "Yes"), attempts to match to the suffix of a column name
|
175
|
-
(i.e., 'target__Yes').
|
179
|
+
If a string (e.g., "Yes"), attempts to match to the suffix of a column name
|
180
|
+
(i.e., 'target__Yes').
|
176
181
|
corr_method : str
|
177
182
|
The correlation method ('pearson', 'kendall', or 'spearman'). Default is 'pearson'.
|
178
183
|
n_bins : int
|
@@ -182,34 +187,36 @@ def correlation_funnel(
|
|
182
187
|
name_infreq : str
|
183
188
|
The name to use for infrequent levels. Default is '-OTHER'.
|
184
189
|
"""
|
185
|
-
print(" * Tool:
|
190
|
+
print(" * Tool: generate_correlation_funnel")
|
186
191
|
try:
|
187
192
|
import pytimetk as tk
|
188
193
|
except ImportError:
|
189
|
-
raise ImportError(
|
194
|
+
raise ImportError(
|
195
|
+
"Please install the 'pytimetk' package to use this tool. pip install pytimetk"
|
196
|
+
)
|
190
197
|
import pandas as pd
|
191
198
|
import base64
|
192
199
|
from io import BytesIO
|
193
200
|
import matplotlib.pyplot as plt
|
194
201
|
import json
|
195
|
-
import plotly.graph_objects as go
|
196
202
|
import plotly.io as pio
|
197
|
-
from typing import Union
|
198
203
|
|
199
204
|
# Convert the raw injected state into a DataFrame.
|
200
205
|
df = pd.DataFrame(data_raw)
|
201
|
-
|
206
|
+
|
202
207
|
# Apply the binarization method.
|
203
208
|
df_binarized = df.binarize(
|
204
|
-
n_bins=n_bins,
|
205
|
-
thresh_infreq=thresh_infreq,
|
206
|
-
name_infreq=name_infreq,
|
207
|
-
one_hot=True
|
209
|
+
n_bins=n_bins,
|
210
|
+
thresh_infreq=thresh_infreq,
|
211
|
+
name_infreq=name_infreq,
|
212
|
+
one_hot=True,
|
208
213
|
)
|
209
|
-
|
214
|
+
|
210
215
|
# Determine the full target column name.
|
211
216
|
# Look for all columns that start with "target__"
|
212
|
-
matching_columns = [
|
217
|
+
matching_columns = [
|
218
|
+
col for col in df_binarized.columns if col.startswith(f"{target}__")
|
219
|
+
]
|
213
220
|
if not matching_columns:
|
214
221
|
# If no matching columns are found, warn and use the provided target as-is.
|
215
222
|
full_target = target
|
@@ -230,14 +237,15 @@ def correlation_funnel(
|
|
230
237
|
except IndexError:
|
231
238
|
# If index is out of bounds, use the last matching column.
|
232
239
|
full_target = matching_columns[-1]
|
233
|
-
|
240
|
+
|
234
241
|
# Compute correlation funnel using the full target column name.
|
235
242
|
df_correlated = df_binarized.correlate(target=full_target, method=corr_method)
|
236
|
-
|
243
|
+
|
237
244
|
# Attempt to generate a static plot.
|
245
|
+
encoded = None
|
238
246
|
try:
|
239
247
|
# Here we assume that your DataFrame has a method plot_correlation_funnel.
|
240
|
-
fig = df_correlated.plot_correlation_funnel(engine=
|
248
|
+
fig = df_correlated.plot_correlation_funnel(engine="plotnine", height=600)
|
241
249
|
buf = BytesIO()
|
242
250
|
# Use the appropriate save method for your figure object.
|
243
251
|
fig.save(buf, format="png")
|
@@ -246,17 +254,21 @@ def correlation_funnel(
|
|
246
254
|
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
|
247
255
|
except Exception as e:
|
248
256
|
encoded = {"error": str(e)}
|
249
|
-
|
257
|
+
|
250
258
|
# Attempt to generate a Plotly plot.
|
259
|
+
fig_dict = None
|
251
260
|
try:
|
252
|
-
fig = df_correlated.plot_correlation_funnel(engine=
|
261
|
+
fig = df_correlated.plot_correlation_funnel(engine="plotly", base_size=14)
|
262
|
+
|
253
263
|
fig_json = pio.to_json(fig)
|
254
264
|
fig_dict = json.loads(fig_json)
|
255
265
|
except Exception as e:
|
256
266
|
fig_dict = {"error": str(e)}
|
257
267
|
|
258
|
-
content = (
|
259
|
-
|
268
|
+
content = (
|
269
|
+
f"Correlation funnel computed using method '{corr_method}' for target level '{full_target}'. "
|
270
|
+
f"Base target was '{target}' with target_bin_index '{target_bin_index}'."
|
271
|
+
)
|
260
272
|
artifact = {
|
261
273
|
"correlation_data": df_correlated.to_dict(orient="list"),
|
262
274
|
"plot_image": encoded,
|
@@ -265,8 +277,7 @@ def correlation_funnel(
|
|
265
277
|
return content, artifact
|
266
278
|
|
267
279
|
|
268
|
-
|
269
|
-
@tool(response_format='content_and_artifact')
|
280
|
+
@tool(response_format="content_and_artifact")
|
270
281
|
def generate_sweetviz_report(
|
271
282
|
data_raw: Annotated[dict, InjectedState("data_raw")],
|
272
283
|
target: str = None,
|
@@ -278,7 +289,7 @@ def generate_sweetviz_report(
|
|
278
289
|
Tool: generate_sweetviz_report
|
279
290
|
Description:
|
280
291
|
Make an Exploratory Data Analysis (EDA) report using the Sweetviz library.
|
281
|
-
|
292
|
+
|
282
293
|
Parameters:
|
283
294
|
-----------
|
284
295
|
data_raw : dict
|
@@ -288,11 +299,11 @@ def generate_sweetviz_report(
|
|
288
299
|
report_name : str, optional
|
289
300
|
The file name to save the Sweetviz HTML report. Default is "sweetviz_report.html".
|
290
301
|
report_directory : str, optional
|
291
|
-
The directory where the report should be saved.
|
302
|
+
The directory where the report should be saved.
|
292
303
|
If None, a temporary directory is created and used.
|
293
304
|
open_browser : bool, optional
|
294
305
|
Whether to open the report in a web browser. Default is False.
|
295
|
-
|
306
|
+
|
296
307
|
Returns:
|
297
308
|
--------
|
298
309
|
Tuple[str, Dict]:
|
@@ -305,13 +316,15 @@ def generate_sweetviz_report(
|
|
305
316
|
try:
|
306
317
|
import sweetviz as sv
|
307
318
|
except ImportError:
|
308
|
-
raise ImportError(
|
309
|
-
|
319
|
+
raise ImportError(
|
320
|
+
"Please install the 'sweetviz' package to use this tool. Run: pip install sweetviz"
|
321
|
+
)
|
322
|
+
|
310
323
|
import pandas as pd
|
311
|
-
|
324
|
+
|
312
325
|
# Convert injected raw data to a DataFrame.
|
313
326
|
df = pd.DataFrame(data_raw)
|
314
|
-
|
327
|
+
|
315
328
|
# If no directory is specified, use a temporary directory.
|
316
329
|
if not report_directory:
|
317
330
|
report_directory = tempfile.mkdtemp()
|
@@ -320,26 +333,26 @@ def generate_sweetviz_report(
|
|
320
333
|
# Ensure user-specified directory exists.
|
321
334
|
if not os.path.exists(report_directory):
|
322
335
|
os.makedirs(report_directory)
|
323
|
-
|
336
|
+
|
324
337
|
# Create the Sweetviz report.
|
325
338
|
report = sv.analyze(df, target_feat=target)
|
326
|
-
|
339
|
+
|
327
340
|
# Determine the full path for the report.
|
328
341
|
full_report_path = os.path.join(report_directory, report_name)
|
329
|
-
|
342
|
+
|
330
343
|
# Save the report to the specified HTML file.
|
331
344
|
report.show_html(
|
332
345
|
filepath=full_report_path,
|
333
346
|
open_browser=open_browser,
|
334
347
|
)
|
335
|
-
|
348
|
+
|
336
349
|
# Optionally, read the HTML content (if desired to pass along in the artifact).
|
337
350
|
try:
|
338
351
|
with open(full_report_path, "r", encoding="utf-8") as f:
|
339
352
|
html_content = f.read()
|
340
353
|
except Exception:
|
341
354
|
html_content = None
|
342
|
-
|
355
|
+
|
343
356
|
content = (
|
344
357
|
f"Sweetviz EDA report generated and saved as '{os.path.abspath(full_report_path)}'. "
|
345
358
|
f"{'This was saved in a temporary directory.' if 'tmp' in report_directory else ''}"
|
@@ -350,3 +363,53 @@ def generate_sweetviz_report(
|
|
350
363
|
}
|
351
364
|
return content, artifact
|
352
365
|
|
366
|
+
|
367
|
+
@tool(response_format="content_and_artifact")
|
368
|
+
def generate_dtale_report(
|
369
|
+
data_raw: Annotated[dict, InjectedState("data_raw")],
|
370
|
+
host: str = "localhost",
|
371
|
+
port: int = 40000,
|
372
|
+
open_browser: bool = False,
|
373
|
+
) -> Tuple[str, Dict]:
|
374
|
+
"""
|
375
|
+
Tool: generate_dtale_report
|
376
|
+
Description:
|
377
|
+
Creates an interactive data exploration report using the dtale library.
|
378
|
+
|
379
|
+
Parameters:
|
380
|
+
-----------
|
381
|
+
data_raw : dict
|
382
|
+
The raw data in dictionary format.
|
383
|
+
host : str, optional
|
384
|
+
The host IP address to serve the dtale app. Default is "localhost".
|
385
|
+
port : int, optional
|
386
|
+
The port number to serve the dtale app. Default is 40000.
|
387
|
+
open_browser : bool, optional
|
388
|
+
Whether to open the report in a web browser. Default is False.
|
389
|
+
|
390
|
+
Returns:
|
391
|
+
--------
|
392
|
+
Tuple[str, Dict]:
|
393
|
+
content: A summary message describing the dtale report.
|
394
|
+
artifact: A dictionary containing the URL of the dtale report.
|
395
|
+
"""
|
396
|
+
print(" * Tool: generate_dtale_report")
|
397
|
+
|
398
|
+
try:
|
399
|
+
import dtale
|
400
|
+
except ImportError:
|
401
|
+
raise ImportError(
|
402
|
+
"Please install the 'dtale' package to use this tool. Run: pip install dtale"
|
403
|
+
)
|
404
|
+
|
405
|
+
import pandas as pd
|
406
|
+
|
407
|
+
df = pd.DataFrame(data_raw)
|
408
|
+
|
409
|
+
# Create the dtale report
|
410
|
+
d = dtale.show(df, host=host, port=port, open_browser=open_browser)
|
411
|
+
|
412
|
+
content = f"Dtale report generated and available at: {d.main_url()}"
|
413
|
+
artifact = {"dtale_url": d.main_url()}
|
414
|
+
|
415
|
+
return content, artifact
|
{ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: ai-data-science-team
|
3
|
-
Version: 0.0.0.
|
3
|
+
Version: 0.0.0.9016
|
4
4
|
Summary: Build and run an AI-powered data science team.
|
5
5
|
Home-page: https://github.com/business-science/ai-data-science-team
|
6
6
|
Author: Matt Dancho
|
@@ -47,6 +47,7 @@ Dynamic: classifier
|
|
47
47
|
Dynamic: description
|
48
48
|
Dynamic: description-content-type
|
49
49
|
Dynamic: home-page
|
50
|
+
Dynamic: license-file
|
50
51
|
Dynamic: provides-extra
|
51
52
|
Dynamic: requires-dist
|
52
53
|
Dynamic: requires-python
|
@@ -97,9 +98,8 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
97
98
|
- [Companies That Want A Custom AI Data Science Team (And AI Apps)](#companies-that-want-a-custom-ai-data-science-team-and-ai-apps)
|
98
99
|
- [Generative AI for Data Scientists Workshop](#generative-ai-for-data-scientists-workshop)
|
99
100
|
- [Data Science Agents](#data-science-agents)
|
101
|
+
- [🔥 NEW: Data Science Apps](#-new-data-science-apps)
|
100
102
|
- [NEW: Multi-Agents](#new-multi-agents)
|
101
|
-
- [Data Science Apps](#data-science-apps)
|
102
|
-
- [Apps Available Now](#apps-available-now)
|
103
103
|
- [🔥 Agentic Applications](#-agentic-applications)
|
104
104
|
- [Agents Available Now](#agents-available-now)
|
105
105
|
- [Standard Agents](#standard-agents)
|
@@ -110,11 +110,11 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
110
110
|
- [Disclaimer](#disclaimer)
|
111
111
|
- [Installation](#installation)
|
112
112
|
- [Usage](#usage)
|
113
|
-
- [Example
|
114
|
-
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
113
|
+
- [Example: H2O Machine Learning Agent](#example-h2o-machine-learning-agent)
|
115
114
|
- [Contributing](#contributing)
|
116
115
|
- [License](#license)
|
117
116
|
- [Want To Become A Full-Stack Generative AI Data Scientist?](#want-to-become-a-full-stack-generative-ai-data-scientist)
|
117
|
+
- [⭐️ Star History](#️-star-history)
|
118
118
|
|
119
119
|
## Companies That Want A Custom AI Data Science Team (And AI Apps)
|
120
120
|
|
@@ -134,21 +134,24 @@ This project is a work in progress. New data science agents will be released soo
|
|
134
134
|
|
135
135
|

|
136
136
|
|
137
|
-
### NEW:
|
137
|
+
### 🔥 NEW: Data Science Apps
|
138
138
|
|
139
|
-
|
139
|
+
**🔥 Open Pandas AI Data Analyst:** Load an Excel or CSV file and ask it questions. Get data and charts back.
|
140
140
|
|
141
|
-

|
142
142
|
|
143
|
-
|
143
|
+
**🔥 SQL Database Agent:** Connects any SQL Database, generates SQL queries from natural language, and returns data as a downloadable table.
|
144
144
|
|
145
|
-
|
145
|
+
**🔥 Exploratory Data Copilot:** An AI-powered data science app that performs automated exploratory data analysis (EDA) with EDA Reporting, Missing Data Analysis, Correlation Analysis, and more.
|
146
146
|
|
147
|
-
|
147
|
+
[See all available apps here](/apps)
|
148
148
|
|
149
|
-
###
|
149
|
+
### NEW: Multi-Agents
|
150
|
+
|
151
|
+
**🔥 Pandas Data Analyst Agent:** Combines the ability to wrangle, transform, and analyze data with an optional data visualization agent that can create interactive plots.
|
152
|
+
|
153
|
+

|
150
154
|
|
151
|
-
[See all available apps here](/apps)
|
152
155
|
|
153
156
|
#### 🔥 Agentic Applications
|
154
157
|
|
@@ -182,7 +185,8 @@ This is a top secret project I'm working on. It's a multi-agent data science app
|
|
182
185
|
|
183
186
|
#### Multi-Agents
|
184
187
|
|
185
|
-
1.
|
188
|
+
1. **🔥🔥 Pandas Data Analyst Agent:** Combines the ability to wrangle, transform, and analyze data with an optional data visualization agent that can create interactive plots. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/multiagents/pandas_data_analyst.ipynb)
|
189
|
+
2. **🔥🔥 SQL Data Analyst Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations. Includes a Data Visualization Agent that creates visualizations to help you understand your data. [See Example](https://github.com/business-science/ai-data-science-team/blob/master/examples/multiagents/sql_data_analyst.ipynb)
|
186
190
|
|
187
191
|
### Agents Coming Soon
|
188
192
|
|
@@ -204,6 +208,14 @@ By using this software, you agree to use it solely for learning purposes.
|
|
204
208
|
|
205
209
|
## Installation
|
206
210
|
|
211
|
+
You can install via PyPI (note that this is a beta version and breaking changes may occur until 0.1.0):
|
212
|
+
|
213
|
+
``` bash
|
214
|
+
pip install ai-data-science-team
|
215
|
+
```
|
216
|
+
|
217
|
+
Or, if you want the latest version from GitHub:
|
218
|
+
|
207
219
|
``` bash
|
208
220
|
pip install git+https://github.com/business-science/ai-data-science-team.git --upgrade
|
209
221
|
```
|
@@ -212,55 +224,46 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
|
|
212
224
|
|
213
225
|
[See all examples here.](/examples)
|
214
226
|
|
215
|
-
### Example
|
227
|
+
### Example: H2O Machine Learning Agent
|
216
228
|
|
217
|
-
[See the full example here.](/examples/
|
229
|
+
[See the full example here.](https://github.com/business-science/ai-data-science-team/blob/master/examples/ml_agents/h2o_machine_learning_agent.ipynb)
|
218
230
|
|
219
231
|
``` python
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
232
|
+
# Import libraries
|
233
|
+
from langchain_openai import ChatOpenAI
|
234
|
+
import pandas as pd
|
235
|
+
import h2o
|
236
|
+
import os
|
237
|
+
from ai_data_science_team.ml_agents import H2OMLAgent
|
238
|
+
|
239
|
+
# Load the data
|
240
|
+
df = pd.read_csv("data/churn_data.csv")
|
241
|
+
df
|
242
|
+
|
243
|
+
# Initialize the language model
|
244
|
+
os.environ['OPENAI_API_KEY'] = "YOUR_OPENAI_API_KEY"
|
245
|
+
llm = ChatOpenAI(model=MODEL)
|
246
|
+
llm
|
247
|
+
|
248
|
+
# Initialize the H2O ML Agent
|
249
|
+
ml_agent = H2OMLAgent(
|
250
|
+
model=llm,
|
251
|
+
log=True,
|
252
|
+
log_path="logs/",
|
253
|
+
model_directory="h2o_models/",
|
254
|
+
enable_mlflow=True, # Use this if you wish to log models to MLflow
|
227
255
|
)
|
228
|
-
|
229
|
-
|
230
|
-
``` bash
|
231
|
-
---FEATURE ENGINEERING AGENT----
|
232
|
-
* CREATE FEATURE ENGINEER CODE
|
233
|
-
* EXECUTING AGENT CODE
|
234
|
-
* EXPLAIN AGENT CODE
|
235
|
-
```
|
236
|
-
|
237
|
-
``` python
|
238
|
-
feature_engineering_agent.get_data_engineered()
|
239
|
-
```
|
240
|
-
|
241
|
-
### Example 2: Cleaning Data with the Data Cleaning Agent
|
242
|
-
|
243
|
-
[See the full example here.](/examples/data_cleaning_agent.ipynb)
|
244
|
-
|
245
|
-
``` python
|
246
|
-
data_cleaning_agent = DataCleaningAgent(model = llm)
|
256
|
+
ml_agent
|
247
257
|
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
258
|
+
# Run the agent
|
259
|
+
ml_agent.invoke_agent(
|
260
|
+
data_raw=df.drop(columns=["customerID"]),
|
261
|
+
user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
|
262
|
+
target_variable="Churn"
|
252
263
|
)
|
253
|
-
```
|
254
264
|
|
255
|
-
|
256
|
-
|
257
|
-
* CREATE DATA CLEANER CODE
|
258
|
-
* EXECUTING AGENT CODE
|
259
|
-
* EXPLAIN AGENT CODE
|
260
|
-
```
|
261
|
-
|
262
|
-
``` python
|
263
|
-
data_cleaning_agent.get_data_cleaned()
|
265
|
+
# Retrieve and display the leaderboard of models
|
266
|
+
ml_agent.get_leaderboard()
|
264
267
|
```
|
265
268
|
|
266
269
|
## Contributing
|
@@ -281,4 +284,8 @@ This project is licensed under the MIT License. See LICENSE file for details.
|
|
281
284
|
|
282
285
|
I teach Generative AI Data Science to help you build AI-powered data science apps. [**Register for my next Generative AI for Data Scientists workshop here.**](https://learn.business-science.io/ai-register)
|
283
286
|
|
287
|
+
# ⭐️ Star History
|
288
|
+
|
289
|
+
[](https://star-history.com/#)
|
284
290
|
|
291
|
+
[**Please ⭐ us on GitHub (it takes 2 seconds and means a lot).**](https://github.com/business-science/ai-data-science-team)
|
{ai_data_science_team-0.0.0.9014.dist-info → ai_data_science_team-0.0.0.9016.dist-info}/RECORD
RENAMED
@@ -1,23 +1,23 @@
|
|
1
1
|
ai_data_science_team/__init__.py,sha256=LmogkhGnxvvVe1ukJM6I6lXy4B7SuCr5eXZpwjyDMKQ,444
|
2
|
-
ai_data_science_team/_version.py,sha256=
|
2
|
+
ai_data_science_team/_version.py,sha256=CuRBSRSns8bxBgkn7Hp4BqQhLmZGuLWdyc2Xq7zO6ww,27
|
3
3
|
ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
|
4
4
|
ai_data_science_team/agents/__init__.py,sha256=Gnotza9SKr_0IxuaX8k1nsZK48wXkkeZcGcrR1EqNks,668
|
5
5
|
ai_data_science_team/agents/data_cleaning_agent.py,sha256=aZLhnN2EBlY_hmAg_r73dwi1w5utSFNEgEs8aWl8Cho,27991
|
6
6
|
ai_data_science_team/agents/data_loader_tools_agent.py,sha256=TFKzYqV6cvU-sMbfL-hg8-NgF_Hz3nysGFldvb5K3fM,9327
|
7
|
-
ai_data_science_team/agents/data_visualization_agent.py,sha256=
|
8
|
-
ai_data_science_team/agents/data_wrangling_agent.py,sha256=
|
7
|
+
ai_data_science_team/agents/data_visualization_agent.py,sha256=IHNagAVY4XIRfyKKj3jdJZV0vUpzBqqnQBVbzP1lZj0,29829
|
8
|
+
ai_data_science_team/agents/data_wrangling_agent.py,sha256=jyBrEfLsgIqSF6xcmRgnkzvNqJfkXdjn6FDefQij62o,33439
|
9
9
|
ai_data_science_team/agents/feature_engineering_agent.py,sha256=xZGDFnmM6wx4bi3e4c_dNOZzGcxBmX8k0iveL7dlA-k,31608
|
10
10
|
ai_data_science_team/agents/sql_database_agent.py,sha256=fln8unefn5Jd2exeyGs-9PljyLXAK60HI81tJACYeCY,31726
|
11
11
|
ai_data_science_team/ds_agents/__init__.py,sha256=dnuagUTebTDHhGXbCt-hZIilzXMSUwyHaEI7sOxhvoE,95
|
12
|
-
ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=
|
12
|
+
ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=RiwpAp2dIZyN1kRNk7WBUI5KsiP14dLuHm8fhOCsKCk,8228
|
13
13
|
ai_data_science_team/ds_agents/modeling_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
14
|
ai_data_science_team/ml_agents/__init__.py,sha256=qq3UlDCRV_z4FHQ1jj3YR6zPbA6kuCvYCisj_bHYfO4,190
|
15
15
|
ai_data_science_team/ml_agents/h2o_ml_agent.py,sha256=S0uayngaVwVUyA4zy05QYlq5NXrNHb723NeF2rns0Y0,33934
|
16
16
|
ai_data_science_team/ml_agents/h2o_ml_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
ai_data_science_team/ml_agents/mlflow_tools_agent.py,sha256=QImaZnS8hPdrU7GI6pZ0dUDO-LXx40MSA3XyMDppIh0,12003
|
18
18
|
ai_data_science_team/multiagents/__init__.py,sha256=5tpmZBQ_UT5SKDCS_NivZhN19HEStKIcstiqSXPXDl0,208
|
19
|
-
ai_data_science_team/multiagents/pandas_data_analyst.py,sha256=
|
20
|
-
ai_data_science_team/multiagents/sql_data_analyst.py,sha256=
|
19
|
+
ai_data_science_team/multiagents/pandas_data_analyst.py,sha256=6JvcGFvDH7_ozRo-RQvjA_to5R27c7ZSEdKt4VQGL6U,13935
|
20
|
+
ai_data_science_team/multiagents/sql_data_analyst.py,sha256=ZZx3Edzff6zf27iPl8lUGoqaZkPaJQtCJIgNx9wdCZY,18232
|
21
21
|
ai_data_science_team/multiagents/supervised_data_analyst.py,sha256=uduCYpicga-UCf9nPQktQggW96-HDlqvioYmEdWejtI,158
|
22
22
|
ai_data_science_team/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
ai_data_science_team/parsers/parsers.py,sha256=hIsMZXRHz9hqs8R1ebymKA7D6NxOf5UVMpDAr_gGhE8,2027
|
@@ -26,7 +26,7 @@ ai_data_science_team/templates/agent_templates.py,sha256=QHRNZVmIfeClEef2Fr2Wb9J
|
|
26
26
|
ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
27
|
ai_data_science_team/tools/data_loader.py,sha256=ITs_6UAJ0m9h68R9_LruiaJSElv9l7SxTQYryI7YZPY,14702
|
28
28
|
ai_data_science_team/tools/dataframe.py,sha256=cckplDWu9SsA_PRo89pYsyVCmBE0PoDIwMv6tuLunT4,4572
|
29
|
-
ai_data_science_team/tools/eda.py,sha256=
|
29
|
+
ai_data_science_team/tools/eda.py,sha256=ycE_VAgeDoJyZpt6jjprID-D3ocseYTdzlry-qiSc5w,14201
|
30
30
|
ai_data_science_team/tools/h2o.py,sha256=gSK0f2FULfAfipFTTjDMUS6DjHwFFvvl4jxshr6QpS0,38997
|
31
31
|
ai_data_science_team/tools/mlflow.py,sha256=8NTkSOvbTk01GOmwFaMkLBRse80w9Kk7Ypi6Fv4kTII,29475
|
32
32
|
ai_data_science_team/tools/sql.py,sha256=vvz_CiOg6GqXo2_mlF4kq5IS6if79dpaizAgLR9sRyg,4784
|
@@ -37,8 +37,8 @@ ai_data_science_team/utils/matplotlib.py,sha256=d6DZfCXvZ5Kocxtsp92etIymKW2cRBcU
|
|
37
37
|
ai_data_science_team/utils/messages.py,sha256=feWIPGsv8ly9jpNnS97SoPsn1feaY1Km0VCbHTbRpI8,549
|
38
38
|
ai_data_science_team/utils/plotly.py,sha256=nST-NG0oizKVHhH6HsjHUpTUumq9bCccBdxjuaJWnVQ,504
|
39
39
|
ai_data_science_team/utils/regex.py,sha256=lwarbLqTA2VfNQSyqKCl-PBlH_0WH3zXZvYGBYGUiu4,5144
|
40
|
-
ai_data_science_team-0.0.0.
|
41
|
-
ai_data_science_team-0.0.0.
|
42
|
-
ai_data_science_team-0.0.0.
|
43
|
-
ai_data_science_team-0.0.0.
|
44
|
-
ai_data_science_team-0.0.0.
|
40
|
+
ai_data_science_team-0.0.0.9016.dist-info/licenses/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
|
41
|
+
ai_data_science_team-0.0.0.9016.dist-info/METADATA,sha256=Fxmv56STouZdBJurMyf98VgpATeLYajJlmIDtgsbPXg,13746
|
42
|
+
ai_data_science_team-0.0.0.9016.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
43
|
+
ai_data_science_team-0.0.0.9016.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
|
44
|
+
ai_data_science_team-0.0.0.9016.dist-info/RECORD,,
|
File without changes
|
File without changes
|