ai-data-science-team 0.0.0.9005__py3-none-any.whl → 0.0.0.9007__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +3 -1
- ai_data_science_team/agents/data_cleaning_agent.py +213 -20
- ai_data_science_team/agents/data_visualization_agent.py +331 -0
- ai_data_science_team/agents/data_wrangling_agent.py +66 -24
- ai_data_science_team/agents/feature_engineering_agent.py +50 -13
- ai_data_science_team/agents/sql_database_agent.py +397 -0
- ai_data_science_team/templates/__init__.py +8 -0
- ai_data_science_team/templates/agent_templates.py +154 -37
- ai_data_science_team/tools/logging.py +1 -1
- ai_data_science_team/tools/metadata.py +230 -0
- ai_data_science_team/tools/regex.py +7 -1
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA +43 -22
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +21 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/WHEEL +1 -1
- ai_data_science_team/tools/data_analysis.py +0 -116
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +0 -19
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.0.
|
1
|
+
__version__ = "0.0.0.9007"
|
@@ -1,4 +1,6 @@
|
|
1
|
-
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
|
1
|
+
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
|
2
2
|
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
|
3
3
|
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
|
4
|
+
from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
|
5
|
+
from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
|
4
6
|
|
@@ -13,11 +13,13 @@ from langchain_core.messages import BaseMessage
|
|
13
13
|
from langgraph.types import Command
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
|
+
from langgraph.graph.state import CompiledStateGraph
|
17
|
+
|
16
18
|
import os
|
17
19
|
import io
|
18
20
|
import pandas as pd
|
19
21
|
|
20
|
-
from ai_data_science_team.templates
|
22
|
+
from ai_data_science_team.templates import(
|
21
23
|
node_func_execute_agent_code_on_data,
|
22
24
|
node_func_human_review,
|
23
25
|
node_func_fix_agent_code,
|
@@ -25,17 +27,178 @@ from ai_data_science_team.templates.agent_templates import(
|
|
25
27
|
create_coding_agent_graph
|
26
28
|
)
|
27
29
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
29
|
-
from ai_data_science_team.tools.
|
30
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
|
31
|
+
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
32
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
33
|
|
32
34
|
# Setup
|
33
35
|
AGENT_NAME = "data_cleaning_agent"
|
34
36
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
37
|
|
38
|
+
|
39
|
+
|
40
|
+
# Class
|
41
|
+
class DataCleaningAgent(CompiledStateGraph):
|
42
|
+
|
43
|
+
def __init__(
|
44
|
+
self,
|
45
|
+
model,
|
46
|
+
n_samples=30,
|
47
|
+
log=False,
|
48
|
+
log_path=None,
|
49
|
+
file_name="data_cleaner.py",
|
50
|
+
overwrite=True,
|
51
|
+
human_in_the_loop=False,
|
52
|
+
bypass_recommended_steps=False,
|
53
|
+
bypass_explain_code=False
|
54
|
+
):
|
55
|
+
self._params = {
|
56
|
+
"model": model,
|
57
|
+
"n_samples": n_samples,
|
58
|
+
"log": log,
|
59
|
+
"log_path": log_path,
|
60
|
+
"file_name": file_name,
|
61
|
+
"overwrite": overwrite,
|
62
|
+
"human_in_the_loop": human_in_the_loop,
|
63
|
+
"bypass_recommended_steps": bypass_recommended_steps,
|
64
|
+
"bypass_explain_code": bypass_explain_code,
|
65
|
+
}
|
66
|
+
self._compiled_graph = self._make_compiled_graph()
|
67
|
+
self.response = None
|
68
|
+
|
69
|
+
def _make_compiled_graph(self):
|
70
|
+
self.response = None
|
71
|
+
return make_data_cleaning_agent(**self._params)
|
72
|
+
|
73
|
+
def update_params(self, **kwargs):
|
74
|
+
"""
|
75
|
+
Update one or more parameters at once, then rebuild the compiled graph.
|
76
|
+
e.g. agent.update_params(model=new_llm, n_samples=100)
|
77
|
+
"""
|
78
|
+
self._params.update(kwargs)
|
79
|
+
self._compiled_graph = self._make_compiled_graph()
|
80
|
+
|
81
|
+
def __getattr__(self, name: str):
|
82
|
+
"""
|
83
|
+
Delegate attribute access to `_compiled_graph` if `name` is not
|
84
|
+
found in this instance. This 'inherits' methods from the compiled graph.
|
85
|
+
"""
|
86
|
+
return getattr(self._compiled_graph, name)
|
87
|
+
|
88
|
+
def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
|
89
|
+
"""
|
90
|
+
Cleans the provided dataset based on user instructions.
|
91
|
+
|
92
|
+
Parameters:
|
93
|
+
user_instructions (str): Instructions for data cleaning.
|
94
|
+
data_raw (pd.DataFrame): The raw dataset to be cleaned.
|
95
|
+
max_retries (int): Maximum retry attempts for cleaning.
|
96
|
+
retry_count (int): Current retry attempt.
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
None. The response is stored in the response attribute.
|
100
|
+
"""
|
101
|
+
response = self.ainvoke({
|
102
|
+
"user_instructions": user_instructions,
|
103
|
+
"data_raw": data_raw.to_dict(),
|
104
|
+
"max_retries": max_retries,
|
105
|
+
"retry_count": retry_count,
|
106
|
+
})
|
107
|
+
self.response = response
|
108
|
+
return None
|
109
|
+
|
110
|
+
def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
|
111
|
+
"""
|
112
|
+
Cleans the provided dataset based on user instructions.
|
113
|
+
|
114
|
+
Parameters:
|
115
|
+
user_instructions (str): Instructions for data cleaning.
|
116
|
+
data_raw (pd.DataFrame): The raw dataset to be cleaned.
|
117
|
+
max_retries (int): Maximum retry attempts for cleaning.
|
118
|
+
retry_count (int): Current retry attempt.
|
119
|
+
|
120
|
+
Returns:
|
121
|
+
None. The response is stored in the response attribute.
|
122
|
+
"""
|
123
|
+
response = self.invoke({
|
124
|
+
"user_instructions": user_instructions,
|
125
|
+
"data_raw": data_raw.to_dict(),
|
126
|
+
"max_retries": max_retries,
|
127
|
+
"retry_count": retry_count,
|
128
|
+
})
|
129
|
+
self.response = response
|
130
|
+
return None
|
131
|
+
|
132
|
+
def explain_cleaning_steps(self):
|
133
|
+
"""
|
134
|
+
Provides an explanation of the cleaning steps performed by the agent.
|
135
|
+
|
136
|
+
Returns:
|
137
|
+
str: Explanation of the cleaning steps.
|
138
|
+
"""
|
139
|
+
messages = self.response.get("messages", [])
|
140
|
+
return messages
|
141
|
+
|
142
|
+
def get_log_summary(self):
|
143
|
+
"""
|
144
|
+
Returns a summary string containing the logged function's file path, if a response is available and logging is enabled.
|
145
|
+
"""
|
146
|
+
if self.response:
|
147
|
+
if self.log:
|
148
|
+
log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
|
149
|
+
return log_details
|
150
|
+
|
151
|
+
def get_state_keys(self):
|
152
|
+
"""
|
153
|
+
Returns a list of keys that the state graph returns in a response.
|
154
|
+
"""
|
155
|
+
return list(self.get_output_jsonschema()['properties'].keys())
|
156
|
+
|
157
|
+
def get_state_properties(self):
|
158
|
+
"""
|
159
|
+
Returns the properties of the state graph's output JSON schema.
|
160
|
+
"""
|
161
|
+
return self.get_output_jsonschema()['properties']
|
162
|
+
|
163
|
+
def get_data_cleaned(self):
|
164
|
+
"""
|
165
|
+
Retrieves the cleaned data stored in the response after running the invoke or ainvoke methods.
|
166
|
+
"""
|
167
|
+
if self.response:
|
168
|
+
return pd.DataFrame(self.response.get("data_cleaned"))
|
169
|
+
|
170
|
+
def get_data_raw(self):
|
171
|
+
"""
|
172
|
+
Retrieves the raw data.
|
173
|
+
"""
|
174
|
+
if self.response:
|
175
|
+
return pd.DataFrame(self.response.get("data_raw"))
|
176
|
+
|
177
|
+
def get_data_cleaner_function(self):
|
178
|
+
"""
|
179
|
+
Retrieves the agent's pipeline function.
|
180
|
+
"""
|
181
|
+
if self.response:
|
182
|
+
return self.response.get("data_cleaner_function")
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
|
36
189
|
# Agent
|
37
190
|
|
38
|
-
def make_data_cleaning_agent(
|
191
|
+
def make_data_cleaning_agent(
|
192
|
+
model,
|
193
|
+
n_samples = 30,
|
194
|
+
log=False,
|
195
|
+
log_path=None,
|
196
|
+
file_name="data_cleaner.py",
|
197
|
+
overwrite = True,
|
198
|
+
human_in_the_loop=False,
|
199
|
+
bypass_recommended_steps=False,
|
200
|
+
bypass_explain_code=False
|
201
|
+
):
|
39
202
|
"""
|
40
203
|
Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
|
41
204
|
ways, such as removing columns with more than 40% missing values, imputing missing
|
@@ -44,9 +207,9 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
44
207
|
The agent takes in a dataset and some user instructions, and outputs a python
|
45
208
|
function that can be used to clean the dataset. The agent also logs the code
|
46
209
|
generated and any errors that occur.
|
47
|
-
|
210
|
+
|
48
211
|
The agent is instructed to perform the following data cleaning steps:
|
49
|
-
|
212
|
+
|
50
213
|
- Removing columns if more than 40 percent of the data is missing
|
51
214
|
- Imputing missing values with the mean of the column if the column is numeric
|
52
215
|
- Imputing missing values with the mode of the column if the column is categorical
|
@@ -60,17 +223,27 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
60
223
|
----------
|
61
224
|
model : langchain.llms.base.LLM
|
62
225
|
The language model to use to generate code.
|
226
|
+
n_samples : int, optional
|
227
|
+
The number of samples to use when summarizing the dataset. Defaults to 30.
|
228
|
+
If you get an error due to maximum tokens, try reducing this number.
|
229
|
+
> "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
|
63
230
|
log : bool, optional
|
64
231
|
Whether or not to log the code generated and any errors that occur.
|
65
232
|
Defaults to False.
|
66
233
|
log_path : str, optional
|
67
234
|
The path to the directory where the log files should be stored. Defaults to
|
68
235
|
"logs/".
|
236
|
+
file_name : str, optional
|
237
|
+
The name of the file to save the response to. Defaults to "data_cleaner.py".
|
69
238
|
overwrite : bool, optional
|
70
239
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
71
240
|
Defaults to True.
|
72
241
|
human_in_the_loop : bool, optional
|
73
242
|
Whether or not to use human in the loop. If True, adds an interrupt and human-in-the-loop step that asks the user to review the data cleaning instructions. Defaults to False.
|
243
|
+
bypass_recommended_steps : bool, optional
|
244
|
+
Bypass the recommendation step, by default False.
|
245
|
+
bypass_explain_code : bool, optional
|
246
|
+
Bypass the code explanation step, by default False.
|
74
247
|
|
75
248
|
Examples
|
76
249
|
-------
|
@@ -78,26 +251,26 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
78
251
|
import pandas as pd
|
79
252
|
from langchain_openai import ChatOpenAI
|
80
253
|
from ai_data_science_team.agents import make_data_cleaning_agent
|
81
|
-
|
254
|
+
|
82
255
|
llm = ChatOpenAI(model = "gpt-4o-mini")
|
83
256
|
|
84
257
|
data_cleaning_agent = make_data_cleaning_agent(llm)
|
85
|
-
|
258
|
+
|
86
259
|
df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
|
87
|
-
|
260
|
+
|
88
261
|
response = data_cleaning_agent.invoke({
|
89
262
|
"user_instructions": "Don't remove outliers when cleaning the data.",
|
90
263
|
"data_raw": df.to_dict(),
|
91
264
|
"max_retries":3,
|
92
265
|
"retry_count":0
|
93
266
|
})
|
94
|
-
|
267
|
+
|
95
268
|
pd.DataFrame(response['data_cleaned'])
|
96
269
|
```
|
97
270
|
|
98
271
|
Returns
|
99
272
|
-------
|
100
|
-
app : langchain.graphs.
|
273
|
+
app : langgraph.graph.state.CompiledStateGraph
|
101
274
|
The data cleaning agent as a state graph.
|
102
275
|
"""
|
103
276
|
llm = model
|
@@ -130,7 +303,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
130
303
|
Recommend a series of data cleaning steps based on the input data.
|
131
304
|
These recommended steps will be appended to the user_instructions.
|
132
305
|
"""
|
133
|
-
print(
|
306
|
+
print(format_agent_name(AGENT_NAME))
|
134
307
|
print(" * RECOMMEND CLEANING STEPS")
|
135
308
|
|
136
309
|
# Prompt to get recommended steps from the LLM
|
@@ -173,6 +346,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
173
346
|
|
174
347
|
Avoid these:
|
175
348
|
1. Do not include steps to save files.
|
349
|
+
2. Do not include unrelated user instructions that are not related to the data cleaning.
|
176
350
|
""",
|
177
351
|
input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
|
178
352
|
)
|
@@ -180,7 +354,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
180
354
|
data_raw = state.get("data_raw")
|
181
355
|
df = pd.DataFrame.from_dict(data_raw)
|
182
356
|
|
183
|
-
all_datasets_summary =
|
357
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
184
358
|
|
185
359
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
186
360
|
|
@@ -197,8 +371,21 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
197
371
|
}
|
198
372
|
|
199
373
|
def create_data_cleaner_code(state: GraphState):
|
374
|
+
|
200
375
|
print(" * CREATE DATA CLEANER CODE")
|
201
376
|
|
377
|
+
if bypass_recommended_steps:
|
378
|
+
print(format_agent_name(AGENT_NAME))
|
379
|
+
|
380
|
+
data_raw = state.get("data_raw")
|
381
|
+
df = pd.DataFrame.from_dict(data_raw)
|
382
|
+
|
383
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
384
|
+
|
385
|
+
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
386
|
+
else:
|
387
|
+
all_datasets_summary_str = state.get("all_datasets_summary")
|
388
|
+
|
202
389
|
data_cleaning_prompt = PromptTemplate(
|
203
390
|
template="""
|
204
391
|
You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
|
@@ -212,7 +399,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
212
399
|
|
213
400
|
{all_datasets_summary}
|
214
401
|
|
215
|
-
Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that
|
402
|
+
Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
|
216
403
|
|
217
404
|
Return code to provide the data cleaning function:
|
218
405
|
|
@@ -234,16 +421,16 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
234
421
|
|
235
422
|
response = data_cleaning_agent.invoke({
|
236
423
|
"recommended_steps": state.get("recommended_steps"),
|
237
|
-
"all_datasets_summary":
|
424
|
+
"all_datasets_summary": all_datasets_summary_str
|
238
425
|
})
|
239
426
|
|
240
427
|
response = relocate_imports_inside_function(response)
|
241
428
|
response = add_comments_to_top(response, agent_name=AGENT_NAME)
|
242
429
|
|
243
430
|
# For logging: store the code generated:
|
244
|
-
file_path,
|
431
|
+
file_path, file_name_2 = log_ai_function(
|
245
432
|
response=response,
|
246
|
-
file_name=
|
433
|
+
file_name=file_name,
|
247
434
|
log=log,
|
248
435
|
log_path=log_path,
|
249
436
|
overwrite=overwrite
|
@@ -252,7 +439,8 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
252
439
|
return {
|
253
440
|
"data_cleaner_function" : response,
|
254
441
|
"data_cleaner_function_path": file_path,
|
255
|
-
"data_cleaner_function_name":
|
442
|
+
"data_cleaner_function_name": file_name_2,
|
443
|
+
"all_datasets_summary": all_datasets_summary_str
|
256
444
|
}
|
257
445
|
|
258
446
|
def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
|
@@ -274,7 +462,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
274
462
|
code_snippet_key="data_cleaner_function",
|
275
463
|
agent_function_name="data_cleaner",
|
276
464
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
277
|
-
post_processing=lambda df: df.to_dict(),
|
465
|
+
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
278
466
|
error_message_prefix="An error occurred during data cleaning: "
|
279
467
|
)
|
280
468
|
|
@@ -341,7 +529,12 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
341
529
|
error_key="data_cleaner_error",
|
342
530
|
human_in_the_loop=human_in_the_loop, # or False
|
343
531
|
human_review_node_name="human_review",
|
344
|
-
checkpointer=MemorySaver() if human_in_the_loop else None
|
532
|
+
checkpointer=MemorySaver() if human_in_the_loop else None,
|
533
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
534
|
+
bypass_explain_code=bypass_explain_code,
|
345
535
|
)
|
346
536
|
|
347
537
|
return app
|
538
|
+
|
539
|
+
|
540
|
+
|