ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -5
- ai_data_science_team/agents/data_cleaning_agent.py +268 -116
- ai_data_science_team/agents/data_visualization_agent.py +470 -41
- ai_data_science_team/agents/data_wrangling_agent.py +471 -31
- ai_data_science_team/agents/feature_engineering_agent.py +426 -41
- ai_data_science_team/agents/sql_database_agent.py +458 -58
- ai_data_science_team/ml_agents/__init__.py +1 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +3 -1
- ai_data_science_team/templates/agent_templates.py +319 -43
- ai_data_science_team/tools/metadata.py +94 -62
- ai_data_science_team/tools/regex.py +86 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
- ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
- ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.0.9007"
+__version__ = "0.0.0.9009"
ai_data_science_team/agents/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
-from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
-from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
-from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
-from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
-
+from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
+from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
+from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
+from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
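The updated `__init__.py` exports a class-based interface for each agent alongside the existing `make_*` factory functions. A minimal sketch of the two import styles (the constructor call mirrors the `DataCleaningAgent` docstring added further down; treat the exact arguments as an assumption):

```python
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import (
    DataCleaningAgent,           # new class-based interface
    make_data_cleaning_agent,    # existing factory returning a compiled graph
)

llm = ChatOpenAI(model="gpt-4o-mini")

# Class-based usage (documented in the docstring added below)
agent = DataCleaningAgent(model=llm)

# Factory usage is unchanged and still returns the compiled LangGraph app
app = make_data_cleaning_agent(model=llm)
```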
ai_data_science_team/agents/data_cleaning_agent.py
CHANGED
@@ -13,21 +13,28 @@ from langchain_core.messages import BaseMessage
 from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver

-from langgraph.graph.state import CompiledStateGraph
-
 import os
-import
+import json
 import pandas as pd

+from IPython.display import Markdown
+
 from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
-
-    create_coding_agent_graph
+    node_func_report_agent_outputs,
+    create_coding_agent_graph,
+    BaseAgent,
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import
+from ai_data_science_team.tools.regex import (
+    relocate_imports_inside_function,
+    add_comments_to_top,
+    format_agent_name,
+    format_recommended_steps,
+    get_generic_summary,
+)
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function

@@ -36,9 +43,110 @@ AGENT_NAME = "data_cleaning_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")


-
 # Class
-class DataCleaningAgent(CompiledStateGraph):
+class DataCleaningAgent(BaseAgent):
+    """
+    Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
+    The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
+    and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
+
+    The agent performs the following default cleaning steps unless instructed otherwise:
+
+    - Removing columns with more than 40% missing values.
+    - Imputing missing values with the mean for numeric columns.
+    - Imputing missing values with the mode for categorical columns.
+    - Converting columns to appropriate data types.
+    - Removing duplicate rows.
+    - Removing rows with missing values.
+    - Removing rows with extreme outliers (values 3x the interquartile range).
+
+    User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
+
+    Parameters
+    ----------
+    model : langchain.llms.base.LLM
+        The language model used to generate the data cleaning function.
+    n_samples : int, optional
+        Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
+        avoid exceeding the model's token limits.
+    log : bool, optional
+        Whether to log the generated code and errors. Defaults to False.
+    log_path : str, optional
+        Directory path for storing log files. Defaults to None.
+    file_name : str, optional
+        Name of the file for saving the generated response. Defaults to "data_cleaner.py".
+    function_name : str, optional
+        Name of the generated data cleaning function. Defaults to "data_cleaner".
+    overwrite : bool, optional
+        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
+    human_in_the_loop : bool, optional
+        Enables user review of data cleaning instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        If True, skips the default recommended cleaning steps. Defaults to False.
+    bypass_explain_code : bool, optional
+        If True, skips the step that provides code explanations. Defaults to False.
+
+    Methods
+    -------
+    update_params(**kwargs)
+        Updates the agent's parameters and rebuilds the compiled state graph.
+    ainvoke_agent(user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0)
+        Cleans the provided dataset asynchronously based on user instructions.
+    invoke_agent(user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0)
+        Cleans the provided dataset synchronously based on user instructions.
+    get_workflow_summary()
+        Retrieves a summary of the agent's workflow.
+    get_log_summary()
+        Retrieves a summary of logged operations if logging is enabled.
+    get_state_keys()
+        Returns a list of keys from the state graph response.
+    get_state_properties()
+        Returns detailed properties of the state graph response.
+    get_data_cleaned()
+        Retrieves the cleaned dataset as a pandas DataFrame.
+    get_data_raw()
+        Retrieves the raw dataset as a pandas DataFrame.
+    get_data_cleaner_function()
+        Retrieves the generated Python function used for cleaning the data.
+    get_recommended_cleaning_steps()
+        Retrieves the agent's recommended cleaning steps.
+    get_response()
+        Returns the response from the agent as a dictionary.
+    show()
+        Displays the agent's mermaid diagram.
+
+    Examples
+    --------
+    ```python
+    import pandas as pd
+    from langchain_openai import ChatOpenAI
+    from ai_data_science_team.agents import DataCleaningAgent
+
+    llm = ChatOpenAI(model="gpt-4o-mini")
+
+    data_cleaning_agent = DataCleaningAgent(
+        model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
+    )
+
+    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
+
+    data_cleaning_agent.invoke_agent(
+        user_instructions="Don't remove outliers when cleaning the data.",
+        data_raw=df,
+        max_retries=3,
+        retry_count=0
+    )
+
+    cleaned_data = data_cleaning_agent.get_data_cleaned()
+
+    response = data_cleaning_agent.response
+    ```
+
+    Returns
+    --------
+    DataCleaningAgent : langchain.graphs.CompiledStateGraph
+        A data cleaning agent implemented as a compiled state graph.
+    """

     def __init__(
         self,
@@ -47,6 +155,7 @@ class DataCleaningAgent(CompiledStateGraph):
         log=False,
         log_path=None,
         file_name="data_cleaner.py",
+        function_name="data_cleaner",
         overwrite=True,
         human_in_the_loop=False,
         bypass_recommended_steps=False,
@@ -58,6 +167,7 @@ class DataCleaningAgent(CompiledStateGraph):
             "log": log,
             "log_path": log_path,
             "file_name": file_name,
+            "function_name": function_name,
             "overwrite": overwrite,
             "human_in_the_loop": human_in_the_loop,
             "bypass_recommended_steps": bypass_recommended_steps,
@@ -67,102 +177,104 @@ class DataCleaningAgent(CompiledStateGraph):
         self.response = None

     def _make_compiled_graph(self):
-        self.response = None
-        return make_data_cleaning_agent(**self._params)
-
-    def update_params(self, **kwargs):
         """
-
-        e.g. agent.update_params(model=new_llm, n_samples=100)
+        Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
         """
-        self.
-
+        self.response=None
+        return make_data_cleaning_agent(**self._params)

-    def
-        """
-        Delegate attribute access to `_compiled_graph` if `name` is not
-        found in this instance. This 'inherits' methods from the compiled graph.
-        """
-        return getattr(self._compiled_graph, name)
-
-    def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
+    def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
         """
-
+        Asynchronously invokes the agent. The response is stored in the response attribute.

         Parameters:
-
-        data_raw (pd.DataFrame):
-
-
+        ----------
+            data_raw (pd.DataFrame):
+                The raw dataset to be cleaned.
+            user_instructions (str):
+                Instructions for data cleaning agent.
+            max_retries (int):
+                Maximum retry attempts for cleaning.
+            retry_count (int):
+                Current retry attempt.
+            **kwargs
+                Additional keyword arguments to pass to ainvoke().

         Returns:
+        --------
             None. The response is stored in the response attribute.
         """
-        response = self.ainvoke({
+        response = self._compiled_graph.ainvoke({
             "user_instructions": user_instructions,
             "data_raw": data_raw.to_dict(),
             "max_retries": max_retries,
             "retry_count": retry_count,
-        })
+        }, **kwargs)
         self.response = response
         return None

-    def
+    def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
         """
-
+        Invokes the agent. The response is stored in the response attribute.

         Parameters:
-
-        data_raw (pd.DataFrame):
-
-
+        ----------
+            data_raw (pd.DataFrame):
+                The raw dataset to be cleaned.
+            user_instructions (str):
+                Instructions for data cleaning agent.
+            max_retries (int):
+                Maximum retry attempts for cleaning.
+            retry_count (int):
+                Current retry attempt.
+            **kwargs
+                Additional keyword arguments to pass to invoke().

         Returns:
+        --------
             None. The response is stored in the response attribute.
         """
-        response = self.invoke({
+        response = self._compiled_graph.invoke({
             "user_instructions": user_instructions,
             "data_raw": data_raw.to_dict(),
             "max_retries": max_retries,
             "retry_count": retry_count,
-        })
+        },**kwargs)
         self.response = response
         return None

-    def
+    def get_workflow_summary(self, markdown=False):
         """
-
-
-        Returns:
-            str: Explanation of the cleaning steps.
+        Retrieves the agent's workflow summary, if logging is enabled.
         """
-
-
-
-
+        if self.response and self.response.get("messages"):
+            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
+            if markdown:
+                return Markdown(summary)
+            else:
+                return summary
+
+    def get_log_summary(self, markdown=False):
         """
         Logs a summary of the agent's operations, if logging is enabled.
         """
         if self.response:
-            if self.
-                log_details = f"
-
-
-
-
-
-
-
-
-
-        Returns a list of keys that the state graph returns in a response.
-        """
-        return self.get_output_jsonschema()['properties']
+            if self.response.get('data_cleaner_function_path'):
+                log_details = f"""
+## Data Cleaning Agent Log Summary:
+
+Function Path: {self.response.get('data_cleaner_function_path')}
+
+Function Name: {self.response.get('data_cleaner_function_name')}
+                """
+                if markdown:
+                    return Markdown(log_details)
+                else:
+                    return log_details

     def get_data_cleaned(self):
         """
-        Retrieves the cleaned data stored after running
+        Retrieves the cleaned data stored after running invoke_agent or clean_data methods.
         """
         if self.response:
             return pd.DataFrame(self.response.get("data_cleaned"))
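`invoke_agent()` / `ainvoke_agent()` now forward extra keyword arguments to the compiled graph's `invoke()` / `ainvoke()`, and the new summary helpers can render their output as Markdown. A usage sketch (the CSV path is illustrative, and the `config`/`thread_id` kwarg is an assumption based on LangGraph's standard checkpointer configuration):

```python
import pandas as pd
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import DataCleaningAgent

llm = ChatOpenAI(model="gpt-4o-mini")
agent = DataCleaningAgent(model=llm, log=True)

df = pd.read_csv("data/churn_data.csv")  # illustrative path

# Extra kwargs are passed straight through to the compiled graph's invoke()
agent.invoke_agent(
    data_raw=df,
    user_instructions="Don't remove outliers when cleaning the data.",
    config={"configurable": {"thread_id": "cleaning-session-1"}},  # assumed LangGraph config
)

agent.get_workflow_summary(markdown=True)        # parsed from the final "messages" entry
agent.get_log_summary(markdown=True)             # function path and name, when log=True
agent.get_data_cleaner_function(markdown=True)   # generated code rendered as a fenced block
```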
@@ -174,15 +286,25 @@ class DataCleaningAgent(CompiledStateGraph):
         if self.response:
             return pd.DataFrame(self.response.get("data_raw"))

-    def get_data_cleaner_function(self):
+    def get_data_cleaner_function(self, markdown=False):
         """
         Retrieves the agent's pipeline function.
         """
         if self.response:
-
-
-
-
+            if markdown:
+                return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
+            else:
+                return self.response.get("data_cleaner_function")
+
+    def get_recommended_cleaning_steps(self, markdown=False):
+        """
+        Retrieves the agent's recommended cleaning steps
+        """
+        if self.response:
+            if markdown:
+                return Markdown(self.response.get('recommended_steps'))
+            else:
+                return self.response.get('recommended_steps')



@@ -194,6 +316,7 @@ def make_data_cleaning_agent(
     log=False,
     log_path=None,
     file_name="data_cleaner.py",
+    function_name="data_cleaner",
     overwrite = True,
     human_in_the_loop=False,
     bypass_recommended_steps=False,
@@ -235,6 +358,8 @@ def make_data_cleaning_agent(
         "logs/".
     file_name : str, optional
         The name of the file to save the response to. Defaults to "data_cleaner.py".
+    function_name : str, optional
+        The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
@@ -275,6 +400,11 @@ def make_data_cleaning_agent(
     """
     llm = model

+    # Human in th loop requires recommended steps
+    if bypass_recommended_steps and human_in_the_loop:
+        bypass_recommended_steps = False
+        print("Bypass recommended steps set to False to enable human in the loop.")
+
     # Setup Log Directory
     if log:
         if log_path is None:
@@ -292,6 +422,7 @@ def make_data_cleaning_agent(
         all_datasets_summary: str
         data_cleaner_function: str
         data_cleaner_function_path: str
+        data_cleaner_file_name: str
         data_cleaner_function_name: str
         data_cleaner_error: str
         max_retries: int
@@ -342,7 +473,7 @@ def make_data_cleaning_agent(
             Below are summaries of all datasets provided:
             {all_datasets_summary}

-            Return
+            Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.

             Avoid these:
             1. Do not include steps to save files.
@@ -366,7 +497,7 @@ def make_data_cleaning_agent(
         })

         return {
-            "recommended_steps": "
+            "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
             "all_datasets_summary": all_datasets_summary_str
         }

@@ -386,42 +517,44 @@ def make_data_cleaning_agent(
         else:
             all_datasets_summary_str = state.get("all_datasets_summary")

+
         data_cleaning_prompt = PromptTemplate(
             template="""
-            You are a Data Cleaning Agent. Your job is to create a
-
+            You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
+
             Recommended Steps:
             {recommended_steps}
-
+
             You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
-
+
             Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:

             {all_datasets_summary}
-
-            Return Python code in ```python
-
+
+            Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
+
             Return code to provide the data cleaning function:
-
-            def
+
+            def {function_name}(data_raw):
                 import pandas as pd
                 import numpy as np
                 ...
                 return data_cleaned
-
+
             Best Practices and Error Preventions:
-
+
             Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.

             """,
-            input_variables=["recommended_steps", "all_datasets_summary"]
+            input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
         )

         data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()

         response = data_cleaning_agent.invoke({
             "recommended_steps": state.get("recommended_steps"),
-            "all_datasets_summary": all_datasets_summary_str
+            "all_datasets_summary": all_datasets_summary_str,
+            "function_name": function_name
         })

         response = relocate_imports_inside_function(response)
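For reference, the prompt above constrains the model to a single `{function_name}(data_raw)` definition with all imports inside it. A hand-written sketch of the kind of function it targets, using the default cleaning steps listed in the class docstring (illustrative only, not output produced by the agent):

```python
def data_cleaner(data_raw):
    # All imports live inside the function, as the prompt requires
    import pandas as pd
    import numpy as np
    from sklearn.impute import SimpleImputer

    data_cleaned = pd.DataFrame(data_raw).copy()

    # Drop columns with more than 40% missing values (default step)
    keep_threshold = int(0.6 * len(data_cleaned))
    data_cleaned = data_cleaned.dropna(axis=1, thresh=keep_threshold)

    # Impute numeric columns with the mean; .ravel() flattens the 2D output of
    # fit_transform() before assigning it to a 1D column, per the best-practice note
    imputer = SimpleImputer(strategy="mean")
    for col in data_cleaned.select_dtypes(include=np.number).columns:
        data_cleaned[col] = imputer.fit_transform(data_cleaned[[col]]).ravel()

    # Remove duplicate rows
    data_cleaned = data_cleaned.drop_duplicates()

    return data_cleaned
```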
@@ -439,19 +572,37 @@ def make_data_cleaning_agent(
         return {
             "data_cleaner_function" : response,
             "data_cleaner_function_path": file_path,
-            "
+            "data_cleaner_file_name": file_name_2,
+            "data_cleaner_function_name": function_name,
             "all_datasets_summary": all_datasets_summary_str
         }
+
+    # Human Review
+
+    prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"

-
-
-
-
-
-
-
-
-
+    if not bypass_explain_code:
+        def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= 'explain_data_cleaner_code',
+                no_goto="recommend_cleaning_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="data_cleaner_function",
+            )
+    else:
+        def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= '__end__',
+                no_goto="recommend_cleaning_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="data_cleaner_function",
+            )

     def execute_data_cleaner_code(state):
         return node_func_execute_agent_code_on_data(
@@ -460,7 +611,7 @@ def make_data_cleaning_agent(
             result_key="data_cleaned",
             error_key="data_cleaner_error",
             code_snippet_key="data_cleaner_function",
-            agent_function_name="
+            agent_function_name=state.get("data_cleaner_function_name"),
             pre_processing=lambda data: pd.DataFrame.from_dict(data),
             post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during data cleaning: "
@@ -468,11 +619,11 @@ def make_data_cleaning_agent(

     def fix_data_cleaner_code(state: GraphState):
         data_cleaner_prompt = """
-        You are a Data Cleaning Agent. Your job is to create a
+        You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.

-        Make sure to only return the function definition for
+        Make sure to only return the function definition for {function_name}().

-        Return Python code in ```python``` format with a single function definition,
+        Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.

         This is the broken code (please fix):
         {code_snippet}
@@ -490,34 +641,34 @@ def make_data_cleaning_agent(
             agent_name=AGENT_NAME,
             log=log,
             file_path=state.get("data_cleaner_function_path"),
+            function_name=state.get("data_cleaner_function_name"),
         )

-
-
+    # Final reporting node
+    def report_agent_outputs(state: GraphState):
+        return node_func_report_agent_outputs(
             state=state,
-
+            keys_to_include=[
+                "recommended_steps",
+                "data_cleaner_function",
+                "data_cleaner_function_path",
+                "data_cleaner_function_name",
+                "data_cleaner_error",
+            ],
             result_key="messages",
-            error_key="data_cleaner_error",
-            llm=llm,
             role=AGENT_NAME,
-
-            Explain the data cleaning steps that the data cleaning agent performed in this function.
-            Keep the summary succinct and to the point.\n\n# Data Cleaning Agent:\n\n{code}
-            """,
-            success_prefix="# Data Cleaning Agent:\n\n ",
-            error_message="The Data Cleaning Agent encountered an error during data cleaning. Data could not be explained."
+            custom_title="Data Cleaning Agent Outputs"
         )
-
-    # Define the graph
+
     node_functions = {
         "recommend_cleaning_steps": recommend_cleaning_steps,
         "human_review": human_review,
         "create_data_cleaner_code": create_data_cleaner_code,
         "execute_data_cleaner_code": execute_data_cleaner_code,
         "fix_data_cleaner_code": fix_data_cleaner_code,
-        "
+        "report_agent_outputs": report_agent_outputs,
     }
-
+
     app = create_coding_agent_graph(
         GraphState=GraphState,
         node_functions=node_functions,
@@ -525,16 +676,17 @@ def make_data_cleaning_agent(
         create_code_node_name="create_data_cleaner_code",
         execute_code_node_name="execute_data_cleaner_code",
         fix_code_node_name="fix_data_cleaner_code",
-        explain_code_node_name="
+        explain_code_node_name="report_agent_outputs",
         error_key="data_cleaner_error",
-        human_in_the_loop=human_in_the_loop,
+        human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
         checkpointer=MemorySaver() if human_in_the_loop else None,
         bypass_recommended_steps=bypass_recommended_steps,
         bypass_explain_code=bypass_explain_code,
     )
-
+
     return app
+



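The factory function still returns the compiled LangGraph app directly. A minimal sketch of invoking it with the same state keys used by `invoke_agent()` above (model choice and CSV path are illustrative):

```python
import pandas as pd
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import make_data_cleaning_agent

llm = ChatOpenAI(model="gpt-4o-mini")
app = make_data_cleaning_agent(model=llm)

df = pd.read_csv("data/churn_data.csv")  # illustrative path

# Same state keys that DataCleaningAgent.invoke_agent() builds internally
response = app.invoke({
    "user_instructions": "Don't remove outliers when cleaning the data.",
    "data_raw": df.to_dict(),
    "max_retries": 3,
    "retry_count": 0,
})

data_cleaned = pd.DataFrame(response.get("data_cleaned"))
print(response.get("data_cleaner_function"))
```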