ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -4
- ai_data_science_team/agents/data_cleaning_agent.py +225 -84
- ai_data_science_team/agents/data_visualization_agent.py +460 -27
- ai_data_science_team/agents/data_wrangling_agent.py +455 -16
- ai_data_science_team/agents/feature_engineering_agent.py +429 -25
- ai_data_science_team/agents/sql_database_agent.py +367 -21
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +286 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +2 -1
- ai_data_science_team/templates/agent_templates.py +247 -42
- ai_data_science_team/tools/regex.py +28 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/METADATA +76 -28
- ai_data_science_team-0.0.0.9008.dist-info/RECORD +26 -0
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.0.
|
1
|
+
__version__ = "0.0.0.9008"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
|
2
|
-
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
|
3
|
-
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
|
4
|
-
from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
|
5
|
-
from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
|
2
|
+
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
|
3
|
+
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
|
4
|
+
from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
|
5
|
+
from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
|
6
6
|
|
@@ -13,21 +13,22 @@ from langchain_core.messages import BaseMessage
|
|
13
13
|
from langgraph.types import Command
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
|
-
from langgraph.graph.state import CompiledStateGraph
|
17
|
-
|
18
16
|
import os
|
19
17
|
import io
|
20
18
|
import pandas as pd
|
21
19
|
|
20
|
+
from IPython.display import Markdown
|
21
|
+
|
22
22
|
from ai_data_science_team.templates import(
|
23
23
|
node_func_execute_agent_code_on_data,
|
24
24
|
node_func_human_review,
|
25
25
|
node_func_fix_agent_code,
|
26
26
|
node_func_explain_agent_code,
|
27
|
-
create_coding_agent_graph
|
27
|
+
create_coding_agent_graph,
|
28
|
+
BaseAgent,
|
28
29
|
)
|
29
30
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
30
|
-
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
|
31
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name, format_recommended_steps
|
31
32
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
32
33
|
from ai_data_science_team.tools.logging import log_ai_function
|
33
34
|
|
@@ -38,7 +39,109 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
|
38
39
|
|
39
40
|
|
40
41
|
# Class
|
41
|
-
class DataCleaningAgent(
|
42
|
+
class DataCleaningAgent(BaseAgent):
|
43
|
+
"""
|
44
|
+
Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
|
45
|
+
The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
|
46
|
+
and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
|
47
|
+
|
48
|
+
The agent performs the following default cleaning steps unless instructed otherwise:
|
49
|
+
|
50
|
+
- Removing columns with more than 40% missing values.
|
51
|
+
- Imputing missing values with the mean for numeric columns.
|
52
|
+
- Imputing missing values with the mode for categorical columns.
|
53
|
+
- Converting columns to appropriate data types.
|
54
|
+
- Removing duplicate rows.
|
55
|
+
- Removing rows with missing values.
|
56
|
+
- Removing rows with extreme outliers (values 3x the interquartile range).
|
57
|
+
|
58
|
+
User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
|
59
|
+
|
60
|
+
Parameters
|
61
|
+
----------
|
62
|
+
model : langchain.llms.base.LLM
|
63
|
+
The language model used to generate the data cleaning function.
|
64
|
+
n_samples : int, optional
|
65
|
+
Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
|
66
|
+
avoid exceeding the model's token limits.
|
67
|
+
log : bool, optional
|
68
|
+
Whether to log the generated code and errors. Defaults to False.
|
69
|
+
log_path : str, optional
|
70
|
+
Directory path for storing log files. Defaults to None.
|
71
|
+
file_name : str, optional
|
72
|
+
Name of the file for saving the generated response. Defaults to "data_cleaner.py".
|
73
|
+
function_name : str, optional
|
74
|
+
Name of the generated data cleaning function. Defaults to "data_cleaner".
|
75
|
+
overwrite : bool, optional
|
76
|
+
Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
|
77
|
+
human_in_the_loop : bool, optional
|
78
|
+
Enables user review of data cleaning instructions. Defaults to False.
|
79
|
+
bypass_recommended_steps : bool, optional
|
80
|
+
If True, skips the default recommended cleaning steps. Defaults to False.
|
81
|
+
bypass_explain_code : bool, optional
|
82
|
+
If True, skips the step that provides code explanations. Defaults to False.
|
83
|
+
|
84
|
+
Methods
|
85
|
+
-------
|
86
|
+
update_params(**kwargs)
|
87
|
+
Updates the agent's parameters and rebuilds the compiled state graph.
|
88
|
+
ainvoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
|
89
|
+
Cleans the provided dataset asynchronously based on user instructions.
|
90
|
+
invoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
|
91
|
+
Cleans the provided dataset synchronously based on user instructions.
|
92
|
+
explain_cleaning_steps()
|
93
|
+
Returns an explanation of the cleaning steps performed by the agent.
|
94
|
+
get_log_summary()
|
95
|
+
Retrieves a summary of logged operations if logging is enabled.
|
96
|
+
get_state_keys()
|
97
|
+
Returns a list of keys from the state graph response.
|
98
|
+
get_state_properties()
|
99
|
+
Returns detailed properties of the state graph response.
|
100
|
+
get_data_cleaned()
|
101
|
+
Retrieves the cleaned dataset as a pandas DataFrame.
|
102
|
+
get_data_raw()
|
103
|
+
Retrieves the raw dataset as a pandas DataFrame.
|
104
|
+
get_data_cleaner_function()
|
105
|
+
Retrieves the generated Python function used for cleaning the data.
|
106
|
+
get_recommended_cleaning_steps()
|
107
|
+
Retrieves the agent's recommended cleaning steps.
|
108
|
+
get_response()
|
109
|
+
Returns the response from the agent as a dictionary.
|
110
|
+
show()
|
111
|
+
Displays the agent's mermaid diagram.
|
112
|
+
|
113
|
+
Examples
|
114
|
+
--------
|
115
|
+
```python
|
116
|
+
import pandas as pd
|
117
|
+
from langchain_openai import ChatOpenAI
|
118
|
+
from ai_data_science_team.agents import DataCleaningAgent
|
119
|
+
|
120
|
+
llm = ChatOpenAI(model="gpt-4o-mini")
|
121
|
+
|
122
|
+
data_cleaning_agent = DataCleaningAgent(
|
123
|
+
model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
|
124
|
+
)
|
125
|
+
|
126
|
+
df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
|
127
|
+
|
128
|
+
data_cleaning_agent.invoke_agent(
|
129
|
+
user_instructions="Don't remove outliers when cleaning the data.",
|
130
|
+
data_raw=df,
|
131
|
+
max_retries=3,
|
132
|
+
retry_count=0
|
133
|
+
)
|
134
|
+
|
135
|
+
cleaned_data = data_cleaning_agent.get_data_cleaned()
|
136
|
+
|
137
|
+
response = data_cleaning_agent.response
|
138
|
+
```
|
139
|
+
|
140
|
+
Returns
|
141
|
+
--------
|
142
|
+
DataCleaningAgent : langchain.graphs.CompiledStateGraph
|
143
|
+
A data cleaning agent implemented as a compiled state graph.
|
144
|
+
"""
|
42
145
|
|
43
146
|
def __init__(
|
44
147
|
self,
|
@@ -47,6 +150,7 @@ class DataCleaningAgent(CompiledStateGraph):
|
|
47
150
|
log=False,
|
48
151
|
log_path=None,
|
49
152
|
file_name="data_cleaner.py",
|
153
|
+
function_name="data_cleaner",
|
50
154
|
overwrite=True,
|
51
155
|
human_in_the_loop=False,
|
52
156
|
bypass_recommended_steps=False,
|
@@ -58,6 +162,7 @@ class DataCleaningAgent(CompiledStateGraph):
|
|
58
162
|
"log": log,
|
59
163
|
"log_path": log_path,
|
60
164
|
"file_name": file_name,
|
165
|
+
"function_name": function_name,
|
61
166
|
"overwrite": overwrite,
|
62
167
|
"human_in_the_loop": human_in_the_loop,
|
63
168
|
"bypass_recommended_steps": bypass_recommended_steps,
|
@@ -67,65 +172,70 @@ class DataCleaningAgent(CompiledStateGraph):
|
|
67
172
|
self.response = None
|
68
173
|
|
69
174
|
def _make_compiled_graph(self):
|
70
|
-
self.response = None
|
71
|
-
return make_data_cleaning_agent(**self._params)
|
72
|
-
|
73
|
-
def update_params(self, **kwargs):
|
74
175
|
"""
|
75
|
-
|
76
|
-
e.g. agent.update_params(model=new_llm, n_samples=100)
|
176
|
+
Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
|
77
177
|
"""
|
78
|
-
self.
|
79
|
-
|
178
|
+
self.response=None
|
179
|
+
return make_data_cleaning_agent(**self._params)
|
80
180
|
|
81
|
-
def __getattr__(self, name: str):
|
82
|
-
"""
|
83
|
-
Delegate attribute access to `_compiled_graph` if `name` is not
|
84
|
-
found in this instance. This 'inherits' methods from the compiled graph.
|
85
|
-
"""
|
86
|
-
return getattr(self._compiled_graph, name)
|
87
181
|
|
88
|
-
def
|
182
|
+
def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
|
89
183
|
"""
|
90
|
-
|
184
|
+
Asynchronously invokes the agent. The response is stored in the response attribute.
|
91
185
|
|
92
186
|
Parameters:
|
93
|
-
|
94
|
-
data_raw (pd.DataFrame):
|
95
|
-
|
96
|
-
|
187
|
+
----------
|
188
|
+
data_raw (pd.DataFrame):
|
189
|
+
The raw dataset to be cleaned.
|
190
|
+
user_instructions (str):
|
191
|
+
Instructions for data cleaning agent.
|
192
|
+
max_retries (int):
|
193
|
+
Maximum retry attempts for cleaning.
|
194
|
+
retry_count (int):
|
195
|
+
Current retry attempt.
|
196
|
+
**kwargs
|
197
|
+
Additional keyword arguments to pass to ainvoke().
|
97
198
|
|
98
199
|
Returns:
|
200
|
+
--------
|
99
201
|
None. The response is stored in the response attribute.
|
100
202
|
"""
|
101
|
-
response = self.ainvoke({
|
203
|
+
response = self._compiled_graph.ainvoke({
|
102
204
|
"user_instructions": user_instructions,
|
103
205
|
"data_raw": data_raw.to_dict(),
|
104
206
|
"max_retries": max_retries,
|
105
207
|
"retry_count": retry_count,
|
106
|
-
})
|
208
|
+
}, **kwargs)
|
107
209
|
self.response = response
|
108
210
|
return None
|
109
211
|
|
110
|
-
def
|
212
|
+
def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
|
111
213
|
"""
|
112
|
-
|
214
|
+
Invokes the agent. The response is stored in the response attribute.
|
113
215
|
|
114
216
|
Parameters:
|
115
|
-
|
116
|
-
data_raw (pd.DataFrame):
|
117
|
-
|
118
|
-
|
217
|
+
----------
|
218
|
+
data_raw (pd.DataFrame):
|
219
|
+
The raw dataset to be cleaned.
|
220
|
+
user_instructions (str):
|
221
|
+
Instructions for data cleaning agent.
|
222
|
+
max_retries (int):
|
223
|
+
Maximum retry attempts for cleaning.
|
224
|
+
retry_count (int):
|
225
|
+
Current retry attempt.
|
226
|
+
**kwargs
|
227
|
+
Additional keyword arguments to pass to invoke().
|
119
228
|
|
120
229
|
Returns:
|
230
|
+
--------
|
121
231
|
None. The response is stored in the response attribute.
|
122
232
|
"""
|
123
|
-
response = self.invoke({
|
233
|
+
response = self._compiled_graph.invoke({
|
124
234
|
"user_instructions": user_instructions,
|
125
235
|
"data_raw": data_raw.to_dict(),
|
126
236
|
"max_retries": max_retries,
|
127
237
|
"retry_count": retry_count,
|
128
|
-
})
|
238
|
+
},**kwargs)
|
129
239
|
self.response = response
|
130
240
|
return None
|
131
241
|
|
@@ -139,30 +249,21 @@ class DataCleaningAgent(CompiledStateGraph):
|
|
139
249
|
messages = self.response.get("messages", [])
|
140
250
|
return messages
|
141
251
|
|
142
|
-
def get_log_summary(self):
|
252
|
+
def get_log_summary(self, markdown=False):
|
143
253
|
"""
|
144
254
|
Returns a summary of the agent's logged operations, if logging is enabled.
|
145
255
|
"""
|
146
256
|
if self.response:
|
147
|
-
if self.
|
257
|
+
if self.response.get('data_cleaner_function_path'):
|
148
258
|
log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
Returns a list of keys that the state graph returns in a response.
|
154
|
-
"""
|
155
|
-
return list(self.get_output_jsonschema()['properties'].keys())
|
156
|
-
|
157
|
-
def get_state_properties(self):
|
158
|
-
"""
|
159
|
-
Returns a list of keys that the state graph returns in a response.
|
160
|
-
"""
|
161
|
-
return self.get_output_jsonschema()['properties']
|
259
|
+
if markdown:
|
260
|
+
return Markdown(log_details)
|
261
|
+
else:
|
262
|
+
return log_details
|
162
263
|
|
163
264
|
def get_data_cleaned(self):
|
164
265
|
"""
|
165
|
-
Retrieves the cleaned data stored after running
|
266
|
+
Retrieves the cleaned data stored after running the invoke_agent or ainvoke_agent methods.
|
166
267
|
"""
|
167
268
|
if self.response:
|
168
269
|
return pd.DataFrame(self.response.get("data_cleaned"))
|
@@ -174,15 +275,25 @@ class DataCleaningAgent(CompiledStateGraph):
|
|
174
275
|
if self.response:
|
175
276
|
return pd.DataFrame(self.response.get("data_raw"))
|
176
277
|
|
177
|
-
def get_data_cleaner_function(self):
|
278
|
+
def get_data_cleaner_function(self, markdown=False):
|
178
279
|
"""
|
179
280
|
Retrieves the agent's pipeline function.
|
180
281
|
"""
|
181
282
|
if self.response:
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
283
|
+
if markdown:
|
284
|
+
return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
|
285
|
+
else:
|
286
|
+
return self.response.get("data_cleaner_function")
|
287
|
+
|
288
|
+
def get_recommended_cleaning_steps(self, markdown=False):
|
289
|
+
"""
|
290
|
+
Retrieves the agent's recommended cleaning steps
|
291
|
+
"""
|
292
|
+
if self.response:
|
293
|
+
if markdown:
|
294
|
+
return Markdown(self.response.get('recommended_steps'))
|
295
|
+
else:
|
296
|
+
return self.response.get('recommended_steps')
|
186
297
|
|
187
298
|
|
188
299
|
|
@@ -194,6 +305,7 @@ def make_data_cleaning_agent(
|
|
194
305
|
log=False,
|
195
306
|
log_path=None,
|
196
307
|
file_name="data_cleaner.py",
|
308
|
+
function_name="data_cleaner",
|
197
309
|
overwrite = True,
|
198
310
|
human_in_the_loop=False,
|
199
311
|
bypass_recommended_steps=False,
|
@@ -235,6 +347,8 @@ def make_data_cleaning_agent(
|
|
235
347
|
"logs/".
|
236
348
|
file_name : str, optional
|
237
349
|
The name of the file to save the response to. Defaults to "data_cleaner.py".
|
350
|
+
function_name : str, optional
|
351
|
+
The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
|
238
352
|
overwrite : bool, optional
|
239
353
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
240
354
|
Defaults to True.
|
@@ -275,6 +389,11 @@ def make_data_cleaning_agent(
|
|
275
389
|
"""
|
276
390
|
llm = model
|
277
391
|
|
392
|
+
# Human in the loop requires recommended steps
|
393
|
+
if bypass_recommended_steps and human_in_the_loop:
|
394
|
+
bypass_recommended_steps = False
|
395
|
+
print("Bypass recommended steps set to False to enable human in the loop.")
|
396
|
+
|
278
397
|
# Setup Log Directory
|
279
398
|
if log:
|
280
399
|
if log_path is None:
|
@@ -292,6 +411,7 @@ def make_data_cleaning_agent(
|
|
292
411
|
all_datasets_summary: str
|
293
412
|
data_cleaner_function: str
|
294
413
|
data_cleaner_function_path: str
|
414
|
+
data_cleaner_file_name: str
|
295
415
|
data_cleaner_function_name: str
|
296
416
|
data_cleaner_error: str
|
297
417
|
max_retries: int
|
@@ -366,7 +486,7 @@ def make_data_cleaning_agent(
|
|
366
486
|
})
|
367
487
|
|
368
488
|
return {
|
369
|
-
"recommended_steps": "
|
489
|
+
"recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
|
370
490
|
"all_datasets_summary": all_datasets_summary_str
|
371
491
|
}
|
372
492
|
|
@@ -386,42 +506,44 @@ def make_data_cleaning_agent(
|
|
386
506
|
else:
|
387
507
|
all_datasets_summary_str = state.get("all_datasets_summary")
|
388
508
|
|
509
|
+
|
389
510
|
data_cleaning_prompt = PromptTemplate(
|
390
511
|
template="""
|
391
|
-
You are a Data Cleaning Agent. Your job is to create a
|
392
|
-
|
512
|
+
You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
|
513
|
+
|
393
514
|
Recommended Steps:
|
394
515
|
{recommended_steps}
|
395
|
-
|
516
|
+
|
396
517
|
You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
|
397
|
-
|
518
|
+
|
398
519
|
Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:
|
399
520
|
|
400
521
|
{all_datasets_summary}
|
401
|
-
|
402
|
-
Return Python code in ```python
|
403
|
-
|
522
|
+
|
523
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
524
|
+
|
404
525
|
Return code to provide the data cleaning function:
|
405
|
-
|
406
|
-
def
|
526
|
+
|
527
|
+
def {function_name}(data_raw):
|
407
528
|
import pandas as pd
|
408
529
|
import numpy as np
|
409
530
|
...
|
410
531
|
return data_cleaned
|
411
|
-
|
532
|
+
|
412
533
|
Best Practices and Error Preventions:
|
413
|
-
|
534
|
+
|
414
535
|
Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
|
415
536
|
|
416
537
|
""",
|
417
|
-
input_variables=["recommended_steps", "all_datasets_summary"]
|
538
|
+
input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
|
418
539
|
)
|
419
540
|
|
420
541
|
data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
|
421
542
|
|
422
543
|
response = data_cleaning_agent.invoke({
|
423
544
|
"recommended_steps": state.get("recommended_steps"),
|
424
|
-
"all_datasets_summary": all_datasets_summary_str
|
545
|
+
"all_datasets_summary": all_datasets_summary_str,
|
546
|
+
"function_name": function_name
|
425
547
|
})
|
426
548
|
|
427
549
|
response = relocate_imports_inside_function(response)
|
@@ -439,19 +561,37 @@ def make_data_cleaning_agent(
|
|
439
561
|
return {
|
440
562
|
"data_cleaner_function" : response,
|
441
563
|
"data_cleaner_function_path": file_path,
|
442
|
-
"
|
564
|
+
"data_cleaner_file_name": file_name_2,
|
565
|
+
"data_cleaner_function_name": function_name,
|
443
566
|
"all_datasets_summary": all_datasets_summary_str
|
444
567
|
}
|
568
|
+
|
569
|
+
# Human Review
|
570
|
+
|
571
|
+
prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
445
572
|
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
573
|
+
if not bypass_explain_code:
|
574
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
|
575
|
+
return node_func_human_review(
|
576
|
+
state=state,
|
577
|
+
prompt_text=prompt_text_human_review,
|
578
|
+
yes_goto= 'explain_data_cleaner_code',
|
579
|
+
no_goto="recommend_cleaning_steps",
|
580
|
+
user_instructions_key="user_instructions",
|
581
|
+
recommended_steps_key="recommended_steps",
|
582
|
+
code_snippet_key="data_cleaner_function",
|
583
|
+
)
|
584
|
+
else:
|
585
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
|
586
|
+
return node_func_human_review(
|
587
|
+
state=state,
|
588
|
+
prompt_text=prompt_text_human_review,
|
589
|
+
yes_goto= '__end__',
|
590
|
+
no_goto="recommend_cleaning_steps",
|
591
|
+
user_instructions_key="user_instructions",
|
592
|
+
recommended_steps_key="recommended_steps",
|
593
|
+
code_snippet_key="data_cleaner_function",
|
594
|
+
)
|
455
595
|
|
456
596
|
def execute_data_cleaner_code(state):
|
457
597
|
return node_func_execute_agent_code_on_data(
|
@@ -460,7 +600,7 @@ def make_data_cleaning_agent(
|
|
460
600
|
result_key="data_cleaned",
|
461
601
|
error_key="data_cleaner_error",
|
462
602
|
code_snippet_key="data_cleaner_function",
|
463
|
-
agent_function_name="
|
603
|
+
agent_function_name=state.get("data_cleaner_function_name"),
|
464
604
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
465
605
|
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
466
606
|
error_message_prefix="An error occurred during data cleaning: "
|
@@ -468,11 +608,11 @@ def make_data_cleaning_agent(
|
|
468
608
|
|
469
609
|
def fix_data_cleaner_code(state: GraphState):
|
470
610
|
data_cleaner_prompt = """
|
471
|
-
You are a Data Cleaning Agent. Your job is to create a
|
611
|
+
You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
|
472
612
|
|
473
|
-
Make sure to only return the function definition for
|
613
|
+
Make sure to only return the function definition for {function_name}().
|
474
614
|
|
475
|
-
Return Python code in ```python``` format with a single function definition,
|
615
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
476
616
|
|
477
617
|
This is the broken code (please fix):
|
478
618
|
{code_snippet}
|
@@ -490,6 +630,7 @@ def make_data_cleaning_agent(
|
|
490
630
|
agent_name=AGENT_NAME,
|
491
631
|
log=log,
|
492
632
|
file_path=state.get("data_cleaner_function_path"),
|
633
|
+
function_name=state.get("data_cleaner_function_name"),
|
493
634
|
)
|
494
635
|
|
495
636
|
def explain_data_cleaner_code(state: GraphState):
|