ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (25) hide show
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +4 -5
  3. ai_data_science_team/agents/data_cleaning_agent.py +268 -116
  4. ai_data_science_team/agents/data_visualization_agent.py +470 -41
  5. ai_data_science_team/agents/data_wrangling_agent.py +471 -31
  6. ai_data_science_team/agents/feature_engineering_agent.py +426 -41
  7. ai_data_science_team/agents/sql_database_agent.py +458 -58
  8. ai_data_science_team/ml_agents/__init__.py +1 -0
  9. ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
  10. ai_data_science_team/multiagents/__init__.py +1 -0
  11. ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
  12. ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
  13. ai_data_science_team/templates/__init__.py +3 -1
  14. ai_data_science_team/templates/agent_templates.py +319 -43
  15. ai_data_science_team/tools/metadata.py +94 -62
  16. ai_data_science_team/tools/regex.py +86 -1
  17. ai_data_science_team/utils/__init__.py +0 -0
  18. ai_data_science_team/utils/plotly.py +24 -0
  19. ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
  20. ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
  21. ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
  22. ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
  23. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
  24. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
  25. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- __version__ = "0.0.0.9007"
1
+ __version__ = "0.0.0.9009"
@@ -1,6 +1,5 @@
1
1
  from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
2
- from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
3
- from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
4
- from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
5
- from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
6
-
2
+ from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
3
+ from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
4
+ from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
5
+ from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
@@ -13,21 +13,28 @@ from langchain_core.messages import BaseMessage
13
13
  from langgraph.types import Command
14
14
  from langgraph.checkpoint.memory import MemorySaver
15
15
 
16
- from langgraph.graph.state import CompiledStateGraph
17
-
18
16
  import os
19
- import io
17
+ import json
20
18
  import pandas as pd
21
19
 
20
+ from IPython.display import Markdown
21
+
22
22
  from ai_data_science_team.templates import(
23
23
  node_func_execute_agent_code_on_data,
24
24
  node_func_human_review,
25
25
  node_func_fix_agent_code,
26
- node_func_explain_agent_code,
27
- create_coding_agent_graph
26
+ node_func_report_agent_outputs,
27
+ create_coding_agent_graph,
28
+ BaseAgent,
28
29
  )
29
30
  from ai_data_science_team.tools.parsers import PythonOutputParser
30
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
31
+ from ai_data_science_team.tools.regex import (
32
+ relocate_imports_inside_function,
33
+ add_comments_to_top,
34
+ format_agent_name,
35
+ format_recommended_steps,
36
+ get_generic_summary,
37
+ )
31
38
  from ai_data_science_team.tools.metadata import get_dataframe_summary
32
39
  from ai_data_science_team.tools.logging import log_ai_function
33
40
 
@@ -36,9 +43,110 @@ AGENT_NAME = "data_cleaning_agent"
36
43
  LOG_PATH = os.path.join(os.getcwd(), "logs/")
37
44
 
38
45
 
39
-
40
46
  # Class
41
- class DataCleaningAgent(CompiledStateGraph):
47
+ class DataCleaningAgent(BaseAgent):
48
+ """
49
+ Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
50
+ The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
51
+ and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
52
+
53
+ The agent performs the following default cleaning steps unless instructed otherwise:
54
+
55
+ - Removing columns with more than 40% missing values.
56
+ - Imputing missing values with the mean for numeric columns.
57
+ - Imputing missing values with the mode for categorical columns.
58
+ - Converting columns to appropriate data types.
59
+ - Removing duplicate rows.
60
+ - Removing rows with missing values.
61
+ - Removing rows with extreme outliers (values 3x the interquartile range).
62
+
63
+ User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
64
+
65
+ Parameters
66
+ ----------
67
+ model : langchain.llms.base.LLM
68
+ The language model used to generate the data cleaning function.
69
+ n_samples : int, optional
70
+ Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
71
+ avoid exceeding the model's token limits.
72
+ log : bool, optional
73
+ Whether to log the generated code and errors. Defaults to False.
74
+ log_path : str, optional
75
+ Directory path for storing log files. Defaults to None.
76
+ file_name : str, optional
77
+ Name of the file for saving the generated response. Defaults to "data_cleaner.py".
78
+ function_name : str, optional
79
+ Name of the generated data cleaning function. Defaults to "data_cleaner".
80
+ overwrite : bool, optional
81
+ Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
82
+ human_in_the_loop : bool, optional
83
+ Enables user review of data cleaning instructions. Defaults to False.
84
+ bypass_recommended_steps : bool, optional
85
+ If True, skips the default recommended cleaning steps. Defaults to False.
86
+ bypass_explain_code : bool, optional
87
+ If True, skips the step that provides code explanations. Defaults to False.
88
+
89
+ Methods
90
+ -------
91
+ update_params(**kwargs)
92
+ Updates the agent's parameters and rebuilds the compiled state graph.
93
+ ainvoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
94
+ Cleans the provided dataset asynchronously based on user instructions.
95
+ invoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
96
+ Cleans the provided dataset synchronously based on user instructions.
97
+ get_workflow_summary()
98
+ Retrieves a summary of the agent's workflow.
99
+ get_log_summary()
100
+ Retrieves a summary of logged operations if logging is enabled.
101
+ get_state_keys()
102
+ Returns a list of keys from the state graph response.
103
+ get_state_properties()
104
+ Returns detailed properties of the state graph response.
105
+ get_data_cleaned()
106
+ Retrieves the cleaned dataset as a pandas DataFrame.
107
+ get_data_raw()
108
+ Retrieves the raw dataset as a pandas DataFrame.
109
+ get_data_cleaner_function()
110
+ Retrieves the generated Python function used for cleaning the data.
111
+ get_recommended_cleaning_steps()
112
+ Retrieves the agent's recommended cleaning steps.
113
+ get_response()
114
+ Returns the response from the agent as a dictionary.
115
+ show()
116
+ Displays the agent's mermaid diagram.
117
+
118
+ Examples
119
+ --------
120
+ ```python
121
+ import pandas as pd
122
+ from langchain_openai import ChatOpenAI
123
+ from ai_data_science_team.agents import DataCleaningAgent
124
+
125
+ llm = ChatOpenAI(model="gpt-4o-mini")
126
+
127
+ data_cleaning_agent = DataCleaningAgent(
128
+ model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
129
+ )
130
+
131
+ df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
132
+
133
+ data_cleaning_agent.invoke_agent(
134
+ user_instructions="Don't remove outliers when cleaning the data.",
135
+ data_raw=df,
136
+ max_retries=3,
137
+ retry_count=0
138
+ )
139
+
140
+ cleaned_data = data_cleaning_agent.get_data_cleaned()
141
+
142
+ response = data_cleaning_agent.response
143
+ ```
144
+
145
+ Returns
146
+ --------
147
+ DataCleaningAgent : langchain.graphs.CompiledStateGraph
148
+ A data cleaning agent implemented as a compiled state graph.
149
+ """
42
150
 
43
151
  def __init__(
44
152
  self,
@@ -47,6 +155,7 @@ class DataCleaningAgent(CompiledStateGraph):
47
155
  log=False,
48
156
  log_path=None,
49
157
  file_name="data_cleaner.py",
158
+ function_name="data_cleaner",
50
159
  overwrite=True,
51
160
  human_in_the_loop=False,
52
161
  bypass_recommended_steps=False,
@@ -58,6 +167,7 @@ class DataCleaningAgent(CompiledStateGraph):
58
167
  "log": log,
59
168
  "log_path": log_path,
60
169
  "file_name": file_name,
170
+ "function_name": function_name,
61
171
  "overwrite": overwrite,
62
172
  "human_in_the_loop": human_in_the_loop,
63
173
  "bypass_recommended_steps": bypass_recommended_steps,
@@ -67,102 +177,104 @@ class DataCleaningAgent(CompiledStateGraph):
67
177
  self.response = None
68
178
 
69
179
  def _make_compiled_graph(self):
70
- self.response = None
71
- return make_data_cleaning_agent(**self._params)
72
-
73
- def update_params(self, **kwargs):
74
180
  """
75
- Update one or more parameters at once, then rebuild the compiled graph.
76
- e.g. agent.update_params(model=new_llm, n_samples=100)
181
+ Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
77
182
  """
78
- self._params.update(kwargs)
79
- self._compiled_graph = self._make_compiled_graph()
183
+ self.response=None
184
+ return make_data_cleaning_agent(**self._params)
80
185
 
81
- def __getattr__(self, name: str):
82
- """
83
- Delegate attribute access to `_compiled_graph` if `name` is not
84
- found in this instance. This 'inherits' methods from the compiled graph.
85
- """
86
- return getattr(self._compiled_graph, name)
87
-
88
- def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
186
+ def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
89
187
  """
90
- Cleans the provided dataset based on user instructions.
188
+ Asynchronously invokes the agent. The response is stored in the response attribute.
91
189
 
92
190
  Parameters:
93
- user_instructions (str): Instructions for data cleaning.
94
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
95
- max_retries (int): Maximum retry attempts for cleaning.
96
- retry_count (int): Current retry attempt.
191
+ ----------
192
+ data_raw (pd.DataFrame):
193
+ The raw dataset to be cleaned.
194
+ user_instructions (str):
195
+ Instructions for data cleaning agent.
196
+ max_retries (int):
197
+ Maximum retry attempts for cleaning.
198
+ retry_count (int):
199
+ Current retry attempt.
200
+ **kwargs
201
+ Additional keyword arguments to pass to ainvoke().
97
202
 
98
203
  Returns:
204
+ --------
99
205
  None. The response is stored in the response attribute.
100
206
  """
101
- response = self.ainvoke({
207
+ response = self._compiled_graph.ainvoke({
102
208
  "user_instructions": user_instructions,
103
209
  "data_raw": data_raw.to_dict(),
104
210
  "max_retries": max_retries,
105
211
  "retry_count": retry_count,
106
- })
212
+ }, **kwargs)
107
213
  self.response = response
108
214
  return None
109
215
 
110
- def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
216
+ def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
111
217
  """
112
- Cleans the provided dataset based on user instructions.
218
+ Invokes the agent. The response is stored in the response attribute.
113
219
 
114
220
  Parameters:
115
- user_instructions (str): Instructions for data cleaning.
116
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
117
- max_retries (int): Maximum retry attempts for cleaning.
118
- retry_count (int): Current retry attempt.
221
+ ----------
222
+ data_raw (pd.DataFrame):
223
+ The raw dataset to be cleaned.
224
+ user_instructions (str):
225
+ Instructions for data cleaning agent.
226
+ max_retries (int):
227
+ Maximum retry attempts for cleaning.
228
+ retry_count (int):
229
+ Current retry attempt.
230
+ **kwargs
231
+ Additional keyword arguments to pass to invoke().
119
232
 
120
233
  Returns:
234
+ --------
121
235
  None. The response is stored in the response attribute.
122
236
  """
123
- response = self.invoke({
237
+ response = self._compiled_graph.invoke({
124
238
  "user_instructions": user_instructions,
125
239
  "data_raw": data_raw.to_dict(),
126
240
  "max_retries": max_retries,
127
241
  "retry_count": retry_count,
128
- })
242
+ },**kwargs)
129
243
  self.response = response
130
244
  return None
131
245
 
132
- def explain_cleaning_steps(self):
246
+ def get_workflow_summary(self, markdown=False):
133
247
  """
134
- Provides an explanation of the cleaning steps performed by the agent.
135
-
136
- Returns:
137
- str: Explanation of the cleaning steps.
248
+ Retrieves the agent's workflow summary, built from the last message of the stored response, if a response with messages is available.
138
249
  """
139
- messages = self.response.get("messages", [])
140
- return messages
141
-
142
- def get_log_summary(self):
250
+ if self.response and self.response.get("messages"):
251
+ summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
252
+ if markdown:
253
+ return Markdown(summary)
254
+ else:
255
+ return summary
256
+
257
+ def get_log_summary(self, markdown=False):
143
258
  """
144
259
  Returns a summary of the logged function path and function name, if the stored response contains a function path.
145
260
  """
146
261
  if self.response:
147
- if self.log:
148
- log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
149
- return log_details
150
-
151
- def get_state_keys(self):
152
- """
153
- Returns a list of keys that the state graph returns in a response.
154
- """
155
- return list(self.get_output_jsonschema()['properties'].keys())
156
-
157
- def get_state_properties(self):
158
- """
159
- Returns a list of keys that the state graph returns in a response.
160
- """
161
- return self.get_output_jsonschema()['properties']
262
+ if self.response.get('data_cleaner_function_path'):
263
+ log_details = f"""
264
+ ## Data Cleaning Agent Log Summary:
265
+
266
+ Function Path: {self.response.get('data_cleaner_function_path')}
267
+
268
+ Function Name: {self.response.get('data_cleaner_function_name')}
269
+ """
270
+ if markdown:
271
+ return Markdown(log_details)
272
+ else:
273
+ return log_details
162
274
 
163
275
  def get_data_cleaned(self):
164
276
  """
165
- Retrieves the cleaned data stored after running invoke or clean_data methods.
277
+ Retrieves the cleaned data stored after running the invoke_agent or ainvoke_agent methods.
166
278
  """
167
279
  if self.response:
168
280
  return pd.DataFrame(self.response.get("data_cleaned"))
@@ -174,15 +286,25 @@ class DataCleaningAgent(CompiledStateGraph):
174
286
  if self.response:
175
287
  return pd.DataFrame(self.response.get("data_raw"))
176
288
 
177
- def get_data_cleaner_function(self):
289
+ def get_data_cleaner_function(self, markdown=False):
178
290
  """
179
291
  Retrieves the agent's pipeline function.
180
292
  """
181
293
  if self.response:
182
- return self.response.get("data_cleaner_function")
183
-
184
-
185
-
294
+ if markdown:
295
+ return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
296
+ else:
297
+ return self.response.get("data_cleaner_function")
298
+
299
+ def get_recommended_cleaning_steps(self, markdown=False):
300
+ """
301
+ Retrieves the agent's recommended cleaning steps.
302
+ """
303
+ if self.response:
304
+ if markdown:
305
+ return Markdown(self.response.get('recommended_steps'))
306
+ else:
307
+ return self.response.get('recommended_steps')
186
308
 
187
309
 
188
310
 
@@ -194,6 +316,7 @@ def make_data_cleaning_agent(
194
316
  log=False,
195
317
  log_path=None,
196
318
  file_name="data_cleaner.py",
319
+ function_name="data_cleaner",
197
320
  overwrite = True,
198
321
  human_in_the_loop=False,
199
322
  bypass_recommended_steps=False,
@@ -235,6 +358,8 @@ def make_data_cleaning_agent(
235
358
  "logs/".
236
359
  file_name : str, optional
237
360
  The name of the file to save the response to. Defaults to "data_cleaner.py".
361
+ function_name : str, optional
362
+ The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
238
363
  overwrite : bool, optional
239
364
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
240
365
  Defaults to True.
@@ -275,6 +400,11 @@ def make_data_cleaning_agent(
275
400
  """
276
401
  llm = model
277
402
 
403
+ # Human in the loop requires recommended steps
404
+ if bypass_recommended_steps and human_in_the_loop:
405
+ bypass_recommended_steps = False
406
+ print("Bypass recommended steps set to False to enable human in the loop.")
407
+
278
408
  # Setup Log Directory
279
409
  if log:
280
410
  if log_path is None:
@@ -292,6 +422,7 @@ def make_data_cleaning_agent(
292
422
  all_datasets_summary: str
293
423
  data_cleaner_function: str
294
424
  data_cleaner_function_path: str
425
+ data_cleaner_file_name: str
295
426
  data_cleaner_function_name: str
296
427
  data_cleaner_error: str
297
428
  max_retries: int
@@ -342,7 +473,7 @@ def make_data_cleaning_agent(
342
473
  Below are summaries of all datasets provided:
343
474
  {all_datasets_summary}
344
475
 
345
- Return the steps as a bullet point list (no code, just the steps).
476
+ Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
346
477
 
347
478
  Avoid these:
348
479
  1. Do not include steps to save files.
@@ -366,7 +497,7 @@ def make_data_cleaning_agent(
366
497
  })
367
498
 
368
499
  return {
369
- "recommended_steps": "\n\n# Recommended Data Cleaning Steps:\n" + recommended_steps.content.strip(),
500
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
370
501
  "all_datasets_summary": all_datasets_summary_str
371
502
  }
372
503
 
@@ -386,42 +517,44 @@ def make_data_cleaning_agent(
386
517
  else:
387
518
  all_datasets_summary_str = state.get("all_datasets_summary")
388
519
 
520
+
389
521
  data_cleaning_prompt = PromptTemplate(
390
522
  template="""
391
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
392
-
523
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
524
+
393
525
  Recommended Steps:
394
526
  {recommended_steps}
395
-
527
+
396
528
  You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
397
-
529
+
398
530
  Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:
399
531
 
400
532
  {all_datasets_summary}
401
-
402
- Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
403
-
533
+
534
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
535
+
404
536
  Return code to provide the data cleaning function:
405
-
406
- def data_cleaner(data_raw):
537
+
538
+ def {function_name}(data_raw):
407
539
  import pandas as pd
408
540
  import numpy as np
409
541
  ...
410
542
  return data_cleaned
411
-
543
+
412
544
  Best Practices and Error Preventions:
413
-
545
+
414
546
  Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
415
547
 
416
548
  """,
417
- input_variables=["recommended_steps", "all_datasets_summary"]
549
+ input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
418
550
  )
419
551
 
420
552
  data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
421
553
 
422
554
  response = data_cleaning_agent.invoke({
423
555
  "recommended_steps": state.get("recommended_steps"),
424
- "all_datasets_summary": all_datasets_summary_str
556
+ "all_datasets_summary": all_datasets_summary_str,
557
+ "function_name": function_name
425
558
  })
426
559
 
427
560
  response = relocate_imports_inside_function(response)
@@ -439,19 +572,37 @@ def make_data_cleaning_agent(
439
572
  return {
440
573
  "data_cleaner_function" : response,
441
574
  "data_cleaner_function_path": file_path,
442
- "data_cleaner_function_name": file_name_2,
575
+ "data_cleaner_file_name": file_name_2,
576
+ "data_cleaner_function_name": function_name,
443
577
  "all_datasets_summary": all_datasets_summary_str
444
578
  }
579
+
580
+ # Human Review
581
+
582
+ prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
445
583
 
446
- def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
447
- return node_func_human_review(
448
- state=state,
449
- prompt_text="Is the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
450
- yes_goto="create_data_cleaner_code",
451
- no_goto="recommend_cleaning_steps",
452
- user_instructions_key="user_instructions",
453
- recommended_steps_key="recommended_steps"
454
- )
584
+ if not bypass_explain_code:
585
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
586
+ return node_func_human_review(
587
+ state=state,
588
+ prompt_text=prompt_text_human_review,
589
+ yes_goto= 'explain_data_cleaner_code',
590
+ no_goto="recommend_cleaning_steps",
591
+ user_instructions_key="user_instructions",
592
+ recommended_steps_key="recommended_steps",
593
+ code_snippet_key="data_cleaner_function",
594
+ )
595
+ else:
596
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
597
+ return node_func_human_review(
598
+ state=state,
599
+ prompt_text=prompt_text_human_review,
600
+ yes_goto= '__end__',
601
+ no_goto="recommend_cleaning_steps",
602
+ user_instructions_key="user_instructions",
603
+ recommended_steps_key="recommended_steps",
604
+ code_snippet_key="data_cleaner_function",
605
+ )
455
606
 
456
607
  def execute_data_cleaner_code(state):
457
608
  return node_func_execute_agent_code_on_data(
@@ -460,7 +611,7 @@ def make_data_cleaning_agent(
460
611
  result_key="data_cleaned",
461
612
  error_key="data_cleaner_error",
462
613
  code_snippet_key="data_cleaner_function",
463
- agent_function_name="data_cleaner",
614
+ agent_function_name=state.get("data_cleaner_function_name"),
464
615
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
465
616
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
466
617
  error_message_prefix="An error occurred during data cleaning: "
@@ -468,11 +619,11 @@ def make_data_cleaning_agent(
468
619
 
469
620
  def fix_data_cleaner_code(state: GraphState):
470
621
  data_cleaner_prompt = """
471
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.
622
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
472
623
 
473
- Make sure to only return the function definition for data_cleaner().
624
+ Make sure to only return the function definition for {function_name}().
474
625
 
475
- Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
626
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
476
627
 
477
628
  This is the broken code (please fix):
478
629
  {code_snippet}
@@ -490,34 +641,34 @@ def make_data_cleaning_agent(
490
641
  agent_name=AGENT_NAME,
491
642
  log=log,
492
643
  file_path=state.get("data_cleaner_function_path"),
644
+ function_name=state.get("data_cleaner_function_name"),
493
645
  )
494
646
 
495
- def explain_data_cleaner_code(state: GraphState):
496
- return node_func_explain_agent_code(
647
+ # Final reporting node
648
+ def report_agent_outputs(state: GraphState):
649
+ return node_func_report_agent_outputs(
497
650
  state=state,
498
- code_snippet_key="data_cleaner_function",
651
+ keys_to_include=[
652
+ "recommended_steps",
653
+ "data_cleaner_function",
654
+ "data_cleaner_function_path",
655
+ "data_cleaner_function_name",
656
+ "data_cleaner_error",
657
+ ],
499
658
  result_key="messages",
500
- error_key="data_cleaner_error",
501
- llm=llm,
502
659
  role=AGENT_NAME,
503
- explanation_prompt_template="""
504
- Explain the data cleaning steps that the data cleaning agent performed in this function.
505
- Keep the summary succinct and to the point.\n\n# Data Cleaning Agent:\n\n{code}
506
- """,
507
- success_prefix="# Data Cleaning Agent:\n\n ",
508
- error_message="The Data Cleaning Agent encountered an error during data cleaning. Data could not be explained."
660
+ custom_title="Data Cleaning Agent Outputs"
509
661
  )
510
-
511
- # Define the graph
662
+
512
663
  node_functions = {
513
664
  "recommend_cleaning_steps": recommend_cleaning_steps,
514
665
  "human_review": human_review,
515
666
  "create_data_cleaner_code": create_data_cleaner_code,
516
667
  "execute_data_cleaner_code": execute_data_cleaner_code,
517
668
  "fix_data_cleaner_code": fix_data_cleaner_code,
518
- "explain_data_cleaner_code": explain_data_cleaner_code
669
+ "report_agent_outputs": report_agent_outputs,
519
670
  }
520
-
671
+
521
672
  app = create_coding_agent_graph(
522
673
  GraphState=GraphState,
523
674
  node_functions=node_functions,
@@ -525,16 +676,17 @@ def make_data_cleaning_agent(
525
676
  create_code_node_name="create_data_cleaner_code",
526
677
  execute_code_node_name="execute_data_cleaner_code",
527
678
  fix_code_node_name="fix_data_cleaner_code",
528
- explain_code_node_name="explain_data_cleaner_code",
679
+ explain_code_node_name="report_agent_outputs",
529
680
  error_key="data_cleaner_error",
530
- human_in_the_loop=human_in_the_loop, # or False
681
+ human_in_the_loop=human_in_the_loop,
531
682
  human_review_node_name="human_review",
532
683
  checkpointer=MemorySaver() if human_in_the_loop else None,
533
684
  bypass_recommended_steps=bypass_recommended_steps,
534
685
  bypass_explain_code=bypass_explain_code,
535
686
  )
536
-
687
+
537
688
  return app
689
+
538
690
 
539
691
 
540
692