ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +4 -5
  3. ai_data_science_team/agents/data_cleaning_agent.py +268 -116
  4. ai_data_science_team/agents/data_visualization_agent.py +470 -41
  5. ai_data_science_team/agents/data_wrangling_agent.py +471 -31
  6. ai_data_science_team/agents/feature_engineering_agent.py +426 -41
  7. ai_data_science_team/agents/sql_database_agent.py +458 -58
  8. ai_data_science_team/ml_agents/__init__.py +1 -0
  9. ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
  10. ai_data_science_team/multiagents/__init__.py +1 -0
  11. ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
  12. ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
  13. ai_data_science_team/templates/__init__.py +3 -1
  14. ai_data_science_team/templates/agent_templates.py +319 -43
  15. ai_data_science_team/tools/metadata.py +94 -62
  16. ai_data_science_team/tools/regex.py +86 -1
  17. ai_data_science_team/utils/__init__.py +0 -0
  18. ai_data_science_team/utils/plotly.py +24 -0
  19. ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
  20. ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
  21. ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
  22. ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
  23. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
  24. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
  25. {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- __version__ = "0.0.0.9007"
1
+ __version__ = "0.0.0.9009"
@@ -1,6 +1,5 @@
1
1
  from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
2
- from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
3
- from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
4
- from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
5
- from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
6
-
2
+ from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
3
+ from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
4
+ from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
5
+ from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
@@ -13,21 +13,28 @@ from langchain_core.messages import BaseMessage
13
13
  from langgraph.types import Command
14
14
  from langgraph.checkpoint.memory import MemorySaver
15
15
 
16
- from langgraph.graph.state import CompiledStateGraph
17
-
18
16
  import os
19
- import io
17
+ import json
20
18
  import pandas as pd
21
19
 
20
+ from IPython.display import Markdown
21
+
22
22
  from ai_data_science_team.templates import(
23
23
  node_func_execute_agent_code_on_data,
24
24
  node_func_human_review,
25
25
  node_func_fix_agent_code,
26
- node_func_explain_agent_code,
27
- create_coding_agent_graph
26
+ node_func_report_agent_outputs,
27
+ create_coding_agent_graph,
28
+ BaseAgent,
28
29
  )
29
30
  from ai_data_science_team.tools.parsers import PythonOutputParser
30
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
31
+ from ai_data_science_team.tools.regex import (
32
+ relocate_imports_inside_function,
33
+ add_comments_to_top,
34
+ format_agent_name,
35
+ format_recommended_steps,
36
+ get_generic_summary,
37
+ )
31
38
  from ai_data_science_team.tools.metadata import get_dataframe_summary
32
39
  from ai_data_science_team.tools.logging import log_ai_function
33
40
 
@@ -36,9 +43,110 @@ AGENT_NAME = "data_cleaning_agent"
36
43
  LOG_PATH = os.path.join(os.getcwd(), "logs/")
37
44
 
38
45
 
39
-
40
46
  # Class
41
- class DataCleaningAgent(CompiledStateGraph):
47
+ class DataCleaningAgent(BaseAgent):
48
+ """
49
+ Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
50
+ The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
51
+ and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
52
+
53
+ The agent performs the following default cleaning steps unless instructed otherwise:
54
+
55
+ - Removing columns with more than 40% missing values.
56
+ - Imputing missing values with the mean for numeric columns.
57
+ - Imputing missing values with the mode for categorical columns.
58
+ - Converting columns to appropriate data types.
59
+ - Removing duplicate rows.
60
+ - Removing rows with missing values.
61
+ - Removing rows with extreme outliers (values 3x the interquartile range).
62
+
63
+ User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
64
+
65
+ Parameters
66
+ ----------
67
+ model : langchain.llms.base.LLM
68
+ The language model used to generate the data cleaning function.
69
+ n_samples : int, optional
70
+ Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
71
+ avoid exceeding the model's token limits.
72
+ log : bool, optional
73
+ Whether to log the generated code and errors. Defaults to False.
74
+ log_path : str, optional
75
+ Directory path for storing log files. Defaults to None.
76
+ file_name : str, optional
77
+ Name of the file for saving the generated response. Defaults to "data_cleaner.py".
78
+ function_name : str, optional
79
+ Name of the generated data cleaning function. Defaults to "data_cleaner".
80
+ overwrite : bool, optional
81
+ Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
82
+ human_in_the_loop : bool, optional
83
+ Enables user review of data cleaning instructions. Defaults to False.
84
+ bypass_recommended_steps : bool, optional
85
+ If True, skips the default recommended cleaning steps. Defaults to False.
86
+ bypass_explain_code : bool, optional
87
+ If True, skips the step that provides code explanations. Defaults to False.
88
+
89
+ Methods
90
+ -------
91
+ update_params(**kwargs)
92
+ Updates the agent's parameters and rebuilds the compiled state graph.
93
+ ainvoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
94
+ Cleans the provided dataset asynchronously based on user instructions.
95
+ invoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
96
+ Cleans the provided dataset synchronously based on user instructions.
97
+ get_workflow_summary()
98
+ Retrieves a summary of the agent's workflow.
99
+ get_log_summary()
100
+ Retrieves a summary of logged operations if logging is enabled.
101
+ get_state_keys()
102
+ Returns a list of keys from the state graph response.
103
+ get_state_properties()
104
+ Returns detailed properties of the state graph response.
105
+ get_data_cleaned()
106
+ Retrieves the cleaned dataset as a pandas DataFrame.
107
+ get_data_raw()
108
+ Retrieves the raw dataset as a pandas DataFrame.
109
+ get_data_cleaner_function()
110
+ Retrieves the generated Python function used for cleaning the data.
111
+ get_recommended_cleaning_steps()
112
+ Retrieves the agent's recommended cleaning steps.
113
+ get_response()
114
+ Returns the response from the agent as a dictionary.
115
+ show()
116
+ Displays the agent's mermaid diagram.
117
+
118
+ Examples
119
+ --------
120
+ ```python
121
+ import pandas as pd
122
+ from langchain_openai import ChatOpenAI
123
+ from ai_data_science_team.agents import DataCleaningAgent
124
+
125
+ llm = ChatOpenAI(model="gpt-4o-mini")
126
+
127
+ data_cleaning_agent = DataCleaningAgent(
128
+ model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
129
+ )
130
+
131
+ df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
132
+
133
+ data_cleaning_agent.invoke_agent(
134
+ user_instructions="Don't remove outliers when cleaning the data.",
135
+ data_raw=df,
136
+ max_retries=3,
137
+ retry_count=0
138
+ )
139
+
140
+ cleaned_data = data_cleaning_agent.get_data_cleaned()
141
+
142
+ response = data_cleaning_agent.response
143
+ ```
144
+
145
+ Returns
146
+ --------
147
+ DataCleaningAgent : langchain.graphs.CompiledStateGraph
148
+ A data cleaning agent implemented as a compiled state graph.
149
+ """
42
150
 
43
151
  def __init__(
44
152
  self,
@@ -47,6 +155,7 @@ class DataCleaningAgent(CompiledStateGraph):
47
155
  log=False,
48
156
  log_path=None,
49
157
  file_name="data_cleaner.py",
158
+ function_name="data_cleaner",
50
159
  overwrite=True,
51
160
  human_in_the_loop=False,
52
161
  bypass_recommended_steps=False,
@@ -58,6 +167,7 @@ class DataCleaningAgent(CompiledStateGraph):
58
167
  "log": log,
59
168
  "log_path": log_path,
60
169
  "file_name": file_name,
170
+ "function_name": function_name,
61
171
  "overwrite": overwrite,
62
172
  "human_in_the_loop": human_in_the_loop,
63
173
  "bypass_recommended_steps": bypass_recommended_steps,
@@ -67,102 +177,104 @@ class DataCleaningAgent(CompiledStateGraph):
67
177
  self.response = None
68
178
 
69
179
  def _make_compiled_graph(self):
70
- self.response = None
71
- return make_data_cleaning_agent(**self._params)
72
-
73
- def update_params(self, **kwargs):
74
180
  """
75
- Update one or more parameters at once, then rebuild the compiled graph.
76
- e.g. agent.update_params(model=new_llm, n_samples=100)
181
+ Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
77
182
  """
78
- self._params.update(kwargs)
79
- self._compiled_graph = self._make_compiled_graph()
183
+ self.response=None
184
+ return make_data_cleaning_agent(**self._params)
80
185
 
81
- def __getattr__(self, name: str):
82
- """
83
- Delegate attribute access to `_compiled_graph` if `name` is not
84
- found in this instance. This 'inherits' methods from the compiled graph.
85
- """
86
- return getattr(self._compiled_graph, name)
87
-
88
- def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
186
+ def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
89
187
  """
90
- Cleans the provided dataset based on user instructions.
188
+ Asynchronously invokes the agent. The response is stored in the response attribute.
91
189
 
92
190
  Parameters:
93
- user_instructions (str): Instructions for data cleaning.
94
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
95
- max_retries (int): Maximum retry attempts for cleaning.
96
- retry_count (int): Current retry attempt.
191
+ ----------
192
+ data_raw (pd.DataFrame):
193
+ The raw dataset to be cleaned.
194
+ user_instructions (str):
195
+ Instructions for data cleaning agent.
196
+ max_retries (int):
197
+ Maximum retry attempts for cleaning.
198
+ retry_count (int):
199
+ Current retry attempt.
200
+ **kwargs
201
+ Additional keyword arguments to pass to ainvoke().
97
202
 
98
203
  Returns:
204
+ --------
99
205
  None. The response is stored in the response attribute.
100
206
  """
101
- response = self.ainvoke({
207
+ response = self._compiled_graph.ainvoke({
102
208
  "user_instructions": user_instructions,
103
209
  "data_raw": data_raw.to_dict(),
104
210
  "max_retries": max_retries,
105
211
  "retry_count": retry_count,
106
- })
212
+ }, **kwargs)
107
213
  self.response = response
108
214
  return None
109
215
 
110
- def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
216
+ def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
111
217
  """
112
- Cleans the provided dataset based on user instructions.
218
+ Invokes the agent. The response is stored in the response attribute.
113
219
 
114
220
  Parameters:
115
- user_instructions (str): Instructions for data cleaning.
116
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
117
- max_retries (int): Maximum retry attempts for cleaning.
118
- retry_count (int): Current retry attempt.
221
+ ----------
222
+ data_raw (pd.DataFrame):
223
+ The raw dataset to be cleaned.
224
+ user_instructions (str):
225
+ Instructions for data cleaning agent.
226
+ max_retries (int):
227
+ Maximum retry attempts for cleaning.
228
+ retry_count (int):
229
+ Current retry attempt.
230
+ **kwargs
231
+ Additional keyword arguments to pass to invoke().
119
232
 
120
233
  Returns:
234
+ --------
121
235
  None. The response is stored in the response attribute.
122
236
  """
123
- response = self.invoke({
237
+ response = self._compiled_graph.invoke({
124
238
  "user_instructions": user_instructions,
125
239
  "data_raw": data_raw.to_dict(),
126
240
  "max_retries": max_retries,
127
241
  "retry_count": retry_count,
128
- })
242
+ },**kwargs)
129
243
  self.response = response
130
244
  return None
131
245
 
132
- def explain_cleaning_steps(self):
246
+ def get_workflow_summary(self, markdown=False):
133
247
  """
134
- Provides an explanation of the cleaning steps performed by the agent.
135
-
136
- Returns:
137
- str: Explanation of the cleaning steps.
248
+ Retrieves the agent's workflow summary, parsed from the last message of the response, if a response with messages is available.
138
249
  """
139
- messages = self.response.get("messages", [])
140
- return messages
141
-
142
- def get_log_summary(self):
250
+ if self.response and self.response.get("messages"):
251
+ summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
252
+ if markdown:
253
+ return Markdown(summary)
254
+ else:
255
+ return summary
256
+
257
+ def get_log_summary(self, markdown=False):
143
258
  """
144
259
  Logs a summary of the agent's operations, if logging is enabled.
145
260
  """
146
261
  if self.response:
147
- if self.log:
148
- log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
149
- return log_details
150
-
151
- def get_state_keys(self):
152
- """
153
- Returns a list of keys that the state graph returns in a response.
154
- """
155
- return list(self.get_output_jsonschema()['properties'].keys())
156
-
157
- def get_state_properties(self):
158
- """
159
- Returns a list of keys that the state graph returns in a response.
160
- """
161
- return self.get_output_jsonschema()['properties']
262
+ if self.response.get('data_cleaner_function_path'):
263
+ log_details = f"""
264
+ ## Data Cleaning Agent Log Summary:
265
+
266
+ Function Path: {self.response.get('data_cleaner_function_path')}
267
+
268
+ Function Name: {self.response.get('data_cleaner_function_name')}
269
+ """
270
+ if markdown:
271
+ return Markdown(log_details)
272
+ else:
273
+ return log_details
162
274
 
163
275
  def get_data_cleaned(self):
164
276
  """
165
- Retrieves the cleaned data stored after running invoke or clean_data methods.
277
+ Retrieves the cleaned data stored after running invoke_agent or clean_data methods.
166
278
  """
167
279
  if self.response:
168
280
  return pd.DataFrame(self.response.get("data_cleaned"))
@@ -174,15 +286,25 @@ class DataCleaningAgent(CompiledStateGraph):
174
286
  if self.response:
175
287
  return pd.DataFrame(self.response.get("data_raw"))
176
288
 
177
- def get_data_cleaner_function(self):
289
+ def get_data_cleaner_function(self, markdown=False):
178
290
  """
179
291
  Retrieves the agent's pipeline function.
180
292
  """
181
293
  if self.response:
182
- return self.response.get("data_cleaner_function")
183
-
184
-
185
-
294
+ if markdown:
295
+ return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
296
+ else:
297
+ return self.response.get("data_cleaner_function")
298
+
299
+ def get_recommended_cleaning_steps(self, markdown=False):
300
+ """
301
+ Retrieves the agent's recommended cleaning steps
302
+ """
303
+ if self.response:
304
+ if markdown:
305
+ return Markdown(self.response.get('recommended_steps'))
306
+ else:
307
+ return self.response.get('recommended_steps')
186
308
 
187
309
 
188
310
 
@@ -194,6 +316,7 @@ def make_data_cleaning_agent(
194
316
  log=False,
195
317
  log_path=None,
196
318
  file_name="data_cleaner.py",
319
+ function_name="data_cleaner",
197
320
  overwrite = True,
198
321
  human_in_the_loop=False,
199
322
  bypass_recommended_steps=False,
@@ -235,6 +358,8 @@ def make_data_cleaning_agent(
235
358
  "logs/".
236
359
  file_name : str, optional
237
360
  The name of the file to save the response to. Defaults to "data_cleaner.py".
361
+ function_name : str, optional
362
+ The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
238
363
  overwrite : bool, optional
239
364
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
240
365
  Defaults to True.
@@ -275,6 +400,11 @@ def make_data_cleaning_agent(
275
400
  """
276
401
  llm = model
277
402
 
403
+ # Human in the loop requires recommended steps
404
+ if bypass_recommended_steps and human_in_the_loop:
405
+ bypass_recommended_steps = False
406
+ print("Bypass recommended steps set to False to enable human in the loop.")
407
+
278
408
  # Setup Log Directory
279
409
  if log:
280
410
  if log_path is None:
@@ -292,6 +422,7 @@ def make_data_cleaning_agent(
292
422
  all_datasets_summary: str
293
423
  data_cleaner_function: str
294
424
  data_cleaner_function_path: str
425
+ data_cleaner_file_name: str
295
426
  data_cleaner_function_name: str
296
427
  data_cleaner_error: str
297
428
  max_retries: int
@@ -342,7 +473,7 @@ def make_data_cleaning_agent(
342
473
  Below are summaries of all datasets provided:
343
474
  {all_datasets_summary}
344
475
 
345
- Return the steps as a bullet point list (no code, just the steps).
476
+ Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
346
477
 
347
478
  Avoid these:
348
479
  1. Do not include steps to save files.
@@ -366,7 +497,7 @@ def make_data_cleaning_agent(
366
497
  })
367
498
 
368
499
  return {
369
- "recommended_steps": "\n\n# Recommended Data Cleaning Steps:\n" + recommended_steps.content.strip(),
500
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
370
501
  "all_datasets_summary": all_datasets_summary_str
371
502
  }
372
503
 
@@ -386,42 +517,44 @@ def make_data_cleaning_agent(
386
517
  else:
387
518
  all_datasets_summary_str = state.get("all_datasets_summary")
388
519
 
520
+
389
521
  data_cleaning_prompt = PromptTemplate(
390
522
  template="""
391
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
392
-
523
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
524
+
393
525
  Recommended Steps:
394
526
  {recommended_steps}
395
-
527
+
396
528
  You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
397
-
529
+
398
530
  Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:
399
531
 
400
532
  {all_datasets_summary}
401
-
402
- Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
403
-
533
+
534
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
535
+
404
536
  Return code to provide the data cleaning function:
405
-
406
- def data_cleaner(data_raw):
537
+
538
+ def {function_name}(data_raw):
407
539
  import pandas as pd
408
540
  import numpy as np
409
541
  ...
410
542
  return data_cleaned
411
-
543
+
412
544
  Best Practices and Error Preventions:
413
-
545
+
414
546
  Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
415
547
 
416
548
  """,
417
- input_variables=["recommended_steps", "all_datasets_summary"]
549
+ input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
418
550
  )
419
551
 
420
552
  data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
421
553
 
422
554
  response = data_cleaning_agent.invoke({
423
555
  "recommended_steps": state.get("recommended_steps"),
424
- "all_datasets_summary": all_datasets_summary_str
556
+ "all_datasets_summary": all_datasets_summary_str,
557
+ "function_name": function_name
425
558
  })
426
559
 
427
560
  response = relocate_imports_inside_function(response)
@@ -439,19 +572,37 @@ def make_data_cleaning_agent(
439
572
  return {
440
573
  "data_cleaner_function" : response,
441
574
  "data_cleaner_function_path": file_path,
442
- "data_cleaner_function_name": file_name_2,
575
+ "data_cleaner_file_name": file_name_2,
576
+ "data_cleaner_function_name": function_name,
443
577
  "all_datasets_summary": all_datasets_summary_str
444
578
  }
579
+
580
+ # Human Review
581
+
582
+ prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
445
583
 
446
- def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
447
- return node_func_human_review(
448
- state=state,
449
- prompt_text="Is the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
450
- yes_goto="create_data_cleaner_code",
451
- no_goto="recommend_cleaning_steps",
452
- user_instructions_key="user_instructions",
453
- recommended_steps_key="recommended_steps"
454
- )
584
+ if not bypass_explain_code:
585
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
586
+ return node_func_human_review(
587
+ state=state,
588
+ prompt_text=prompt_text_human_review,
589
+ yes_goto= 'explain_data_cleaner_code',
590
+ no_goto="recommend_cleaning_steps",
591
+ user_instructions_key="user_instructions",
592
+ recommended_steps_key="recommended_steps",
593
+ code_snippet_key="data_cleaner_function",
594
+ )
595
+ else:
596
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
597
+ return node_func_human_review(
598
+ state=state,
599
+ prompt_text=prompt_text_human_review,
600
+ yes_goto= '__end__',
601
+ no_goto="recommend_cleaning_steps",
602
+ user_instructions_key="user_instructions",
603
+ recommended_steps_key="recommended_steps",
604
+ code_snippet_key="data_cleaner_function",
605
+ )
455
606
 
456
607
  def execute_data_cleaner_code(state):
457
608
  return node_func_execute_agent_code_on_data(
@@ -460,7 +611,7 @@ def make_data_cleaning_agent(
460
611
  result_key="data_cleaned",
461
612
  error_key="data_cleaner_error",
462
613
  code_snippet_key="data_cleaner_function",
463
- agent_function_name="data_cleaner",
614
+ agent_function_name=state.get("data_cleaner_function_name"),
464
615
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
465
616
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
466
617
  error_message_prefix="An error occurred during data cleaning: "
@@ -468,11 +619,11 @@ def make_data_cleaning_agent(
468
619
 
469
620
  def fix_data_cleaner_code(state: GraphState):
470
621
  data_cleaner_prompt = """
471
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.
622
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
472
623
 
473
- Make sure to only return the function definition for data_cleaner().
624
+ Make sure to only return the function definition for {function_name}().
474
625
 
475
- Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
626
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
476
627
 
477
628
  This is the broken code (please fix):
478
629
  {code_snippet}
@@ -490,34 +641,34 @@ def make_data_cleaning_agent(
490
641
  agent_name=AGENT_NAME,
491
642
  log=log,
492
643
  file_path=state.get("data_cleaner_function_path"),
644
+ function_name=state.get("data_cleaner_function_name"),
493
645
  )
494
646
 
495
- def explain_data_cleaner_code(state: GraphState):
496
- return node_func_explain_agent_code(
647
+ # Final reporting node
648
+ def report_agent_outputs(state: GraphState):
649
+ return node_func_report_agent_outputs(
497
650
  state=state,
498
- code_snippet_key="data_cleaner_function",
651
+ keys_to_include=[
652
+ "recommended_steps",
653
+ "data_cleaner_function",
654
+ "data_cleaner_function_path",
655
+ "data_cleaner_function_name",
656
+ "data_cleaner_error",
657
+ ],
499
658
  result_key="messages",
500
- error_key="data_cleaner_error",
501
- llm=llm,
502
659
  role=AGENT_NAME,
503
- explanation_prompt_template="""
504
- Explain the data cleaning steps that the data cleaning agent performed in this function.
505
- Keep the summary succinct and to the point.\n\n# Data Cleaning Agent:\n\n{code}
506
- """,
507
- success_prefix="# Data Cleaning Agent:\n\n ",
508
- error_message="The Data Cleaning Agent encountered an error during data cleaning. Data could not be explained."
660
+ custom_title="Data Cleaning Agent Outputs"
509
661
  )
510
-
511
- # Define the graph
662
+
512
663
  node_functions = {
513
664
  "recommend_cleaning_steps": recommend_cleaning_steps,
514
665
  "human_review": human_review,
515
666
  "create_data_cleaner_code": create_data_cleaner_code,
516
667
  "execute_data_cleaner_code": execute_data_cleaner_code,
517
668
  "fix_data_cleaner_code": fix_data_cleaner_code,
518
- "explain_data_cleaner_code": explain_data_cleaner_code
669
+ "report_agent_outputs": report_agent_outputs,
519
670
  }
520
-
671
+
521
672
  app = create_coding_agent_graph(
522
673
  GraphState=GraphState,
523
674
  node_functions=node_functions,
@@ -525,16 +676,17 @@ def make_data_cleaning_agent(
525
676
  create_code_node_name="create_data_cleaner_code",
526
677
  execute_code_node_name="execute_data_cleaner_code",
527
678
  fix_code_node_name="fix_data_cleaner_code",
528
- explain_code_node_name="explain_data_cleaner_code",
679
+ explain_code_node_name="report_agent_outputs",
529
680
  error_key="data_cleaner_error",
530
- human_in_the_loop=human_in_the_loop, # or False
681
+ human_in_the_loop=human_in_the_loop,
531
682
  human_review_node_name="human_review",
532
683
  checkpointer=MemorySaver() if human_in_the_loop else None,
533
684
  bypass_recommended_steps=bypass_recommended_steps,
534
685
  bypass_explain_code=bypass_explain_code,
535
686
  )
536
-
687
+
537
688
  return app
689
+
538
690
 
539
691
 
540
692