ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

@@ -1 +1 @@
- __version__ = "0.0.0.9007"
+ __version__ = "0.0.0.9008"
@@ -1,6 +1,6 @@
  from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
- from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
- from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
- from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
- from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
+ from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
+ from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
+ from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
+ from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
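Each agent module now exports a class-based interface alongside the original make_* factory functions. A minimal hedged sketch of importing the new classes through the package's agents module (the model choice mirrors the class docstring example further down):

```python
# Hedged usage sketch based on the updated __init__.py exports above.
from langchain_openai import ChatOpenAI

from ai_data_science_team.agents import (
    DataCleaningAgent,
    FeatureEngineeringAgent,
    DataWranglingAgent,
    SQLDatabaseAgent,
    DataVisualizationAgent,
)

llm = ChatOpenAI(model="gpt-4o-mini")

# Each class wraps the corresponding make_*_agent() compiled state graph.
data_cleaning_agent = DataCleaningAgent(model=llm)
```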
 
@@ -13,21 +13,22 @@ from langchain_core.messages import BaseMessage
  from langgraph.types import Command
  from langgraph.checkpoint.memory import MemorySaver
 
- from langgraph.graph.state import CompiledStateGraph
-
  import os
  import io
  import pandas as pd
 
+ from IPython.display import Markdown
+
  from ai_data_science_team.templates import(
  node_func_execute_agent_code_on_data,
  node_func_human_review,
  node_func_fix_agent_code,
  node_func_explain_agent_code,
- create_coding_agent_graph
+ create_coding_agent_graph,
+ BaseAgent,
  )
  from ai_data_science_team.tools.parsers import PythonOutputParser
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
+ from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name, format_recommended_steps
  from ai_data_science_team.tools.metadata import get_dataframe_summary
  from ai_data_science_team.tools.logging import log_ai_function
 
@@ -38,7 +39,109 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
 
 
  # Class
- class DataCleaningAgent(CompiledStateGraph):
+ class DataCleaningAgent(BaseAgent):
+ """
+ Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
+ The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
+ and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
+
+ The agent performs the following default cleaning steps unless instructed otherwise:
+
+ - Removing columns with more than 40% missing values.
+ - Imputing missing values with the mean for numeric columns.
+ - Imputing missing values with the mode for categorical columns.
+ - Converting columns to appropriate data types.
+ - Removing duplicate rows.
+ - Removing rows with missing values.
+ - Removing rows with extreme outliers (values 3x the interquartile range).
+
+ User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
+
+ Parameters
+ ----------
+ model : langchain.llms.base.LLM
+ The language model used to generate the data cleaning function.
+ n_samples : int, optional
+ Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
+ avoid exceeding the model's token limits.
+ log : bool, optional
+ Whether to log the generated code and errors. Defaults to False.
+ log_path : str, optional
+ Directory path for storing log files. Defaults to None.
+ file_name : str, optional
+ Name of the file for saving the generated response. Defaults to "data_cleaner.py".
+ function_name : str, optional
+ Name of the generated data cleaning function. Defaults to "data_cleaner".
+ overwrite : bool, optional
+ Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
+ human_in_the_loop : bool, optional
+ Enables user review of data cleaning instructions. Defaults to False.
+ bypass_recommended_steps : bool, optional
+ If True, skips the default recommended cleaning steps. Defaults to False.
+ bypass_explain_code : bool, optional
+ If True, skips the step that provides code explanations. Defaults to False.
+
+ Methods
+ -------
+ update_params(**kwargs)
+ Updates the agent's parameters and rebuilds the compiled state graph.
+ ainvoke_agent(user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0)
+ Cleans the provided dataset asynchronously based on user instructions.
+ invoke_agent(user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0)
+ Cleans the provided dataset synchronously based on user instructions.
+ explain_cleaning_steps()
+ Returns an explanation of the cleaning steps performed by the agent.
+ get_log_summary()
+ Retrieves a summary of logged operations if logging is enabled.
+ get_state_keys()
+ Returns a list of keys from the state graph response.
+ get_state_properties()
+ Returns detailed properties of the state graph response.
+ get_data_cleaned()
+ Retrieves the cleaned dataset as a pandas DataFrame.
+ get_data_raw()
+ Retrieves the raw dataset as a pandas DataFrame.
+ get_data_cleaner_function()
+ Retrieves the generated Python function used for cleaning the data.
+ get_recommended_cleaning_steps()
+ Retrieves the agent's recommended cleaning steps.
+ get_response()
+ Returns the response from the agent as a dictionary.
+ show()
+ Displays the agent's mermaid diagram.
+
+ Examples
+ --------
+ ```python
+ import pandas as pd
+ from langchain_openai import ChatOpenAI
+ from ai_data_science_team.agents import DataCleaningAgent
+
+ llm = ChatOpenAI(model="gpt-4o-mini")
+
+ data_cleaning_agent = DataCleaningAgent(
+ model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
+ )
+
+ df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
+
+ data_cleaning_agent.invoke_agent(
+ user_instructions="Don't remove outliers when cleaning the data.",
+ data_raw=df,
+ max_retries=3,
+ retry_count=0
+ )
+
+ cleaned_data = data_cleaning_agent.get_data_cleaned()
+
+ response = data_cleaning_agent.response
+ ```
+
+ Returns
+ --------
+ DataCleaningAgent : langchain.graphs.CompiledStateGraph
+ A data cleaning agent implemented as a compiled state graph.
+ """
 
  def __init__(
  self,
@@ -47,6 +150,7 @@ class DataCleaningAgent(CompiledStateGraph):
  log=False,
  log_path=None,
  file_name="data_cleaner.py",
+ function_name="data_cleaner",
  overwrite=True,
  human_in_the_loop=False,
  bypass_recommended_steps=False,
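The new function_name parameter flows from __init__ into _params and, further down, into the generated code, the prompts, and the log file, as the next hunk shows. A hedged sketch of constructing the agent with a custom name (the churn-specific names are illustrative only):

```python
# Hypothetical illustration of the new function_name parameter.
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import DataCleaningAgent

agent = DataCleaningAgent(
    model=ChatOpenAI(model="gpt-4o-mini"),
    log=True,
    log_path="logs",
    file_name="clean_churn_data.py",   # file the generated code is logged to
    function_name="clean_churn_data",  # name given to the generated cleaning function
)
```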
@@ -58,6 +162,7 @@ class DataCleaningAgent(CompiledStateGraph):
  "log": log,
  "log_path": log_path,
  "file_name": file_name,
+ "function_name": function_name,
  "overwrite": overwrite,
  "human_in_the_loop": human_in_the_loop,
  "bypass_recommended_steps": bypass_recommended_steps,
@@ -67,65 +172,70 @@ class DataCleaningAgent(CompiledStateGraph):
  self.response = None
 
  def _make_compiled_graph(self):
- self.response = None
- return make_data_cleaning_agent(**self._params)
-
- def update_params(self, **kwargs):
  """
- Update one or more parameters at once, then rebuild the compiled graph.
- e.g. agent.update_params(model=new_llm, n_samples=100)
+ Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
  """
- self._params.update(kwargs)
- self._compiled_graph = self._make_compiled_graph()
+ self.response=None
+ return make_data_cleaning_agent(**self._params)
 
- def __getattr__(self, name: str):
- """
- Delegate attribute access to `_compiled_graph` if `name` is not
- found in this instance. This 'inherits' methods from the compiled graph.
- """
- return getattr(self._compiled_graph, name)
 
- def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
+ def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
  """
- Cleans the provided dataset based on user instructions.
+ Asynchronously invokes the agent. The response is stored in the response attribute.
 
  Parameters:
- user_instructions (str): Instructions for data cleaning.
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
- max_retries (int): Maximum retry attempts for cleaning.
- retry_count (int): Current retry attempt.
+ ----------
+ data_raw (pd.DataFrame):
+ The raw dataset to be cleaned.
+ user_instructions (str):
+ Instructions for data cleaning agent.
+ max_retries (int):
+ Maximum retry attempts for cleaning.
+ retry_count (int):
+ Current retry attempt.
+ **kwargs
+ Additional keyword arguments to pass to ainvoke().
 
  Returns:
+ --------
  None. The response is stored in the response attribute.
  """
- response = self.ainvoke({
+ response = self._compiled_graph.ainvoke({
  "user_instructions": user_instructions,
  "data_raw": data_raw.to_dict(),
  "max_retries": max_retries,
  "retry_count": retry_count,
- })
+ }, **kwargs)
  self.response = response
  return None
 
- def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
+ def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
  """
- Cleans the provided dataset based on user instructions.
+ Invokes the agent. The response is stored in the response attribute.
 
  Parameters:
- user_instructions (str): Instructions for data cleaning.
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
- max_retries (int): Maximum retry attempts for cleaning.
- retry_count (int): Current retry attempt.
+ ----------
+ data_raw (pd.DataFrame):
+ The raw dataset to be cleaned.
+ user_instructions (str):
+ Instructions for data cleaning agent.
+ max_retries (int):
+ Maximum retry attempts for cleaning.
+ retry_count (int):
+ Current retry attempt.
+ **kwargs
+ Additional keyword arguments to pass to invoke().
 
  Returns:
+ --------
  None. The response is stored in the response attribute.
  """
- response = self.invoke({
+ response = self._compiled_graph.invoke({
  "user_instructions": user_instructions,
  "data_raw": data_raw.to_dict(),
  "max_retries": max_retries,
  "retry_count": retry_count,
- })
+ },**kwargs)
  self.response = response
  return None
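Both renamed entry points accept **kwargs and forward them to the compiled graph's invoke()/ainvoke(). A hedged sketch of passing a LangGraph run configuration through invoke_agent() (the thread_id is arbitrary and only matters when a checkpointer is attached, for example with human_in_the_loop=True):

```python
# Hedged sketch: extra keyword arguments are passed straight through to
# self._compiled_graph.invoke().
import pandas as pd
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import DataCleaningAgent

agent = DataCleaningAgent(model=ChatOpenAI(model="gpt-4o-mini"))

df = pd.DataFrame({"age": [25, None, 41], "plan": ["basic", "pro", None]})

agent.invoke_agent(
    data_raw=df,
    user_instructions="Impute missing values; do not drop any columns.",
    config={"configurable": {"thread_id": "example-thread"}},  # standard LangGraph run config
)
```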
 
@@ -139,30 +249,21 @@ class DataCleaningAgent(CompiledStateGraph):
  messages = self.response.get("messages", [])
  return messages
 
- def get_log_summary(self):
+ def get_log_summary(self, markdown=False):
  """
  Logs a summary of the agent's operations, if logging is enabled.
  """
  if self.response:
- if self.log:
+ if self.response.get('data_cleaner_function_path'):
  log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
- return log_details
-
- def get_state_keys(self):
- """
- Returns a list of keys that the state graph returns in a response.
- """
- return list(self.get_output_jsonschema()['properties'].keys())
-
- def get_state_properties(self):
- """
- Returns a list of keys that the state graph returns in a response.
- """
- return self.get_output_jsonschema()['properties']
+ if markdown:
+ return Markdown(log_details)
+ else:
+ return log_details
 
  def get_data_cleaned(self):
  """
- Retrieves the cleaned data stored after running invoke or clean_data methods.
+ Retrieves the cleaned data stored after running invoke_agent or clean_data methods.
  """
  if self.response:
  return pd.DataFrame(self.response.get("data_cleaned"))
@@ -174,15 +275,25 @@ class DataCleaningAgent(CompiledStateGraph):
  if self.response:
  return pd.DataFrame(self.response.get("data_raw"))
 
- def get_data_cleaner_function(self):
+ def get_data_cleaner_function(self, markdown=False):
  """
  Retrieves the agent's pipeline function.
  """
  if self.response:
- return self.response.get("data_cleaner_function")
-
-
-
+ if markdown:
+ return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
+ else:
+ return self.response.get("data_cleaner_function")
+
+ def get_recommended_cleaning_steps(self, markdown=False):
+ """
+ Retrieves the agent's recommended cleaning steps
+ """
+ if self.response:
+ if markdown:
+ return Markdown(self.response.get('recommended_steps'))
+ else:
+ return self.response.get('recommended_steps')
 
 
 
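The getters above now take a markdown flag and return IPython.display.Markdown objects for notebook rendering. A hedged sketch, continuing from the invoke_agent() example in the class docstring (so data_cleaning_agent.response is already populated):

```python
# Hedged sketch: markdown=True renders nicely in Jupyter; the default still returns strings.
steps_md = data_cleaning_agent.get_recommended_cleaning_steps(markdown=True)
func_md = data_cleaning_agent.get_data_cleaner_function(markdown=True)
log_md = data_cleaning_agent.get_log_summary(markdown=True)  # only populated when log=True

print(data_cleaning_agent.get_data_cleaner_function())  # plain string of the generated code
```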
@@ -194,6 +305,7 @@ def make_data_cleaning_agent(
  log=False,
  log_path=None,
  file_name="data_cleaner.py",
+ function_name="data_cleaner",
  overwrite = True,
  human_in_the_loop=False,
  bypass_recommended_steps=False,
@@ -235,6 +347,8 @@
  "logs/".
  file_name : str, optional
  The name of the file to save the response to. Defaults to "data_cleaner.py".
+ function_name : str, optional
+ The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
  overwrite : bool, optional
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
  Defaults to True.
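The function_name parameter is also exposed on the make_data_cleaning_agent() factory for users who prefer the functional API. A hedged sketch of calling it directly (the toy DataFrame and instructions are illustrative; the state keys mirror the invoke_agent() wrapper shown earlier):

```python
# Hedged sketch: the factory returns a compiled LangGraph app that is invoked with a state dict.
import pandas as pd
from langchain_openai import ChatOpenAI
from ai_data_science_team.agents import make_data_cleaning_agent

app = make_data_cleaning_agent(
    model=ChatOpenAI(model="gpt-4o-mini"),
    function_name="data_cleaner",  # new in this release
    log=False,
)

df = pd.DataFrame({"age": [25, None, 41], "plan": ["basic", "pro", None]})

result = app.invoke({
    "user_instructions": "Impute missing values.",
    "data_raw": df.to_dict(),
    "max_retries": 3,
    "retry_count": 0,
})

print(result["data_cleaner_function_name"])  # "data_cleaner"
```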
@@ -275,6 +389,11 @@
  """
  llm = model
 
+ # Human in the loop requires recommended steps
+ if bypass_recommended_steps and human_in_the_loop:
+ bypass_recommended_steps = False
+ print("Bypass recommended steps set to False to enable human in the loop.")
+
  # Setup Log Directory
  if log:
  if log_path is None:
@@ -292,6 +411,7 @@
  all_datasets_summary: str
  data_cleaner_function: str
  data_cleaner_function_path: str
+ data_cleaner_file_name: str
  data_cleaner_function_name: str
  data_cleaner_error: str
  max_retries: int
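The state now tracks the logged file name separately from the generated function name. A hedged sketch of how these keys surface in the response dictionary (continuing from the class docstring example, with log=True so the path entries are set):

```python
# Hedged sketch of the response keys surfaced by the updated GraphState.
response = data_cleaning_agent.get_response()  # same dict as data_cleaning_agent.response

response["data_cleaner_file_name"]      # e.g. "data_cleaner.py"; new key holding the logged file's name
response["data_cleaner_function_name"]  # e.g. "data_cleaner"; now the function name rather than the file name
response["data_cleaner_function_path"]  # full path of the logged file when log=True
```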
@@ -366,7 +486,7 @@
  })
 
  return {
- "recommended_steps": "\n\n# Recommended Data Cleaning Steps:\n" + recommended_steps.content.strip(),
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
  "all_datasets_summary": all_datasets_summary_str
  }
 
@@ -386,42 +506,44 @@
  else:
  all_datasets_summary_str = state.get("all_datasets_summary")
 
+
  data_cleaning_prompt = PromptTemplate(
  template="""
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
-
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
+
  Recommended Steps:
  {recommended_steps}
-
+
  You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
-
+
  Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:
 
  {all_datasets_summary}
-
- Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
-
+
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
+
  Return code to provide the data cleaning function:
-
- def data_cleaner(data_raw):
+
+ def {function_name}(data_raw):
  import pandas as pd
  import numpy as np
  ...
  return data_cleaned
-
+
  Best Practices and Error Preventions:
-
+
  Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
 
  """,
- input_variables=["recommended_steps", "all_datasets_summary"]
+ input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
  )
 
  data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
 
  response = data_cleaning_agent.invoke({
  "recommended_steps": state.get("recommended_steps"),
- "all_datasets_summary": all_datasets_summary_str
+ "all_datasets_summary": all_datasets_summary_str,
+ "function_name": function_name
  })
 
  response = relocate_imports_inside_function(response)
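For reference, a hand-written illustration (not actual model output) of the kind of function this prompt asks the model to return when function_name is "data_cleaner":

```python
# Illustrative only: a plausible generated cleaning function of the required shape.
def data_cleaner(data_raw):
    import pandas as pd
    import numpy as np
    from sklearn.impute import SimpleImputer

    data_cleaned = pd.DataFrame(data_raw).copy()

    # Impute numeric columns with the mean; .ravel() flattens the 2D array
    # returned by fit_transform(), per the best-practice note above.
    for col in data_cleaned.select_dtypes(include=np.number).columns:
        imputer = SimpleImputer(strategy="mean")
        data_cleaned[col] = imputer.fit_transform(data_cleaned[[col]]).ravel()

    # Remove duplicate rows.
    data_cleaned = data_cleaned.drop_duplicates()

    return data_cleaned
```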
@@ -439,19 +561,37 @@ def make_data_cleaning_agent(
  return {
  "data_cleaner_function" : response,
  "data_cleaner_function_path": file_path,
- "data_cleaner_function_name": file_name_2,
+ "data_cleaner_file_name": file_name_2,
+ "data_cleaner_function_name": function_name,
  "all_datasets_summary": all_datasets_summary_str
  }
+
+ # Human Review
+
+ prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
 
- def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
- return node_func_human_review(
- state=state,
- prompt_text="Is the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
- yes_goto="create_data_cleaner_code",
- no_goto="recommend_cleaning_steps",
- user_instructions_key="user_instructions",
- recommended_steps_key="recommended_steps"
- )
+ if not bypass_explain_code:
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
+ return node_func_human_review(
+ state=state,
+ prompt_text=prompt_text_human_review,
+ yes_goto= 'explain_data_cleaner_code',
+ no_goto="recommend_cleaning_steps",
+ user_instructions_key="user_instructions",
+ recommended_steps_key="recommended_steps",
+ code_snippet_key="data_cleaner_function",
+ )
+ else:
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
+ return node_func_human_review(
+ state=state,
+ prompt_text=prompt_text_human_review,
+ yes_goto= '__end__',
+ no_goto="recommend_cleaning_steps",
+ user_instructions_key="user_instructions",
+ recommended_steps_key="recommended_steps",
+ code_snippet_key="data_cleaner_function",
+ )
 
  def execute_data_cleaner_code(state):
  return node_func_execute_agent_code_on_data(
@@ -460,7 +600,7 @@ def make_data_cleaning_agent(
  result_key="data_cleaned",
  error_key="data_cleaner_error",
  code_snippet_key="data_cleaner_function",
- agent_function_name="data_cleaner",
+ agent_function_name=state.get("data_cleaner_function_name"),
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
  error_message_prefix="An error occurred during data cleaning: "
@@ -468,11 +608,11 @@
 
  def fix_data_cleaner_code(state: GraphState):
  data_cleaner_prompt = """
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
 
- Make sure to only return the function definition for data_cleaner().
+ Make sure to only return the function definition for {function_name}().
 
- Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
 
  This is the broken code (please fix):
  {code_snippet}
@@ -490,6 +630,7 @@ def make_data_cleaning_agent(
  agent_name=AGENT_NAME,
  log=log,
  file_path=state.get("data_cleaner_function_path"),
+ function_name=state.get("data_cleaner_function_name"),
  )
 
  def explain_data_cleaner_code(state: GraphState):