ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- __version__ = "0.0.0.9007"
1
+ __version__ = "0.0.0.9008"
@@ -1,6 +1,6 @@
1
1
  from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
2
- from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
3
- from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
4
- from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
5
- from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
2
+ from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent, FeatureEngineeringAgent
3
+ from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent, DataWranglingAgent
4
+ from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent, SQLDatabaseAgent
5
+ from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent, DataVisualizationAgent
6
6
 
@@ -13,21 +13,22 @@ from langchain_core.messages import BaseMessage
13
13
  from langgraph.types import Command
14
14
  from langgraph.checkpoint.memory import MemorySaver
15
15
 
16
- from langgraph.graph.state import CompiledStateGraph
17
-
18
16
  import os
19
17
  import io
20
18
  import pandas as pd
21
19
 
20
+ from IPython.display import Markdown
21
+
22
22
  from ai_data_science_team.templates import(
23
23
  node_func_execute_agent_code_on_data,
24
24
  node_func_human_review,
25
25
  node_func_fix_agent_code,
26
26
  node_func_explain_agent_code,
27
- create_coding_agent_graph
27
+ create_coding_agent_graph,
28
+ BaseAgent,
28
29
  )
29
30
  from ai_data_science_team.tools.parsers import PythonOutputParser
30
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
31
+ from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name, format_recommended_steps
31
32
  from ai_data_science_team.tools.metadata import get_dataframe_summary
32
33
  from ai_data_science_team.tools.logging import log_ai_function
33
34
 
@@ -38,7 +39,109 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
38
39
 
39
40
 
40
41
  # Class
41
- class DataCleaningAgent(CompiledStateGraph):
42
+ class DataCleaningAgent(BaseAgent):
43
+ """
44
+ Creates a data cleaning agent that can process datasets based on user-defined instructions or default cleaning steps.
45
+ The agent generates a Python function to clean the dataset, performs the cleaning, and logs the process, including code
46
+ and errors. It is designed to facilitate reproducible and customizable data cleaning workflows.
47
+
48
+ The agent performs the following default cleaning steps unless instructed otherwise:
49
+
50
+ - Removing columns with more than 40% missing values.
51
+ - Imputing missing values with the mean for numeric columns.
52
+ - Imputing missing values with the mode for categorical columns.
53
+ - Converting columns to appropriate data types.
54
+ - Removing duplicate rows.
55
+ - Removing rows with missing values.
56
+ - Removing rows with extreme outliers (values 3x the interquartile range).
57
+
58
+ User instructions can modify, add, or remove any of these steps to tailor the cleaning process.
59
+
60
+ Parameters
61
+ ----------
62
+ model : langchain.llms.base.LLM
63
+ The language model used to generate the data cleaning function.
64
+ n_samples : int, optional
65
+ Number of samples used when summarizing the dataset. Defaults to 30. Reducing this number can help
66
+ avoid exceeding the model's token limits.
67
+ log : bool, optional
68
+ Whether to log the generated code and errors. Defaults to False.
69
+ log_path : str, optional
70
+ Directory path for storing log files. Defaults to None.
71
+ file_name : str, optional
72
+ Name of the file for saving the generated response. Defaults to "data_cleaner.py".
73
+ function_name : str, optional
74
+ Name of the generated data cleaning function. Defaults to "data_cleaner".
75
+ overwrite : bool, optional
76
+ Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
77
+ human_in_the_loop : bool, optional
78
+ Enables user review of data cleaning instructions. Defaults to False.
79
+ bypass_recommended_steps : bool, optional
80
+ If True, skips the default recommended cleaning steps. Defaults to False.
81
+ bypass_explain_code : bool, optional
82
+ If True, skips the step that provides code explanations. Defaults to False.
83
+
84
+ Methods
85
+ -------
86
+ update_params(**kwargs)
87
+ Updates the agent's parameters and rebuilds the compiled state graph.
88
+ ainvoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
89
+ Cleans the provided dataset asynchronously based on user instructions.
90
+ invoke_agent(data_raw: pd.DataFrame, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs)
91
+ Cleans the provided dataset synchronously based on user instructions.
92
+ explain_cleaning_steps()
93
+ Returns an explanation of the cleaning steps performed by the agent.
94
+ get_log_summary()
95
+ Retrieves a summary of logged operations if logging is enabled.
96
+ get_state_keys()
97
+ Returns a list of keys from the state graph response.
98
+ get_state_properties()
99
+ Returns detailed properties of the state graph response.
100
+ get_data_cleaned()
101
+ Retrieves the cleaned dataset as a pandas DataFrame.
102
+ get_data_raw()
103
+ Retrieves the raw dataset as a pandas DataFrame.
104
+ get_data_cleaner_function()
105
+ Retrieves the generated Python function used for cleaning the data.
106
+ get_recommended_cleaning_steps()
107
+ Retrieves the agent's recommended cleaning steps.
108
+ get_response()
109
+ Returns the response from the agent as a dictionary.
110
+ show()
111
+ Displays the agent's mermaid diagram.
112
+
113
+ Examples
114
+ --------
115
+ ```python
116
+ import pandas as pd
117
+ from langchain_openai import ChatOpenAI
118
+ from ai_data_science_team.agents import DataCleaningAgent
119
+
120
+ llm = ChatOpenAI(model="gpt-4o-mini")
121
+
122
+ data_cleaning_agent = DataCleaningAgent(
123
+ model=llm, n_samples=50, log=True, log_path="logs", human_in_the_loop=True
124
+ )
125
+
126
+ df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
127
+
128
+ data_cleaning_agent.invoke_agent(
129
+ user_instructions="Don't remove outliers when cleaning the data.",
130
+ data_raw=df,
131
+ max_retries=3,
132
+ retry_count=0
133
+ )
134
+
135
+ cleaned_data = data_cleaning_agent.get_data_cleaned()
136
+
137
+ response = data_cleaning_agent.response
138
+ ```
139
+
140
+ Returns
141
+ --------
142
+ DataCleaningAgent : langgraph.graph.state.CompiledStateGraph
143
+ A data cleaning agent implemented as a compiled state graph.
144
+ """
42
145
 
43
146
  def __init__(
44
147
  self,
@@ -47,6 +150,7 @@ class DataCleaningAgent(CompiledStateGraph):
47
150
  log=False,
48
151
  log_path=None,
49
152
  file_name="data_cleaner.py",
153
+ function_name="data_cleaner",
50
154
  overwrite=True,
51
155
  human_in_the_loop=False,
52
156
  bypass_recommended_steps=False,
@@ -58,6 +162,7 @@ class DataCleaningAgent(CompiledStateGraph):
58
162
  "log": log,
59
163
  "log_path": log_path,
60
164
  "file_name": file_name,
165
+ "function_name": function_name,
61
166
  "overwrite": overwrite,
62
167
  "human_in_the_loop": human_in_the_loop,
63
168
  "bypass_recommended_steps": bypass_recommended_steps,
@@ -67,65 +172,70 @@ class DataCleaningAgent(CompiledStateGraph):
67
172
  self.response = None
68
173
 
69
174
  def _make_compiled_graph(self):
70
- self.response = None
71
- return make_data_cleaning_agent(**self._params)
72
-
73
- def update_params(self, **kwargs):
74
175
  """
75
- Update one or more parameters at once, then rebuild the compiled graph.
76
- e.g. agent.update_params(model=new_llm, n_samples=100)
176
+ Create the compiled graph for the data cleaning agent. Running this method will reset the response to None.
77
177
  """
78
- self._params.update(kwargs)
79
- self._compiled_graph = self._make_compiled_graph()
178
+ self.response=None
179
+ return make_data_cleaning_agent(**self._params)
80
180
 
81
- def __getattr__(self, name: str):
82
- """
83
- Delegate attribute access to `_compiled_graph` if `name` is not
84
- found in this instance. This 'inherits' methods from the compiled graph.
85
- """
86
- return getattr(self._compiled_graph, name)
87
181
 
88
- def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
182
+ def ainvoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
89
183
  """
90
- Cleans the provided dataset based on user instructions.
184
+ Asynchronously invokes the agent. The response is stored in the response attribute.
91
185
 
92
186
  Parameters:
93
- user_instructions (str): Instructions for data cleaning.
94
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
95
- max_retries (int): Maximum retry attempts for cleaning.
96
- retry_count (int): Current retry attempt.
187
+ ----------
188
+ data_raw (pd.DataFrame):
189
+ The raw dataset to be cleaned.
190
+ user_instructions (str):
191
+ Instructions for data cleaning agent.
192
+ max_retries (int):
193
+ Maximum retry attempts for cleaning.
194
+ retry_count (int):
195
+ Current retry attempt.
196
+ **kwargs
197
+ Additional keyword arguments to pass to ainvoke().
97
198
 
98
199
  Returns:
200
+ --------
99
201
  None. The response is stored in the response attribute.
100
202
  """
101
- response = self.ainvoke({
203
+ response = self._compiled_graph.ainvoke({
102
204
  "user_instructions": user_instructions,
103
205
  "data_raw": data_raw.to_dict(),
104
206
  "max_retries": max_retries,
105
207
  "retry_count": retry_count,
106
- })
208
+ }, **kwargs)
107
209
  self.response = response
108
210
  return None
109
211
 
110
- def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
212
+ def invoke_agent(self, data_raw: pd.DataFrame, user_instructions: str=None, max_retries:int=3, retry_count:int=0, **kwargs):
111
213
  """
112
- Cleans the provided dataset based on user instructions.
214
+ Invokes the agent. The response is stored in the response attribute.
113
215
 
114
216
  Parameters:
115
- user_instructions (str): Instructions for data cleaning.
116
- data_raw (pd.DataFrame): The raw dataset to be cleaned.
117
- max_retries (int): Maximum retry attempts for cleaning.
118
- retry_count (int): Current retry attempt.
217
+ ----------
218
+ data_raw (pd.DataFrame):
219
+ The raw dataset to be cleaned.
220
+ user_instructions (str):
221
+ Instructions for data cleaning agent.
222
+ max_retries (int):
223
+ Maximum retry attempts for cleaning.
224
+ retry_count (int):
225
+ Current retry attempt.
226
+ **kwargs
227
+ Additional keyword arguments to pass to invoke().
119
228
 
120
229
  Returns:
230
+ --------
121
231
  None. The response is stored in the response attribute.
122
232
  """
123
- response = self.invoke({
233
+ response = self._compiled_graph.invoke({
124
234
  "user_instructions": user_instructions,
125
235
  "data_raw": data_raw.to_dict(),
126
236
  "max_retries": max_retries,
127
237
  "retry_count": retry_count,
128
- })
238
+ },**kwargs)
129
239
  self.response = response
130
240
  return None
131
241
 
@@ -139,30 +249,21 @@ class DataCleaningAgent(CompiledStateGraph):
139
249
  messages = self.response.get("messages", [])
140
250
  return messages
141
251
 
142
- def get_log_summary(self):
252
+ def get_log_summary(self, markdown=False):
143
253
  """
144
254
  Returns the log file path recorded in the agent's response, if one was logged.
145
255
  """
146
256
  if self.response:
147
- if self.log:
257
+ if self.response.get('data_cleaner_function_path'):
148
258
  log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
149
- return log_details
150
-
151
- def get_state_keys(self):
152
- """
153
- Returns a list of keys that the state graph returns in a response.
154
- """
155
- return list(self.get_output_jsonschema()['properties'].keys())
156
-
157
- def get_state_properties(self):
158
- """
159
- Returns a list of keys that the state graph returns in a response.
160
- """
161
- return self.get_output_jsonschema()['properties']
259
+ if markdown:
260
+ return Markdown(log_details)
261
+ else:
262
+ return log_details
162
263
 
163
264
  def get_data_cleaned(self):
164
265
  """
165
- Retrieves the cleaned data stored after running invoke or clean_data methods.
266
+ Retrieves the cleaned data stored after running invoke_agent or clean_data methods.
166
267
  """
167
268
  if self.response:
168
269
  return pd.DataFrame(self.response.get("data_cleaned"))
@@ -174,15 +275,25 @@ class DataCleaningAgent(CompiledStateGraph):
174
275
  if self.response:
175
276
  return pd.DataFrame(self.response.get("data_raw"))
176
277
 
177
- def get_data_cleaner_function(self):
278
+ def get_data_cleaner_function(self, markdown=False):
178
279
  """
179
280
  Retrieves the agent's pipeline function.
180
281
  """
181
282
  if self.response:
182
- return self.response.get("data_cleaner_function")
183
-
184
-
185
-
283
+ if markdown:
284
+ return Markdown(f"```python\n{self.response.get('data_cleaner_function')}\n```")
285
+ else:
286
+ return self.response.get("data_cleaner_function")
287
+
288
+ def get_recommended_cleaning_steps(self, markdown=False):
289
+ """
290
+ Retrieves the agent's recommended cleaning steps
291
+ """
292
+ if self.response:
293
+ if markdown:
294
+ return Markdown(self.response.get('recommended_steps'))
295
+ else:
296
+ return self.response.get('recommended_steps')
186
297
 
187
298
 
188
299
 
@@ -194,6 +305,7 @@ def make_data_cleaning_agent(
194
305
  log=False,
195
306
  log_path=None,
196
307
  file_name="data_cleaner.py",
308
+ function_name="data_cleaner",
197
309
  overwrite = True,
198
310
  human_in_the_loop=False,
199
311
  bypass_recommended_steps=False,
@@ -235,6 +347,8 @@ def make_data_cleaning_agent(
235
347
  "logs/".
236
348
  file_name : str, optional
237
349
  The name of the file to save the response to. Defaults to "data_cleaner.py".
350
+ function_name : str, optional
351
+ The name of the function that will be generated to clean the data. Defaults to "data_cleaner".
238
352
  overwrite : bool, optional
239
353
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
240
354
  Defaults to True.
@@ -275,6 +389,11 @@ def make_data_cleaning_agent(
275
389
  """
276
390
  llm = model
277
391
 
392
+ # Human in the loop requires recommended steps
393
+ if bypass_recommended_steps and human_in_the_loop:
394
+ bypass_recommended_steps = False
395
+ print("Bypass recommended steps set to False to enable human in the loop.")
396
+
278
397
  # Setup Log Directory
279
398
  if log:
280
399
  if log_path is None:
@@ -292,6 +411,7 @@ def make_data_cleaning_agent(
292
411
  all_datasets_summary: str
293
412
  data_cleaner_function: str
294
413
  data_cleaner_function_path: str
414
+ data_cleaner_file_name: str
295
415
  data_cleaner_function_name: str
296
416
  data_cleaner_error: str
297
417
  max_retries: int
@@ -366,7 +486,7 @@ def make_data_cleaning_agent(
366
486
  })
367
487
 
368
488
  return {
369
- "recommended_steps": "\n\n# Recommended Data Cleaning Steps:\n" + recommended_steps.content.strip(),
489
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Cleaning Steps:"),
370
490
  "all_datasets_summary": all_datasets_summary_str
371
491
  }
372
492
 
@@ -386,42 +506,44 @@ def make_data_cleaning_agent(
386
506
  else:
387
507
  all_datasets_summary_str = state.get("all_datasets_summary")
388
508
 
509
+
389
510
  data_cleaning_prompt = PromptTemplate(
390
511
  template="""
391
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
392
-
512
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
513
+
393
514
  Recommended Steps:
394
515
  {recommended_steps}
395
-
516
+
396
517
  You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
397
-
518
+
398
519
  Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:
399
520
 
400
521
  {all_datasets_summary}
401
-
402
- Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
403
-
522
+
523
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
524
+
404
525
  Return code to provide the data cleaning function:
405
-
406
- def data_cleaner(data_raw):
526
+
527
+ def {function_name}(data_raw):
407
528
  import pandas as pd
408
529
  import numpy as np
409
530
  ...
410
531
  return data_cleaned
411
-
532
+
412
533
  Best Practices and Error Preventions:
413
-
534
+
414
535
  Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
415
536
 
416
537
  """,
417
- input_variables=["recommended_steps", "all_datasets_summary"]
538
+ input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
418
539
  )
419
540
 
420
541
  data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
421
542
 
422
543
  response = data_cleaning_agent.invoke({
423
544
  "recommended_steps": state.get("recommended_steps"),
424
- "all_datasets_summary": all_datasets_summary_str
545
+ "all_datasets_summary": all_datasets_summary_str,
546
+ "function_name": function_name
425
547
  })
426
548
 
427
549
  response = relocate_imports_inside_function(response)
@@ -439,19 +561,37 @@ def make_data_cleaning_agent(
439
561
  return {
440
562
  "data_cleaner_function" : response,
441
563
  "data_cleaner_function_path": file_path,
442
- "data_cleaner_function_name": file_name_2,
564
+ "data_cleaner_file_name": file_name_2,
565
+ "data_cleaner_function_name": function_name,
443
566
  "all_datasets_summary": all_datasets_summary_str
444
567
  }
568
+
569
+ # Human Review
570
+
571
+ prompt_text_human_review = "Are the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
445
572
 
446
- def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
447
- return node_func_human_review(
448
- state=state,
449
- prompt_text="Is the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
450
- yes_goto="create_data_cleaner_code",
451
- no_goto="recommend_cleaning_steps",
452
- user_instructions_key="user_instructions",
453
- recommended_steps_key="recommended_steps"
454
- )
573
+ if not bypass_explain_code:
574
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "explain_data_cleaner_code"]]:
575
+ return node_func_human_review(
576
+ state=state,
577
+ prompt_text=prompt_text_human_review,
578
+ yes_goto= 'explain_data_cleaner_code',
579
+ no_goto="recommend_cleaning_steps",
580
+ user_instructions_key="user_instructions",
581
+ recommended_steps_key="recommended_steps",
582
+ code_snippet_key="data_cleaner_function",
583
+ )
584
+ else:
585
+ def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "__end__"]]:
586
+ return node_func_human_review(
587
+ state=state,
588
+ prompt_text=prompt_text_human_review,
589
+ yes_goto= '__end__',
590
+ no_goto="recommend_cleaning_steps",
591
+ user_instructions_key="user_instructions",
592
+ recommended_steps_key="recommended_steps",
593
+ code_snippet_key="data_cleaner_function",
594
+ )
455
595
 
456
596
  def execute_data_cleaner_code(state):
457
597
  return node_func_execute_agent_code_on_data(
@@ -460,7 +600,7 @@ def make_data_cleaning_agent(
460
600
  result_key="data_cleaned",
461
601
  error_key="data_cleaner_error",
462
602
  code_snippet_key="data_cleaner_function",
463
- agent_function_name="data_cleaner",
603
+ agent_function_name=state.get("data_cleaner_function_name"),
464
604
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
465
605
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
466
606
  error_message_prefix="An error occurred during data cleaning: "
@@ -468,11 +608,11 @@ def make_data_cleaning_agent(
468
608
 
469
609
  def fix_data_cleaner_code(state: GraphState):
470
610
  data_cleaner_prompt = """
471
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.
611
+ You are a Data Cleaning Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
472
612
 
473
- Make sure to only return the function definition for data_cleaner().
613
+ Make sure to only return the function definition for {function_name}().
474
614
 
475
- Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
615
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
476
616
 
477
617
  This is the broken code (please fix):
478
618
  {code_snippet}
@@ -490,6 +630,7 @@ def make_data_cleaning_agent(
490
630
  agent_name=AGENT_NAME,
491
631
  log=log,
492
632
  file_path=state.get("data_cleaner_function_path"),
633
+ function_name=state.get("data_cleaner_function_name"),
493
634
  )
494
635
 
495
636
  def explain_data_cleaner_code(state: GraphState):