ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,18 +14,25 @@ from langgraph.types import Command
14
14
  from langgraph.checkpoint.memory import MemorySaver
15
15
 
16
16
  import os
17
- import io
18
17
  import pandas as pd
19
18
 
19
+ from IPython.display import Markdown
20
+
20
21
  from ai_data_science_team.templates import(
21
22
  node_func_execute_agent_code_on_data,
22
23
  node_func_human_review,
23
24
  node_func_fix_agent_code,
24
25
  node_func_explain_agent_code,
25
- create_coding_agent_graph
26
+ create_coding_agent_graph,
27
+ BaseAgent,
26
28
  )
27
29
  from ai_data_science_team.tools.parsers import PythonOutputParser
28
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
30
+ from ai_data_science_team.tools.regex import (
31
+ relocate_imports_inside_function,
32
+ add_comments_to_top,
33
+ format_agent_name,
34
+ format_recommended_steps
35
+ )
29
36
  from ai_data_science_team.tools.metadata import get_dataframe_summary
30
37
  from ai_data_science_team.tools.logging import log_ai_function
31
38
 
@@ -33,6 +40,372 @@ from ai_data_science_team.tools.logging import log_ai_function
33
40
  AGENT_NAME = "feature_engineering_agent"
34
41
  LOG_PATH = os.path.join(os.getcwd(), "logs/")
35
42
 
43
+ # Class
44
+
45
class FeatureEngineeringAgent(BaseAgent):
    """
    Object-oriented wrapper around the feature engineering agent graph.

    The wrapper stores the construction parameters, builds the compiled graph
    with ``make_feature_engineering_agent(**params)``, runs it on a dataset,
    and exposes convenience accessors for the pieces of the resulting
    response dictionary (engineered data, generated function, recommended
    steps, log path, ...).

    The agent can perform the following default feature engineering steps
    unless instructed otherwise:
    - Convert features to appropriate data types
    - Remove features that have unique values for each row
    - Remove constant features
    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
    - One-hot-encode categorical variables
    - Convert booleans to integer (1/0)
    - Create datetime-based features (if applicable)
    - Handle target variable encoding if specified
    - Any user-provided instructions to add, remove, or modify steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the feature engineering function.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the file for saving the generated response.
        Defaults to "feature_engineer.py".
    function_name : str, optional
        Name of the generated feature engineering function.
        Defaults to "feature_engineer".
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. If False, a unique
        file name is created. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of feature engineering instructions.
        Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the default recommended steps. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the step that provides code explanations.
        Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Coroutine. Engineers features from the provided dataset
        asynchronously; must be awaited.
    invoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Engineers features from the provided dataset synchronously.
    explain_feature_engineering_steps()
        Returns the "messages" entry of the last response.
    get_log_summary()
        Retrieves the path of the logged function file, if one was recorded.
    get_data_engineered()
        Retrieves the feature-engineered dataset as a pandas DataFrame.
    get_data_raw()
        Retrieves the raw dataset as a pandas DataFrame.
    get_feature_engineer_function()
        Retrieves the generated Python function used for feature engineering.
    get_recommended_feature_engineering_steps()
        Retrieves the agent's recommended feature engineering steps.
    get_response()
        Returns the response from the agent as a dictionary.
    show()
        Displays the agent's mermaid diagram.

    Examples
    --------
    ```python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import FeatureEngineeringAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_agent = FeatureEngineeringAgent(
        model=llm,
        n_samples=30,
        log=True,
        log_path="logs",
        human_in_the_loop=True
    )

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    feature_agent.invoke_agent(
        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
        data_raw=df,
        target_variable="Churn",
        max_retries=3,
        retry_count=0
    )

    engineered_data = feature_agent.get_data_engineered()
    response = feature_agent.get_response()
    ```
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="feature_engineer.py",
        function_name="feature_engineer",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # Parameters are kept in a dict so update_params() can patch them and
        # rebuild the graph with the exact same keyword set.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Create the compiled graph for the feature engineering agent.
        Running this method will reset the response to None.
        """
        self.response = None
        return make_feature_engineering_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values; they are merged into the
            stored parameters before the graph is rebuilt.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        This is a coroutine and must be awaited
        (``await agent.ainvoke_agent(...)``).

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to ainvoke().

        Returns
        -------
        None
        """
        # ainvoke() hands back an awaitable; it has to be awaited so that
        # self.response holds the result dict rather than a pending coroutine
        # (which would break every get_*() accessor).
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering agent.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to invoke().

        Returns
        -------
        None
        """
        # The DataFrame is passed into the graph state as a plain dict.
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def explain_feature_engineering_steps(self):
        """
        Provides an explanation of the feature engineering steps performed by the agent.

        Returns
        -------
        list
            The "messages" entry of the last response, or an empty list when
            the agent has not produced a response (or it has no messages).
        """
        if self.response:
            return self.response.get("messages", [])
        return []

    def get_log_summary(self, markdown=False):
        """
        Returns a summary of the agent's logged output, if a log path was recorded.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns Markdown-formatted output.

        Returns
        -------
        IPython.display.Markdown, str or None
            Summary of logs, or None if not available.
        """
        if self.response and self.response.get("feature_engineer_function_path"):
            log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
            if markdown:
                return Markdown(log_details)
            return log_details
        return None

    def get_data_engineered(self):
        """
        Retrieves the engineered data stored after running invoke/ainvoke.

        Returns
        -------
        pd.DataFrame or None
            The engineered dataset as a pandas DataFrame.
        """
        if self.response and "data_engineered" in self.response:
            return pd.DataFrame(self.response["data_engineered"])
        return None

    def get_data_raw(self):
        """
        Retrieves the raw data.

        Returns
        -------
        pd.DataFrame or None
            The raw dataset as a pandas DataFrame if available.
        """
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_feature_engineer_function(self, markdown=False):
        """
        Retrieves the feature engineering function generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the function in Markdown code block format.

        Returns
        -------
        IPython.display.Markdown, str or None
            The Python function code, or None if unavailable.
        """
        if self.response and "feature_engineer_function" in self.response:
            code = self.response["feature_engineer_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_feature_engineering_steps(self, markdown=False):
        """
        Retrieves the agent's recommended feature engineering steps.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the steps in Markdown format.

        Returns
        -------
        IPython.display.Markdown, str or None
            The recommended steps, or None if not available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_response(self):
        """
        Returns the agent's full response dictionary.

        Returns
        -------
        dict or None
            The response dictionary if available, otherwise None.
        """
        return self.response

    def show(self):
        """
        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
        """
        # NOTE(review): relies on the compiled graph object exposing show();
        # confirm against what make_feature_engineering_agent() returns.
        return self._compiled_graph.show()
407
+
408
+
36
409
  # * Feature Engineering Agent
37
410
 
38
411
  def make_feature_engineering_agent(
@@ -41,6 +414,7 @@ def make_feature_engineering_agent(
41
414
  log=False,
42
415
  log_path=None,
43
416
  file_name="feature_engineer.py",
417
+ function_name="feature_engineer",
44
418
  overwrite = True,
45
419
  human_in_the_loop=False,
46
420
  bypass_recommended_steps=False,
@@ -82,6 +456,8 @@ def make_feature_engineering_agent(
82
456
  The path to the directory where the log files should be stored. Defaults to "logs/".
83
457
  file_name : str, optional
84
458
  The name of the file to save the log to. Defaults to "feature_engineer.py".
459
+ function_name : str, optional
460
+ The name of the function that will be generated. Defaults to "feature_engineer".
85
461
  overwrite : bool, optional
86
462
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
87
463
  Defaults to True.
@@ -122,6 +498,11 @@ def make_feature_engineering_agent(
122
498
  The feature engineering agent as a state graph.
123
499
  """
124
500
  llm = model
501
+
502
+ # Human in the loop requires recommended steps
503
+ if bypass_recommended_steps and human_in_the_loop:
504
+ bypass_recommended_steps = False
505
+ print("Bypass recommended steps set to False to enable human in the loop.")
125
506
 
126
507
  # Setup Log Directory
127
508
  if log:
@@ -141,6 +522,7 @@ def make_feature_engineering_agent(
141
522
  all_datasets_summary: str
142
523
  feature_engineer_function: str
143
524
  feature_engineer_function_path: str
525
+ feature_engineer_file_name: str
144
526
  feature_engineer_function_name: str
145
527
  feature_engineer_error: str
146
528
  max_retries: int
@@ -218,19 +600,36 @@ def make_feature_engineering_agent(
218
600
  })
219
601
 
220
602
  return {
221
- "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
603
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
222
604
  "all_datasets_summary": all_datasets_summary_str
223
605
  }
224
606
 
225
- def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
226
- return node_func_human_review(
227
- state=state,
228
- prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
229
- yes_goto="create_feature_engineering_code",
230
- no_goto="recommend_feature_engineering_steps",
231
- user_instructions_key="user_instructions",
232
- recommended_steps_key="recommended_steps"
233
- )
607
+ # Human Review
608
+
609
+ prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
610
+
611
+ if not bypass_explain_code:
612
+ def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
613
+ return node_func_human_review(
614
+ state=state,
615
+ prompt_text=prompt_text_human_review,
616
+ yes_goto= 'explain_feature_engineering_code',
617
+ no_goto="recommend_feature_engineering_steps",
618
+ user_instructions_key="user_instructions",
619
+ recommended_steps_key="recommended_steps",
620
+ code_snippet_key="feature_engineer_function",
621
+ )
622
+ else:
623
+ def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
624
+ return node_func_human_review(
625
+ state=state,
626
+ prompt_text=prompt_text_human_review,
627
+ yes_goto= '__end__',
628
+ no_goto="recommend_feature_engineering_steps",
629
+ user_instructions_key="user_instructions",
630
+ recommended_steps_key="recommended_steps",
631
+ code_snippet_key="feature_engineer_function",
632
+ )
234
633
 
235
634
  def create_feature_engineering_code(state: GraphState):
236
635
  if bypass_recommended_steps:
@@ -251,7 +650,7 @@ def make_feature_engineering_agent(
251
650
  feature_engineering_prompt = PromptTemplate(
252
651
  template="""
253
652
 
254
- You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
653
+ You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
255
654
 
256
655
  Recommended Steps:
257
656
  {recommended_steps}
@@ -265,11 +664,11 @@ def make_feature_engineering_agent(
265
664
 
266
665
  You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
267
666
 
268
- Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.
667
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
269
668
 
270
669
  Return code to provide the feature engineering function:
271
670
 
272
- def feature_engineer(data_raw):
671
+ def {function_name}(data_raw):
273
672
  import pandas as pd
274
673
  import numpy as np
275
674
  ...
@@ -292,7 +691,7 @@ def make_feature_engineering_agent(
292
691
 
293
692
 
294
693
  """,
295
- input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
694
+ input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
296
695
  )
297
696
 
298
697
  feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
@@ -301,6 +700,7 @@ def make_feature_engineering_agent(
301
700
  "recommended_steps": state.get("recommended_steps"),
302
701
  "target_variable": state.get("target_variable"),
303
702
  "all_datasets_summary": all_datasets_summary_str,
703
+ "function_name": function_name
304
704
  })
305
705
 
306
706
  response = relocate_imports_inside_function(response)
@@ -318,12 +718,11 @@ def make_feature_engineering_agent(
318
718
  return {
319
719
  "feature_engineer_function": response,
320
720
  "feature_engineer_function_path": file_path,
321
- "feature_engineer_function_name": file_name_2,
721
+ "feature_engineer_file_name": file_name_2,
722
+ "feature_engineer_function_name": function_name,
322
723
  "all_datasets_summary": all_datasets_summary_str
323
724
  }
324
725
 
325
-
326
-
327
726
  def execute_feature_engineering_code(state):
328
727
  return node_func_execute_agent_code_on_data(
329
728
  state=state,
@@ -331,7 +730,7 @@ def make_feature_engineering_agent(
331
730
  result_key="data_engineered",
332
731
  error_key="feature_engineer_error",
333
732
  code_snippet_key="feature_engineer_function",
334
- agent_function_name="feature_engineer",
733
+ agent_function_name=state.get("feature_engineer_function_name"),
335
734
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
336
735
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
337
736
  error_message_prefix="An error occurred during feature engineering: "
@@ -339,11 +738,13 @@ def make_feature_engineering_agent(
339
738
 
340
739
  def fix_feature_engineering_code(state: GraphState):
341
740
  feature_engineer_prompt = """
342
- You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.
741
+ You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
742
+
743
+ Provide only the corrected function definition for {function_name}().
343
744
 
344
- Provide only the corrected function definition.
745
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
345
746
 
346
- Broken code:
747
+ This is the broken code (please fix):
347
748
  {code_snippet}
348
749
 
349
750
  Last Known Error:
@@ -359,6 +760,7 @@ def make_feature_engineering_agent(
359
760
  agent_name=AGENT_NAME,
360
761
  log=log,
361
762
  file_path=state.get("feature_engineer_function_path"),
763
+ function_name=state.get("feature_engineer_function_name"),
362
764
  )
363
765
 
364
766
  def explain_feature_engineering_code(state: GraphState):
@@ -395,9 +797,11 @@ def make_feature_engineering_agent(
395
797
  fix_code_node_name="fix_feature_engineering_code",
396
798
  explain_code_node_name="explain_feature_engineering_code",
397
799
  error_key="feature_engineer_error",
800
+ max_retries_key = "max_retries",
801
+ retry_count_key = "retry_count",
398
802
  human_in_the_loop=human_in_the_loop,
399
803
  human_review_node_name="human_review",
400
- checkpointer=MemorySaver() if human_in_the_loop else None,
804
+ checkpointer=MemorySaver(),
401
805
  bypass_recommended_steps=bypass_recommended_steps,
402
806
  bypass_explain_code=bypass_explain_code,
403
807
  )