ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,18 +14,25 @@ from langgraph.types import Command
14
14
  from langgraph.checkpoint.memory import MemorySaver
15
15
 
16
16
  import os
17
- import io
18
17
  import pandas as pd
19
18
 
19
+ from IPython.display import Markdown
20
+
20
21
  from ai_data_science_team.templates import(
21
22
  node_func_execute_agent_code_on_data,
22
23
  node_func_human_review,
23
24
  node_func_fix_agent_code,
24
25
  node_func_explain_agent_code,
25
- create_coding_agent_graph
26
+ create_coding_agent_graph,
27
+ BaseAgent,
26
28
  )
27
29
  from ai_data_science_team.tools.parsers import PythonOutputParser
28
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
30
+ from ai_data_science_team.tools.regex import (
31
+ relocate_imports_inside_function,
32
+ add_comments_to_top,
33
+ format_agent_name,
34
+ format_recommended_steps
35
+ )
29
36
  from ai_data_science_team.tools.metadata import get_dataframe_summary
30
37
  from ai_data_science_team.tools.logging import log_ai_function
31
38
 
@@ -33,6 +40,372 @@ from ai_data_science_team.tools.logging import log_ai_function
33
40
  AGENT_NAME = "feature_engineering_agent"
34
41
  LOG_PATH = os.path.join(os.getcwd(), "logs/")
35
42
 
43
+ # Class
44
+
45
class FeatureEngineeringAgent(BaseAgent):
    """
    Creates a feature engineering agent that can process datasets based on user-defined instructions or
    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
    and logs the process, including code and errors. It is designed to facilitate reproducible and
    customizable feature engineering workflows.

    The agent can perform the following default feature engineering steps unless instructed otherwise:
    - Convert features to appropriate data types
    - Remove features that have unique values for each row
    - Remove constant features
    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
    - One-hot-encode categorical variables
    - Convert booleans to integer (1/0)
    - Create datetime-based features (if applicable)
    - Handle target variable encoding if specified
    - Any user-provided instructions to add, remove, or modify steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the feature engineering function.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
    function_name : str, optional
        Name of the generated feature engineering function. Defaults to "feature_engineer".
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of feature engineering instructions. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the default recommended steps. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the step that provides code explanations. Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    )
        Coroutine. Engineers features from the provided dataset asynchronously based on user instructions.
    invoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    )
        Engineers features from the provided dataset synchronously based on user instructions.
    explain_feature_engineering_steps()
        Returns the "messages" entry of the agent's response.
    get_log_summary()
        Retrieves a summary of logged operations if logging is enabled.
    get_data_engineered()
        Retrieves the feature-engineered dataset as a pandas DataFrame.
    get_data_raw()
        Retrieves the raw dataset as a pandas DataFrame.
    get_feature_engineer_function()
        Retrieves the generated Python function used for feature engineering.
    get_recommended_feature_engineering_steps()
        Retrieves the agent's recommended feature engineering steps.
    get_response()
        Returns the response from the agent as a dictionary.
    show()
        Displays the agent's mermaid diagram.

    Examples
    --------
    ```python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import FeatureEngineeringAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_agent = FeatureEngineeringAgent(
        model=llm,
        n_samples=30,
        log=True,
        log_path="logs",
        human_in_the_loop=True
    )

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    feature_agent.invoke_agent(
        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
        data_raw=df,
        target_variable="Churn",
        max_retries=3,
        retry_count=0
    )

    engineered_data = feature_agent.get_data_engineered()
    response = feature_agent.get_response()
    ```

    Notes
    -----
    The instance wraps the graph returned by ``make_feature_engineering_agent``;
    it is held in ``_compiled_graph`` and rebuilt whenever parameters change.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="feature_engineer.py",
        function_name="feature_engineer",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # The parameter dict is forwarded verbatim to make_feature_engineering_agent,
        # so its keys must match that function's keyword arguments.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Create the compiled graph for the feature engineering agent.
        Running this method will reset the response to None.
        """
        # A response produced by a previous graph would be stale after a rebuild.
        self.response = None
        return make_feature_engineering_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values. They are merged into the
            stored parameters before the graph is rebuilt.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    @staticmethod
    def _build_graph_inputs(data_raw, user_instructions, target_variable, max_retries, retry_count):
        """
        Assemble the input state dictionary passed to the compiled graph.
        """
        return {
            "user_instructions": user_instructions,
            # The graph state carries the data as a plain dict, not a DataFrame.
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        This is a coroutine and must be awaited; the graph's ``ainvoke()``
        result is awaited before it is stored, so ``response`` holds the
        finished result rather than a pending coroutine object.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to ainvoke().

        Returns
        -------
        None
        """
        inputs = self._build_graph_inputs(
            data_raw, user_instructions, target_variable, max_retries, retry_count
        )
        self.response = await self._compiled_graph.ainvoke(inputs, **kwargs)
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering agent.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to invoke().

        Returns
        -------
        None
        """
        inputs = self._build_graph_inputs(
            data_raw, user_instructions, target_variable, max_retries, retry_count
        )
        self.response = self._compiled_graph.invoke(inputs, **kwargs)
        return None

    def explain_feature_engineering_steps(self):
        """
        Provides an explanation of the feature engineering steps performed by the agent.

        Returns
        -------
        str or list
            The "messages" entry of the response, or an empty list when no
            response (or no such entry) is available.
        """
        if self.response:
            return self.response.get("messages", [])
        return []

    def get_log_summary(self, markdown=False):
        """
        Logs a summary of the agent's operations, if logging is enabled.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns Markdown-formatted output.

        Returns
        -------
        str or None
            Summary of logs, or None if not available.
        """
        if self.response and self.response.get("feature_engineer_function_path"):
            log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
            if markdown:
                return Markdown(log_details)
            return log_details
        return None

    def get_data_engineered(self):
        """
        Retrieves the engineered data stored after running invoke/ainvoke.

        Returns
        -------
        pd.DataFrame or None
            The engineered dataset as a pandas DataFrame.
        """
        if self.response and "data_engineered" in self.response:
            return pd.DataFrame(self.response["data_engineered"])
        return None

    def get_data_raw(self):
        """
        Retrieves the raw data.

        Returns
        -------
        pd.DataFrame or None
            The raw dataset as a pandas DataFrame if available.
        """
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_feature_engineer_function(self, markdown=False):
        """
        Retrieves the feature engineering function generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the function in Markdown code block format.

        Returns
        -------
        str or None
            The Python function code, or None if unavailable.
        """
        if self.response and "feature_engineer_function" in self.response:
            code = self.response["feature_engineer_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_feature_engineering_steps(self, markdown=False):
        """
        Retrieves the agent's recommended feature engineering steps.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the steps in Markdown format.

        Returns
        -------
        str or None
            The recommended steps, or None if not available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_response(self):
        """
        Returns the agent's full response dictionary.

        Returns
        -------
        dict or None
            The response dictionary if available, otherwise None.
        """
        return self.response

    def show(self):
        """
        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
        """
        # NOTE(review): relies on the compiled graph object exposing show();
        # confirm against what create_coding_agent_graph returns.
        return self._compiled_graph.show()
407
+
408
+
36
409
  # * Feature Engineering Agent
37
410
 
38
411
  def make_feature_engineering_agent(
@@ -41,6 +414,7 @@ def make_feature_engineering_agent(
41
414
  log=False,
42
415
  log_path=None,
43
416
  file_name="feature_engineer.py",
417
+ function_name="feature_engineer",
44
418
  overwrite = True,
45
419
  human_in_the_loop=False,
46
420
  bypass_recommended_steps=False,
@@ -82,6 +456,8 @@ def make_feature_engineering_agent(
82
456
  The path to the directory where the log files should be stored. Defaults to "logs/".
83
457
  file_name : str, optional
84
458
  The name of the file to save the log to. Defaults to "feature_engineer.py".
459
+ function_name : str, optional
460
+ The name of the function that will be generated. Defaults to "feature_engineer".
85
461
  overwrite : bool, optional
86
462
  Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
87
463
  Defaults to True.
@@ -122,6 +498,11 @@ def make_feature_engineering_agent(
122
498
  The feature engineering agent as a state graph.
123
499
  """
124
500
  llm = model
501
+
502
+ # Human in the loop requires recommended steps
503
+ if bypass_recommended_steps and human_in_the_loop:
504
+ bypass_recommended_steps = False
505
+ print("Bypass recommended steps set to False to enable human in the loop.")
125
506
 
126
507
  # Setup Log Directory
127
508
  if log:
@@ -141,6 +522,7 @@ def make_feature_engineering_agent(
141
522
  all_datasets_summary: str
142
523
  feature_engineer_function: str
143
524
  feature_engineer_function_path: str
525
+ feature_engineer_file_name: str
144
526
  feature_engineer_function_name: str
145
527
  feature_engineer_error: str
146
528
  max_retries: int
@@ -218,19 +600,36 @@ def make_feature_engineering_agent(
218
600
  })
219
601
 
220
602
  return {
221
- "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
603
+ "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
222
604
  "all_datasets_summary": all_datasets_summary_str
223
605
  }
224
606
 
225
- def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
226
- return node_func_human_review(
227
- state=state,
228
- prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
229
- yes_goto="create_feature_engineering_code",
230
- no_goto="recommend_feature_engineering_steps",
231
- user_instructions_key="user_instructions",
232
- recommended_steps_key="recommended_steps"
233
- )
607
+ # Human Review
608
+
609
+ prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
610
+
611
+ if not bypass_explain_code:
612
+ def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
613
+ return node_func_human_review(
614
+ state=state,
615
+ prompt_text=prompt_text_human_review,
616
+ yes_goto= 'explain_feature_engineering_code',
617
+ no_goto="recommend_feature_engineering_steps",
618
+ user_instructions_key="user_instructions",
619
+ recommended_steps_key="recommended_steps",
620
+ code_snippet_key="feature_engineer_function",
621
+ )
622
+ else:
623
+ def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
624
+ return node_func_human_review(
625
+ state=state,
626
+ prompt_text=prompt_text_human_review,
627
+ yes_goto= '__end__',
628
+ no_goto="recommend_feature_engineering_steps",
629
+ user_instructions_key="user_instructions",
630
+ recommended_steps_key="recommended_steps",
631
+ code_snippet_key="feature_engineer_function",
632
+ )
234
633
 
235
634
  def create_feature_engineering_code(state: GraphState):
236
635
  if bypass_recommended_steps:
@@ -251,7 +650,7 @@ def make_feature_engineering_agent(
251
650
  feature_engineering_prompt = PromptTemplate(
252
651
  template="""
253
652
 
254
- You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
653
+ You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
255
654
 
256
655
  Recommended Steps:
257
656
  {recommended_steps}
@@ -265,11 +664,11 @@ def make_feature_engineering_agent(
265
664
 
266
665
  You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
267
666
 
268
- Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.
667
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
269
668
 
270
669
  Return code to provide the feature engineering function:
271
670
 
272
- def feature_engineer(data_raw):
671
+ def {function_name}(data_raw):
273
672
  import pandas as pd
274
673
  import numpy as np
275
674
  ...
@@ -292,7 +691,7 @@ def make_feature_engineering_agent(
292
691
 
293
692
 
294
693
  """,
295
- input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
694
+ input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
296
695
  )
297
696
 
298
697
  feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
@@ -301,6 +700,7 @@ def make_feature_engineering_agent(
301
700
  "recommended_steps": state.get("recommended_steps"),
302
701
  "target_variable": state.get("target_variable"),
303
702
  "all_datasets_summary": all_datasets_summary_str,
703
+ "function_name": function_name
304
704
  })
305
705
 
306
706
  response = relocate_imports_inside_function(response)
@@ -318,12 +718,11 @@ def make_feature_engineering_agent(
318
718
  return {
319
719
  "feature_engineer_function": response,
320
720
  "feature_engineer_function_path": file_path,
321
- "feature_engineer_function_name": file_name_2,
721
+ "feature_engineer_file_name": file_name_2,
722
+ "feature_engineer_function_name": function_name,
322
723
  "all_datasets_summary": all_datasets_summary_str
323
724
  }
324
725
 
325
-
326
-
327
726
  def execute_feature_engineering_code(state):
328
727
  return node_func_execute_agent_code_on_data(
329
728
  state=state,
@@ -331,7 +730,7 @@ def make_feature_engineering_agent(
331
730
  result_key="data_engineered",
332
731
  error_key="feature_engineer_error",
333
732
  code_snippet_key="feature_engineer_function",
334
- agent_function_name="feature_engineer",
733
+ agent_function_name=state.get("feature_engineer_function_name"),
335
734
  pre_processing=lambda data: pd.DataFrame.from_dict(data),
336
735
  post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
337
736
  error_message_prefix="An error occurred during feature engineering: "
@@ -339,11 +738,13 @@ def make_feature_engineering_agent(
339
738
 
340
739
  def fix_feature_engineering_code(state: GraphState):
341
740
  feature_engineer_prompt = """
342
- You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.
741
+ You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
742
+
743
+ Provide only the corrected function definition for {function_name}().
343
744
 
344
- Provide only the corrected function definition.
745
+ Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
345
746
 
346
- Broken code:
747
+ This is the broken code (please fix):
347
748
  {code_snippet}
348
749
 
349
750
  Last Known Error:
@@ -359,6 +760,7 @@ def make_feature_engineering_agent(
359
760
  agent_name=AGENT_NAME,
360
761
  log=log,
361
762
  file_path=state.get("feature_engineer_function_path"),
763
+ function_name=state.get("feature_engineer_function_name"),
362
764
  )
363
765
 
364
766
  def explain_feature_engineering_code(state: GraphState):
@@ -395,9 +797,11 @@ def make_feature_engineering_agent(
395
797
  fix_code_node_name="fix_feature_engineering_code",
396
798
  explain_code_node_name="explain_feature_engineering_code",
397
799
  error_key="feature_engineer_error",
800
+ max_retries_key = "max_retries",
801
+ retry_count_key = "retry_count",
398
802
  human_in_the_loop=human_in_the_loop,
399
803
  human_review_node_name="human_review",
400
- checkpointer=MemorySaver() if human_in_the_loop else None,
804
+ checkpointer=MemorySaver(),
401
805
  bypass_recommended_steps=bypass_recommended_steps,
402
806
  bypass_explain_code=bypass_explain_code,
403
807
  )