ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -5
- ai_data_science_team/agents/data_cleaning_agent.py +268 -116
- ai_data_science_team/agents/data_visualization_agent.py +470 -41
- ai_data_science_team/agents/data_wrangling_agent.py +471 -31
- ai_data_science_team/agents/feature_engineering_agent.py +426 -41
- ai_data_science_team/agents/sql_database_agent.py +458 -58
- ai_data_science_team/ml_agents/__init__.py +1 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +3 -1
- ai_data_science_team/templates/agent_templates.py +319 -43
- ai_data_science_team/tools/metadata.py +94 -62
- ai_data_science_team/tools/regex.py +86 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
- ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
- ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,27 @@ from langgraph.types import Command
|
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
16
|
import os
|
17
|
-
import
|
17
|
+
import json
|
18
18
|
import pandas as pd
|
19
19
|
|
20
|
+
from IPython.display import Markdown
|
21
|
+
|
20
22
|
from ai_data_science_team.templates import(
|
21
23
|
node_func_execute_agent_code_on_data,
|
22
24
|
node_func_human_review,
|
23
25
|
node_func_fix_agent_code,
|
24
|
-
|
25
|
-
create_coding_agent_graph
|
26
|
+
node_func_report_agent_outputs,
|
27
|
+
create_coding_agent_graph,
|
28
|
+
BaseAgent,
|
26
29
|
)
|
27
30
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import
|
31
|
+
from ai_data_science_team.tools.regex import (
|
32
|
+
relocate_imports_inside_function,
|
33
|
+
add_comments_to_top,
|
34
|
+
format_agent_name,
|
35
|
+
format_recommended_steps,
|
36
|
+
get_generic_summary,
|
37
|
+
)
|
29
38
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
39
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
40
|
|
@@ -33,6 +42,351 @@ from ai_data_science_team.tools.logging import log_ai_function
|
|
33
42
|
AGENT_NAME = "feature_engineering_agent"
|
34
43
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
44
|
|
45
|
+
# Class
|
46
|
+
|
47
|
+
class FeatureEngineeringAgent(BaseAgent):
    """
    Creates a feature engineering agent that can process datasets based on user-defined instructions or
    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
    and logs the process, including code and errors. It is designed to facilitate reproducible and
    customizable feature engineering workflows.

    The agent can perform the following default feature engineering steps unless instructed otherwise:
    - Convert features to appropriate data types
    - Remove features that have unique values for each row
    - Remove constant features
    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
    - One-hot-encode categorical variables
    - Convert booleans to integer (1/0)
    - Create datetime-based features (if applicable)
    - Handle target variable encoding if specified
    - Any user-provided instructions to add, remove, or modify steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the feature engineering function.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
    function_name : str, optional
        Name of the generated feature engineering function. Defaults to "feature_engineer".
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of feature engineering instructions. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the default recommended steps. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the step that provides code explanations. Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Coroutine. Engineers features from the provided dataset asynchronously based on user instructions.
        Must be awaited.
    invoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Engineers features from the provided dataset synchronously based on user instructions.
    get_workflow_summary()
        Retrieves a summary of the agent's workflow.
    get_log_summary()
        Retrieves a summary of logged operations if logging is enabled.
    get_data_engineered()
        Retrieves the feature-engineered dataset as a pandas DataFrame.
    get_data_raw()
        Retrieves the raw dataset as a pandas DataFrame.
    get_feature_engineer_function()
        Retrieves the generated Python function used for feature engineering.
    get_recommended_feature_engineering_steps()
        Retrieves the agent's recommended feature engineering steps.
    get_response()
        Returns the response from the agent as a dictionary.
    show()
        Displays the agent's mermaid diagram.

    Examples
    --------
    ```python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import FeatureEngineeringAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_agent = FeatureEngineeringAgent(
        model=llm,
        n_samples=30,
        log=True,
        log_path="logs",
        human_in_the_loop=True
    )

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    feature_agent.invoke_agent(
        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
        data_raw=df,
        target_variable="Churn",
        max_retries=3,
        retry_count=0
    )

    engineered_data = feature_agent.get_data_engineered()
    response = feature_agent.get_response()
    ```

    Returns
    -------
    FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
        A feature engineering agent implemented as a compiled state graph.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="feature_engineer.py",
        function_name="feature_engineer",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # The parameter dict is forwarded verbatim (as keyword arguments) to
        # make_feature_engineering_agent(), so its keys must match that
        # function's parameter names.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Create the compiled graph for the feature engineering agent.
        Running this method will reset the response to None.
        """
        self.response = None
        return make_feature_engineering_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values. They are merged into the
            stored parameters before the graph is rebuilt, which also resets
            the stored response to None.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        This is a coroutine and must be awaited; the compiled graph's
        ainvoke() is awaited so that 'response' holds the finished result
        rather than a pending coroutine object.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to ainvoke().

        Returns
        -------
        None
        """
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering agent.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to invoke().

        Returns
        -------
        None
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Retrieves the agent's workflow summary.

        The summary is built from the JSON content of the last entry in the
        response's "messages" list.

        Parameters
        ----------
        markdown : bool, optional
            If True, wraps the summary in an IPython Markdown object.

        Returns
        -------
        str, Markdown or None
            The summary, or None if there is no response or no messages.
        """
        if self.response and self.response.get("messages"):
            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
            if markdown:
                return Markdown(summary)
            return summary
        return None

    def get_log_summary(self, markdown=False):
        """
        Retrieves a summary of where the generated function was saved.

        Parameters
        ----------
        markdown : bool, optional
            If True, wraps the summary in an IPython Markdown object.

        Returns
        -------
        str, Markdown or None
            The summary, or None if there is no response or the response has
            no 'feature_engineer_function_path' value.
        """
        if self.response:
            if self.response.get('feature_engineer_function_path'):
                log_details = f"""
## Featuring Engineering Agent Log Summary:

Function Path: {self.response.get('feature_engineer_function_path')}

Function Name: {self.response.get('feature_engineer_function_name')}
                """
                if markdown:
                    return Markdown(log_details)
                return log_details
        return None

    def get_data_engineered(self):
        """
        Retrieves the engineered data stored after running invoke/ainvoke.

        Returns
        -------
        pd.DataFrame or None
            The engineered dataset as a pandas DataFrame.
        """
        if self.response and "data_engineered" in self.response:
            return pd.DataFrame(self.response["data_engineered"])
        return None

    def get_data_raw(self):
        """
        Retrieves the raw data.

        Returns
        -------
        pd.DataFrame or None
            The raw dataset as a pandas DataFrame if available.
        """
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_feature_engineer_function(self, markdown=False):
        """
        Retrieves the feature engineering function generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the function in Markdown code block format.

        Returns
        -------
        str or None
            The Python function code, or None if unavailable.
        """
        if self.response and "feature_engineer_function" in self.response:
            code = self.response["feature_engineer_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_feature_engineering_steps(self, markdown=False):
        """
        Retrieves the agent's recommended feature engineering steps.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the steps in Markdown format.

        Returns
        -------
        str or None
            The recommended steps, or None if not available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None
|
386
|
+
|
387
|
+
|
388
|
+
|
389
|
+
|
36
390
|
# * Feature Engineering Agent
|
37
391
|
|
38
392
|
def make_feature_engineering_agent(
|
@@ -41,6 +395,7 @@ def make_feature_engineering_agent(
|
|
41
395
|
log=False,
|
42
396
|
log_path=None,
|
43
397
|
file_name="feature_engineer.py",
|
398
|
+
function_name="feature_engineer",
|
44
399
|
overwrite = True,
|
45
400
|
human_in_the_loop=False,
|
46
401
|
bypass_recommended_steps=False,
|
@@ -82,6 +437,8 @@ def make_feature_engineering_agent(
|
|
82
437
|
The path to the directory where the log files should be stored. Defaults to "logs/".
|
83
438
|
file_name : str, optional
|
84
439
|
The name of the file to save the log to. Defaults to "feature_engineer.py".
|
440
|
+
function_name : str, optional
|
441
|
+
The name of the function that will be generated. Defaults to "feature_engineer".
|
85
442
|
overwrite : bool, optional
|
86
443
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
87
444
|
Defaults to True.
|
@@ -122,6 +479,11 @@ def make_feature_engineering_agent(
|
|
122
479
|
The feature engineering agent as a state graph.
|
123
480
|
"""
|
124
481
|
llm = model
|
482
|
+
|
483
|
+
# Human in the loop requires recommended steps
|
484
|
+
if bypass_recommended_steps and human_in_the_loop:
|
485
|
+
bypass_recommended_steps = False
|
486
|
+
print("Bypass recommended steps set to False to enable human in the loop.")
|
125
487
|
|
126
488
|
# Setup Log Directory
|
127
489
|
if log:
|
@@ -141,6 +503,7 @@ def make_feature_engineering_agent(
|
|
141
503
|
all_datasets_summary: str
|
142
504
|
feature_engineer_function: str
|
143
505
|
feature_engineer_function_path: str
|
506
|
+
feature_engineer_file_name: str
|
144
507
|
feature_engineer_function_name: str
|
145
508
|
feature_engineer_error: str
|
146
509
|
max_retries: int
|
@@ -194,7 +557,7 @@ def make_feature_engineering_agent(
|
|
194
557
|
Below are summaries of all datasets provided:
|
195
558
|
{all_datasets_summary}
|
196
559
|
|
197
|
-
Return
|
560
|
+
Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
|
198
561
|
|
199
562
|
Avoid these:
|
200
563
|
1. Do not include steps to save files.
|
@@ -218,19 +581,36 @@ def make_feature_engineering_agent(
|
|
218
581
|
})
|
219
582
|
|
220
583
|
return {
|
221
|
-
"recommended_steps": "
|
584
|
+
"recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
|
222
585
|
"all_datasets_summary": all_datasets_summary_str
|
223
586
|
}
|
224
587
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
588
|
+
# Human Review
|
589
|
+
|
590
|
+
prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
591
|
+
|
592
|
+
if not bypass_explain_code:
|
593
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
|
594
|
+
return node_func_human_review(
|
595
|
+
state=state,
|
596
|
+
prompt_text=prompt_text_human_review,
|
597
|
+
yes_goto= 'explain_feature_engineering_code',
|
598
|
+
no_goto="recommend_feature_engineering_steps",
|
599
|
+
user_instructions_key="user_instructions",
|
600
|
+
recommended_steps_key="recommended_steps",
|
601
|
+
code_snippet_key="feature_engineer_function",
|
602
|
+
)
|
603
|
+
else:
|
604
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
|
605
|
+
return node_func_human_review(
|
606
|
+
state=state,
|
607
|
+
prompt_text=prompt_text_human_review,
|
608
|
+
yes_goto= '__end__',
|
609
|
+
no_goto="recommend_feature_engineering_steps",
|
610
|
+
user_instructions_key="user_instructions",
|
611
|
+
recommended_steps_key="recommended_steps",
|
612
|
+
code_snippet_key="feature_engineer_function",
|
613
|
+
)
|
234
614
|
|
235
615
|
def create_feature_engineering_code(state: GraphState):
|
236
616
|
if bypass_recommended_steps:
|
@@ -250,8 +630,7 @@ def make_feature_engineering_agent(
|
|
250
630
|
|
251
631
|
feature_engineering_prompt = PromptTemplate(
|
252
632
|
template="""
|
253
|
-
|
254
|
-
You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
|
633
|
+
You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
|
255
634
|
|
256
635
|
Recommended Steps:
|
257
636
|
{recommended_steps}
|
@@ -265,11 +644,11 @@ def make_feature_engineering_agent(
|
|
265
644
|
|
266
645
|
You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
|
267
646
|
|
268
|
-
Return Python code in ```python``` format with a single function definition,
|
647
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
|
269
648
|
|
270
649
|
Return code to provide the feature engineering function:
|
271
650
|
|
272
|
-
def
|
651
|
+
def {function_name}(data_raw):
|
273
652
|
import pandas as pd
|
274
653
|
import numpy as np
|
275
654
|
...
|
@@ -292,7 +671,7 @@ def make_feature_engineering_agent(
|
|
292
671
|
|
293
672
|
|
294
673
|
""",
|
295
|
-
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
|
674
|
+
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
|
296
675
|
)
|
297
676
|
|
298
677
|
feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
|
@@ -301,6 +680,7 @@ def make_feature_engineering_agent(
|
|
301
680
|
"recommended_steps": state.get("recommended_steps"),
|
302
681
|
"target_variable": state.get("target_variable"),
|
303
682
|
"all_datasets_summary": all_datasets_summary_str,
|
683
|
+
"function_name": function_name
|
304
684
|
})
|
305
685
|
|
306
686
|
response = relocate_imports_inside_function(response)
|
@@ -318,12 +698,11 @@ def make_feature_engineering_agent(
|
|
318
698
|
return {
|
319
699
|
"feature_engineer_function": response,
|
320
700
|
"feature_engineer_function_path": file_path,
|
321
|
-
"
|
701
|
+
"feature_engineer_file_name": file_name_2,
|
702
|
+
"feature_engineer_function_name": function_name,
|
322
703
|
"all_datasets_summary": all_datasets_summary_str
|
323
704
|
}
|
324
705
|
|
325
|
-
|
326
|
-
|
327
706
|
def execute_feature_engineering_code(state):
|
328
707
|
return node_func_execute_agent_code_on_data(
|
329
708
|
state=state,
|
@@ -331,7 +710,7 @@ def make_feature_engineering_agent(
|
|
331
710
|
result_key="data_engineered",
|
332
711
|
error_key="feature_engineer_error",
|
333
712
|
code_snippet_key="feature_engineer_function",
|
334
|
-
agent_function_name="
|
713
|
+
agent_function_name=state.get("feature_engineer_function_name"),
|
335
714
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
336
715
|
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
337
716
|
error_message_prefix="An error occurred during feature engineering: "
|
@@ -339,11 +718,13 @@ def make_feature_engineering_agent(
|
|
339
718
|
|
340
719
|
def fix_feature_engineering_code(state: GraphState):
|
341
720
|
feature_engineer_prompt = """
|
342
|
-
You are a Feature Engineering Agent. Your job is to fix the
|
721
|
+
You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
|
722
|
+
|
723
|
+
Provide only the corrected function definition for {function_name}().
|
343
724
|
|
344
|
-
|
725
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
345
726
|
|
346
|
-
|
727
|
+
This is the broken code (please fix):
|
347
728
|
{code_snippet}
|
348
729
|
|
349
730
|
Last Known Error:
|
@@ -359,23 +740,25 @@ def make_feature_engineering_agent(
|
|
359
740
|
agent_name=AGENT_NAME,
|
360
741
|
log=log,
|
361
742
|
file_path=state.get("feature_engineer_function_path"),
|
743
|
+
function_name=state.get("feature_engineer_function_name"),
|
362
744
|
)
|
363
745
|
|
364
|
-
|
365
|
-
|
746
|
+
# Final reporting node
|
747
|
+
def report_agent_outputs(state: GraphState):
|
748
|
+
return node_func_report_agent_outputs(
|
366
749
|
state=state,
|
367
|
-
|
750
|
+
keys_to_include=[
|
751
|
+
"recommended_steps",
|
752
|
+
"feature_engineer_function",
|
753
|
+
"feature_engineer_function_path",
|
754
|
+
"feature_engineer_function_name",
|
755
|
+
"feature_engineer_error",
|
756
|
+
],
|
368
757
|
result_key="messages",
|
369
|
-
error_key="feature_engineer_error",
|
370
|
-
llm=llm,
|
371
758
|
role=AGENT_NAME,
|
372
|
-
|
373
|
-
Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
|
374
|
-
""",
|
375
|
-
success_prefix="# Feature Engineering Agent:\n\n ",
|
376
|
-
error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
|
759
|
+
custom_title="Feature Engineering Agent Outputs"
|
377
760
|
)
|
378
|
-
|
761
|
+
|
379
762
|
# Create the graph
|
380
763
|
node_functions = {
|
381
764
|
"recommend_feature_engineering_steps": recommend_feature_engineering_steps,
|
@@ -383,7 +766,7 @@ def make_feature_engineering_agent(
|
|
383
766
|
"create_feature_engineering_code": create_feature_engineering_code,
|
384
767
|
"execute_feature_engineering_code": execute_feature_engineering_code,
|
385
768
|
"fix_feature_engineering_code": fix_feature_engineering_code,
|
386
|
-
"
|
769
|
+
"report_agent_outputs": report_agent_outputs,
|
387
770
|
}
|
388
771
|
|
389
772
|
app = create_coding_agent_graph(
|
@@ -393,11 +776,13 @@ def make_feature_engineering_agent(
|
|
393
776
|
create_code_node_name="create_feature_engineering_code",
|
394
777
|
execute_code_node_name="execute_feature_engineering_code",
|
395
778
|
fix_code_node_name="fix_feature_engineering_code",
|
396
|
-
explain_code_node_name="
|
779
|
+
explain_code_node_name="report_agent_outputs",
|
397
780
|
error_key="feature_engineer_error",
|
781
|
+
max_retries_key = "max_retries",
|
782
|
+
retry_count_key = "retry_count",
|
398
783
|
human_in_the_loop=human_in_the_loop,
|
399
784
|
human_review_node_name="human_review",
|
400
|
-
checkpointer=MemorySaver()
|
785
|
+
checkpointer=MemorySaver(),
|
401
786
|
bypass_recommended_steps=bypass_recommended_steps,
|
402
787
|
bypass_explain_code=bypass_explain_code,
|
403
788
|
)
|