ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -5
- ai_data_science_team/agents/data_cleaning_agent.py +268 -116
- ai_data_science_team/agents/data_visualization_agent.py +470 -41
- ai_data_science_team/agents/data_wrangling_agent.py +471 -31
- ai_data_science_team/agents/feature_engineering_agent.py +426 -41
- ai_data_science_team/agents/sql_database_agent.py +458 -58
- ai_data_science_team/ml_agents/__init__.py +1 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +3 -1
- ai_data_science_team/templates/agent_templates.py +319 -43
- ai_data_science_team/tools/metadata.py +94 -62
- ai_data_science_team/tools/regex.py +86 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
- ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
- ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,27 @@ from langgraph.types import Command
|
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
16
|
import os
|
17
|
-
import
|
17
|
+
import json
|
18
18
|
import pandas as pd
|
19
19
|
|
20
|
+
from IPython.display import Markdown
|
21
|
+
|
20
22
|
from ai_data_science_team.templates import(
|
21
23
|
node_func_execute_agent_code_on_data,
|
22
24
|
node_func_human_review,
|
23
25
|
node_func_fix_agent_code,
|
24
|
-
|
25
|
-
create_coding_agent_graph
|
26
|
+
node_func_report_agent_outputs,
|
27
|
+
create_coding_agent_graph,
|
28
|
+
BaseAgent,
|
26
29
|
)
|
27
30
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import
|
31
|
+
from ai_data_science_team.tools.regex import (
|
32
|
+
relocate_imports_inside_function,
|
33
|
+
add_comments_to_top,
|
34
|
+
format_agent_name,
|
35
|
+
format_recommended_steps,
|
36
|
+
get_generic_summary,
|
37
|
+
)
|
29
38
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
39
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
40
|
|
@@ -33,6 +42,351 @@ from ai_data_science_team.tools.logging import log_ai_function
|
|
33
42
|
AGENT_NAME = "feature_engineering_agent"
|
34
43
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
44
|
|
45
|
+
# Class
|
46
|
+
|
47
|
+
class FeatureEngineeringAgent(BaseAgent):
    """
    Creates a feature engineering agent that can process datasets based on user-defined instructions or
    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
    and logs the process, including code and errors. It is designed to facilitate reproducible and
    customizable feature engineering workflows.

    The agent can perform the following default feature engineering steps unless instructed otherwise:
    - Convert features to appropriate data types
    - Remove features that have unique values for each row
    - Remove constant features
    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
    - One-hot-encode categorical variables
    - Convert booleans to integer (1/0)
    - Create datetime-based features (if applicable)
    - Handle target variable encoding if specified
    - Any user-provided instructions to add, remove, or modify steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the feature engineering function.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
    function_name : str, optional
        Name of the generated feature engineering function. Defaults to "feature_engineer".
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of feature engineering instructions. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the default recommended steps. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the step that provides code explanations. Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Engineers features from the provided dataset asynchronously based on user instructions.
        This is a coroutine and must be awaited.
    invoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0
    )
        Engineers features from the provided dataset synchronously based on user instructions.
    get_workflow_summary()
        Retrieves a summary of the agent's workflow.
    get_log_summary()
        Retrieves a summary of logged operations if logging is enabled.
    get_data_engineered()
        Retrieves the feature-engineered dataset as a pandas DataFrame.
    get_data_raw()
        Retrieves the raw dataset as a pandas DataFrame.
    get_feature_engineer_function()
        Retrieves the generated Python function used for feature engineering.
    get_recommended_feature_engineering_steps()
        Retrieves the agent's recommended feature engineering steps.
    get_response()
        Returns the response from the agent as a dictionary.
    show()
        Displays the agent's mermaid diagram.

    Examples
    --------
    ```python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import FeatureEngineeringAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_agent = FeatureEngineeringAgent(
        model=llm,
        n_samples=30,
        log=True,
        log_path="logs",
        human_in_the_loop=True
    )

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    feature_agent.invoke_agent(
        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
        data_raw=df,
        target_variable="Churn",
        max_retries=3,
        retry_count=0
    )

    engineered_data = feature_agent.get_data_engineered()
    response = feature_agent.get_response()
    ```

    Returns
    -------
    FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
        A feature engineering agent implemented as a compiled state graph.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="feature_engineer.py",
        function_name="feature_engineer",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # The parameters are kept verbatim so update_params() can rebuild the
        # graph later with the same keyword set.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Create the compiled graph for the feature engineering agent.
        Running this method will reset the response to None.
        """
        # A rebuilt graph invalidates any previously stored run output.
        self.response = None
        return make_feature_engineering_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values; they are merged into the
            stored parameters before the graph is rebuilt.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        This method is a coroutine: call it as ``await agent.ainvoke_agent(...)``.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to ainvoke().

        Returns
        -------
        None
        """
        # ainvoke() must be awaited; storing its un-awaited result would leave
        # a coroutine object in self.response instead of the graph's output.
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering agent.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to invoke().

        Returns
        -------
        None
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Retrieves the agent's workflow summary from the last message of the response.

        Parameters
        ----------
        markdown : bool, optional
            If True, wraps the summary in an IPython Markdown object.

        Returns
        -------
        str, Markdown or None
            The summary, or None when there is no response or it has no messages.
        """
        if not (self.response and self.response.get("messages")):
            return None
        # The last message's content is parsed as JSON before summarizing.
        last_message = self.response.get("messages")[-1]
        summary = get_generic_summary(json.loads(last_message.content))
        return Markdown(summary) if markdown else summary

    def get_log_summary(self, markdown=False):
        """
        Returns a summary of where the generated function was saved.

        Parameters
        ----------
        markdown : bool, optional
            If True, wraps the summary in an IPython Markdown object.

        Returns
        -------
        str, Markdown or None
            The log summary, or None when there is no response or the response
            holds no function path.
        """
        if not self.response:
            return None
        if not self.response.get('feature_engineer_function_path'):
            return None
        log_details = f"""
## Featuring Engineering Agent Log Summary:

Function Path: {self.response.get('feature_engineer_function_path')}

Function Name: {self.response.get('feature_engineer_function_name')}
"""
        return Markdown(log_details) if markdown else log_details

    def get_data_engineered(self):
        """
        Retrieves the engineered data stored after running invoke/ainvoke.

        Returns
        -------
        pd.DataFrame or None
            The engineered dataset as a pandas DataFrame.
        """
        if self.response and "data_engineered" in self.response:
            return pd.DataFrame(self.response["data_engineered"])
        return None

    def get_data_raw(self):
        """
        Retrieves the raw data.

        Returns
        -------
        pd.DataFrame or None
            The raw dataset as a pandas DataFrame if available.
        """
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_feature_engineer_function(self, markdown=False):
        """
        Retrieves the feature engineering function generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the function in Markdown code block format.

        Returns
        -------
        str or None
            The Python function code, or None if unavailable.
        """
        if self.response and "feature_engineer_function" in self.response:
            code = self.response["feature_engineer_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_feature_engineering_steps(self, markdown=False):
        """
        Retrieves the agent's recommended feature engineering steps.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the steps in Markdown format.

        Returns
        -------
        str or None
            The recommended steps, or None if not available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None
|
386
|
+
|
387
|
+
|
388
|
+
|
389
|
+
|
36
390
|
# * Feature Engineering Agent
|
37
391
|
|
38
392
|
def make_feature_engineering_agent(
|
@@ -41,6 +395,7 @@ def make_feature_engineering_agent(
|
|
41
395
|
log=False,
|
42
396
|
log_path=None,
|
43
397
|
file_name="feature_engineer.py",
|
398
|
+
function_name="feature_engineer",
|
44
399
|
overwrite = True,
|
45
400
|
human_in_the_loop=False,
|
46
401
|
bypass_recommended_steps=False,
|
@@ -82,6 +437,8 @@ def make_feature_engineering_agent(
|
|
82
437
|
The path to the directory where the log files should be stored. Defaults to "logs/".
|
83
438
|
file_name : str, optional
|
84
439
|
The name of the file to save the log to. Defaults to "feature_engineer.py".
|
440
|
+
function_name : str, optional
|
441
|
+
The name of the function that will be generated. Defaults to "feature_engineer".
|
85
442
|
overwrite : bool, optional
|
86
443
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
87
444
|
Defaults to True.
|
@@ -122,6 +479,11 @@ def make_feature_engineering_agent(
|
|
122
479
|
The feature engineering agent as a state graph.
|
123
480
|
"""
|
124
481
|
llm = model
|
482
|
+
|
483
|
+
# Human in the loop requires recommended steps
|
484
|
+
if bypass_recommended_steps and human_in_the_loop:
|
485
|
+
bypass_recommended_steps = False
|
486
|
+
print("Bypass recommended steps set to False to enable human in the loop.")
|
125
487
|
|
126
488
|
# Setup Log Directory
|
127
489
|
if log:
|
@@ -141,6 +503,7 @@ def make_feature_engineering_agent(
|
|
141
503
|
all_datasets_summary: str
|
142
504
|
feature_engineer_function: str
|
143
505
|
feature_engineer_function_path: str
|
506
|
+
feature_engineer_file_name: str
|
144
507
|
feature_engineer_function_name: str
|
145
508
|
feature_engineer_error: str
|
146
509
|
max_retries: int
|
@@ -194,7 +557,7 @@ def make_feature_engineering_agent(
|
|
194
557
|
Below are summaries of all datasets provided:
|
195
558
|
{all_datasets_summary}
|
196
559
|
|
197
|
-
Return
|
560
|
+
Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
|
198
561
|
|
199
562
|
Avoid these:
|
200
563
|
1. Do not include steps to save files.
|
@@ -218,19 +581,36 @@ def make_feature_engineering_agent(
|
|
218
581
|
})
|
219
582
|
|
220
583
|
return {
|
221
|
-
"recommended_steps": "
|
584
|
+
"recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
|
222
585
|
"all_datasets_summary": all_datasets_summary_str
|
223
586
|
}
|
224
587
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
588
|
+
# Human Review
|
589
|
+
|
590
|
+
prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
591
|
+
|
592
|
+
if not bypass_explain_code:
|
593
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
|
594
|
+
return node_func_human_review(
|
595
|
+
state=state,
|
596
|
+
prompt_text=prompt_text_human_review,
|
597
|
+
yes_goto= 'explain_feature_engineering_code',
|
598
|
+
no_goto="recommend_feature_engineering_steps",
|
599
|
+
user_instructions_key="user_instructions",
|
600
|
+
recommended_steps_key="recommended_steps",
|
601
|
+
code_snippet_key="feature_engineer_function",
|
602
|
+
)
|
603
|
+
else:
|
604
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
|
605
|
+
return node_func_human_review(
|
606
|
+
state=state,
|
607
|
+
prompt_text=prompt_text_human_review,
|
608
|
+
yes_goto= '__end__',
|
609
|
+
no_goto="recommend_feature_engineering_steps",
|
610
|
+
user_instructions_key="user_instructions",
|
611
|
+
recommended_steps_key="recommended_steps",
|
612
|
+
code_snippet_key="feature_engineer_function",
|
613
|
+
)
|
234
614
|
|
235
615
|
def create_feature_engineering_code(state: GraphState):
|
236
616
|
if bypass_recommended_steps:
|
@@ -250,8 +630,7 @@ def make_feature_engineering_agent(
|
|
250
630
|
|
251
631
|
feature_engineering_prompt = PromptTemplate(
|
252
632
|
template="""
|
253
|
-
|
254
|
-
You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
|
633
|
+
You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
|
255
634
|
|
256
635
|
Recommended Steps:
|
257
636
|
{recommended_steps}
|
@@ -265,11 +644,11 @@ def make_feature_engineering_agent(
|
|
265
644
|
|
266
645
|
You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
|
267
646
|
|
268
|
-
Return Python code in ```python``` format with a single function definition,
|
647
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
|
269
648
|
|
270
649
|
Return code to provide the feature engineering function:
|
271
650
|
|
272
|
-
def
|
651
|
+
def {function_name}(data_raw):
|
273
652
|
import pandas as pd
|
274
653
|
import numpy as np
|
275
654
|
...
|
@@ -292,7 +671,7 @@ def make_feature_engineering_agent(
|
|
292
671
|
|
293
672
|
|
294
673
|
""",
|
295
|
-
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
|
674
|
+
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
|
296
675
|
)
|
297
676
|
|
298
677
|
feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
|
@@ -301,6 +680,7 @@ def make_feature_engineering_agent(
|
|
301
680
|
"recommended_steps": state.get("recommended_steps"),
|
302
681
|
"target_variable": state.get("target_variable"),
|
303
682
|
"all_datasets_summary": all_datasets_summary_str,
|
683
|
+
"function_name": function_name
|
304
684
|
})
|
305
685
|
|
306
686
|
response = relocate_imports_inside_function(response)
|
@@ -318,12 +698,11 @@ def make_feature_engineering_agent(
|
|
318
698
|
return {
|
319
699
|
"feature_engineer_function": response,
|
320
700
|
"feature_engineer_function_path": file_path,
|
321
|
-
"
|
701
|
+
"feature_engineer_file_name": file_name_2,
|
702
|
+
"feature_engineer_function_name": function_name,
|
322
703
|
"all_datasets_summary": all_datasets_summary_str
|
323
704
|
}
|
324
705
|
|
325
|
-
|
326
|
-
|
327
706
|
def execute_feature_engineering_code(state):
|
328
707
|
return node_func_execute_agent_code_on_data(
|
329
708
|
state=state,
|
@@ -331,7 +710,7 @@ def make_feature_engineering_agent(
|
|
331
710
|
result_key="data_engineered",
|
332
711
|
error_key="feature_engineer_error",
|
333
712
|
code_snippet_key="feature_engineer_function",
|
334
|
-
agent_function_name="
|
713
|
+
agent_function_name=state.get("feature_engineer_function_name"),
|
335
714
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
336
715
|
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
337
716
|
error_message_prefix="An error occurred during feature engineering: "
|
@@ -339,11 +718,13 @@ def make_feature_engineering_agent(
|
|
339
718
|
|
340
719
|
def fix_feature_engineering_code(state: GraphState):
|
341
720
|
feature_engineer_prompt = """
|
342
|
-
You are a Feature Engineering Agent. Your job is to fix the
|
721
|
+
You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
|
722
|
+
|
723
|
+
Provide only the corrected function definition for {function_name}().
|
343
724
|
|
344
|
-
|
725
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
345
726
|
|
346
|
-
|
727
|
+
This is the broken code (please fix):
|
347
728
|
{code_snippet}
|
348
729
|
|
349
730
|
Last Known Error:
|
@@ -359,23 +740,25 @@ def make_feature_engineering_agent(
|
|
359
740
|
agent_name=AGENT_NAME,
|
360
741
|
log=log,
|
361
742
|
file_path=state.get("feature_engineer_function_path"),
|
743
|
+
function_name=state.get("feature_engineer_function_name"),
|
362
744
|
)
|
363
745
|
|
364
|
-
|
365
|
-
|
746
|
+
# Final reporting node
|
747
|
+
def report_agent_outputs(state: GraphState):
|
748
|
+
return node_func_report_agent_outputs(
|
366
749
|
state=state,
|
367
|
-
|
750
|
+
keys_to_include=[
|
751
|
+
"recommended_steps",
|
752
|
+
"feature_engineer_function",
|
753
|
+
"feature_engineer_function_path",
|
754
|
+
"feature_engineer_function_name",
|
755
|
+
"feature_engineer_error",
|
756
|
+
],
|
368
757
|
result_key="messages",
|
369
|
-
error_key="feature_engineer_error",
|
370
|
-
llm=llm,
|
371
758
|
role=AGENT_NAME,
|
372
|
-
|
373
|
-
Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
|
374
|
-
""",
|
375
|
-
success_prefix="# Feature Engineering Agent:\n\n ",
|
376
|
-
error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
|
759
|
+
custom_title="Feature Engineering Agent Outputs"
|
377
760
|
)
|
378
|
-
|
761
|
+
|
379
762
|
# Create the graph
|
380
763
|
node_functions = {
|
381
764
|
"recommend_feature_engineering_steps": recommend_feature_engineering_steps,
|
@@ -383,7 +766,7 @@ def make_feature_engineering_agent(
|
|
383
766
|
"create_feature_engineering_code": create_feature_engineering_code,
|
384
767
|
"execute_feature_engineering_code": execute_feature_engineering_code,
|
385
768
|
"fix_feature_engineering_code": fix_feature_engineering_code,
|
386
|
-
"
|
769
|
+
"report_agent_outputs": report_agent_outputs,
|
387
770
|
}
|
388
771
|
|
389
772
|
app = create_coding_agent_graph(
|
@@ -393,11 +776,13 @@ def make_feature_engineering_agent(
|
|
393
776
|
create_code_node_name="create_feature_engineering_code",
|
394
777
|
execute_code_node_name="execute_feature_engineering_code",
|
395
778
|
fix_code_node_name="fix_feature_engineering_code",
|
396
|
-
explain_code_node_name="
|
779
|
+
explain_code_node_name="report_agent_outputs",
|
397
780
|
error_key="feature_engineer_error",
|
781
|
+
max_retries_key = "max_retries",
|
782
|
+
retry_count_key = "retry_count",
|
398
783
|
human_in_the_loop=human_in_the_loop,
|
399
784
|
human_review_node_name="human_review",
|
400
|
-
checkpointer=MemorySaver()
|
785
|
+
checkpointer=MemorySaver(),
|
401
786
|
bypass_recommended_steps=bypass_recommended_steps,
|
402
787
|
bypass_explain_code=bypass_explain_code,
|
403
788
|
)
|