ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -4
- ai_data_science_team/agents/data_cleaning_agent.py +225 -84
- ai_data_science_team/agents/data_visualization_agent.py +460 -27
- ai_data_science_team/agents/data_wrangling_agent.py +455 -16
- ai_data_science_team/agents/feature_engineering_agent.py +429 -25
- ai_data_science_team/agents/sql_database_agent.py +367 -21
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +286 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +2 -1
- ai_data_science_team/templates/agent_templates.py +247 -42
- ai_data_science_team/tools/regex.py +28 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/METADATA +76 -28
- ai_data_science_team-0.0.0.9008.dist-info/RECORD +26 -0
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,25 @@ from langgraph.types import Command
|
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
16
|
import os
|
17
|
-
import io
|
18
17
|
import pandas as pd
|
19
18
|
|
19
|
+
from IPython.display import Markdown
|
20
|
+
|
20
21
|
from ai_data_science_team.templates import(
|
21
22
|
node_func_execute_agent_code_on_data,
|
22
23
|
node_func_human_review,
|
23
24
|
node_func_fix_agent_code,
|
24
25
|
node_func_explain_agent_code,
|
25
|
-
create_coding_agent_graph
|
26
|
+
create_coding_agent_graph,
|
27
|
+
BaseAgent,
|
26
28
|
)
|
27
29
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import
|
30
|
+
from ai_data_science_team.tools.regex import (
|
31
|
+
relocate_imports_inside_function,
|
32
|
+
add_comments_to_top,
|
33
|
+
format_agent_name,
|
34
|
+
format_recommended_steps
|
35
|
+
)
|
29
36
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
37
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
38
|
|
@@ -33,6 +40,372 @@ from ai_data_science_team.tools.logging import log_ai_function
|
|
33
40
|
AGENT_NAME = "feature_engineering_agent"
|
34
41
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
42
|
|
43
|
+
# Class
|
44
|
+
|
45
|
+
class FeatureEngineeringAgent(BaseAgent):
|
46
|
+
"""
|
47
|
+
Creates a feature engineering agent that can process datasets based on user-defined instructions or
|
48
|
+
default feature engineering steps. The agent generates a Python function to engineer features, executes it,
|
49
|
+
and logs the process, including code and errors. It is designed to facilitate reproducible and
|
50
|
+
customizable feature engineering workflows.
|
51
|
+
|
52
|
+
The agent can perform the following default feature engineering steps unless instructed otherwise:
|
53
|
+
- Convert features to appropriate data types
|
54
|
+
- Remove features that have unique values for each row
|
55
|
+
- Remove constant features
|
56
|
+
- Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
|
57
|
+
- One-hot-encode categorical variables
|
58
|
+
- Convert booleans to integer (1/0)
|
59
|
+
- Create datetime-based features (if applicable)
|
60
|
+
- Handle target variable encoding if specified
|
61
|
+
- Any user-provided instructions to add, remove, or modify steps
|
62
|
+
|
63
|
+
Parameters
|
64
|
+
----------
|
65
|
+
model : langchain.llms.base.LLM
|
66
|
+
The language model used to generate the feature engineering function.
|
67
|
+
n_samples : int, optional
|
68
|
+
Number of samples used when summarizing the dataset. Defaults to 30.
|
69
|
+
log : bool, optional
|
70
|
+
Whether to log the generated code and errors. Defaults to False.
|
71
|
+
log_path : str, optional
|
72
|
+
Directory path for storing log files. Defaults to None.
|
73
|
+
file_name : str, optional
|
74
|
+
Name of the file for saving the generated response. Defaults to "feature_engineer.py".
|
75
|
+
function_name : str, optional
|
76
|
+
Name of the generated feature engineering function. Defaults to "feature_engineer".
|
77
|
+
overwrite : bool, optional
|
78
|
+
Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
|
79
|
+
human_in_the_loop : bool, optional
|
80
|
+
Enables user review of feature engineering instructions. Defaults to False.
|
81
|
+
bypass_recommended_steps : bool, optional
|
82
|
+
If True, skips the default recommended steps. Defaults to False.
|
83
|
+
bypass_explain_code : bool, optional
|
84
|
+
If True, skips the step that provides code explanations. Defaults to False.
|
85
|
+
|
86
|
+
Methods
|
87
|
+
-------
|
88
|
+
update_params(**kwargs)
|
89
|
+
Updates the agent's parameters and rebuilds the compiled state graph.
|
90
|
+
ainvoke_agent(
|
91
|
+
user_instructions: str,
|
92
|
+
data_raw: pd.DataFrame,
|
93
|
+
target_variable: str = None,
|
94
|
+
max_retries=3,
|
95
|
+
retry_count=0
|
96
|
+
)
|
97
|
+
Engineers features from the provided dataset asynchronously based on user instructions.
|
98
|
+
invoke_agent(
|
99
|
+
user_instructions: str,
|
100
|
+
data_raw: pd.DataFrame,
|
101
|
+
target_variable: str = None,
|
102
|
+
max_retries=3,
|
103
|
+
retry_count=0
|
104
|
+
)
|
105
|
+
Engineers features from the provided dataset synchronously based on user instructions.
|
106
|
+
explain_feature_engineering_steps()
|
107
|
+
Returns an explanation of the feature engineering steps performed by the agent.
|
108
|
+
get_log_summary()
|
109
|
+
Retrieves a summary of logged operations if logging is enabled.
|
110
|
+
get_data_engineered()
|
111
|
+
Retrieves the feature-engineered dataset as a pandas DataFrame.
|
112
|
+
get_data_raw()
|
113
|
+
Retrieves the raw dataset as a pandas DataFrame.
|
114
|
+
get_feature_engineer_function()
|
115
|
+
Retrieves the generated Python function used for feature engineering.
|
116
|
+
get_recommended_feature_engineering_steps()
|
117
|
+
Retrieves the agent's recommended feature engineering steps.
|
118
|
+
get_response()
|
119
|
+
Returns the response from the agent as a dictionary.
|
120
|
+
show()
|
121
|
+
Displays the agent's mermaid diagram.
|
122
|
+
|
123
|
+
Examples
|
124
|
+
--------
|
125
|
+
```python
|
126
|
+
import pandas as pd
|
127
|
+
from langchain_openai import ChatOpenAI
|
128
|
+
from ai_data_science_team.agents import FeatureEngineeringAgent
|
129
|
+
|
130
|
+
llm = ChatOpenAI(model="gpt-4o-mini")
|
131
|
+
|
132
|
+
feature_agent = FeatureEngineeringAgent(
|
133
|
+
model=llm,
|
134
|
+
n_samples=30,
|
135
|
+
log=True,
|
136
|
+
log_path="logs",
|
137
|
+
human_in_the_loop=True
|
138
|
+
)
|
139
|
+
|
140
|
+
df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
|
141
|
+
|
142
|
+
feature_agent.invoke_agent(
|
143
|
+
user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
|
144
|
+
data_raw=df,
|
145
|
+
target_variable="Churn",
|
146
|
+
max_retries=3,
|
147
|
+
retry_count=0
|
148
|
+
)
|
149
|
+
|
150
|
+
engineered_data = feature_agent.get_data_engineered()
|
151
|
+
response = feature_agent.get_response()
|
152
|
+
```
|
153
|
+
|
154
|
+
Returns
|
155
|
+
-------
|
156
|
+
FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
|
157
|
+
A feature engineering agent implemented as a compiled state graph.
|
158
|
+
"""
|
159
|
+
|
160
|
+
def __init__(
|
161
|
+
self,
|
162
|
+
model,
|
163
|
+
n_samples=30,
|
164
|
+
log=False,
|
165
|
+
log_path=None,
|
166
|
+
file_name="feature_engineer.py",
|
167
|
+
function_name="feature_engineer",
|
168
|
+
overwrite=True,
|
169
|
+
human_in_the_loop=False,
|
170
|
+
bypass_recommended_steps=False,
|
171
|
+
bypass_explain_code=False
|
172
|
+
):
|
173
|
+
self._params = {
|
174
|
+
"model": model,
|
175
|
+
"n_samples": n_samples,
|
176
|
+
"log": log,
|
177
|
+
"log_path": log_path,
|
178
|
+
"file_name": file_name,
|
179
|
+
"function_name": function_name,
|
180
|
+
"overwrite": overwrite,
|
181
|
+
"human_in_the_loop": human_in_the_loop,
|
182
|
+
"bypass_recommended_steps": bypass_recommended_steps,
|
183
|
+
"bypass_explain_code": bypass_explain_code
|
184
|
+
}
|
185
|
+
self._compiled_graph = self._make_compiled_graph()
|
186
|
+
self.response = None
|
187
|
+
|
188
|
+
def _make_compiled_graph(self):
|
189
|
+
"""
|
190
|
+
Create the compiled graph for the feature engineering agent.
|
191
|
+
Running this method will reset the response to None.
|
192
|
+
"""
|
193
|
+
self.response = None
|
194
|
+
return make_feature_engineering_agent(**self._params)
|
195
|
+
|
196
|
+
def update_params(self, **kwargs):
|
197
|
+
"""
|
198
|
+
Updates the agent's parameters and rebuilds the compiled graph.
|
199
|
+
"""
|
200
|
+
for k, v in kwargs.items():
|
201
|
+
self._params[k] = v
|
202
|
+
self._compiled_graph = self._make_compiled_graph()
|
203
|
+
|
204
|
+
def ainvoke_agent(
|
205
|
+
self,
|
206
|
+
data_raw: pd.DataFrame,
|
207
|
+
user_instructions: str=None,
|
208
|
+
target_variable: str = None,
|
209
|
+
max_retries=3,
|
210
|
+
retry_count=0,
|
211
|
+
**kwargs
|
212
|
+
):
|
213
|
+
"""
|
214
|
+
Asynchronously engineers features for the provided dataset.
|
215
|
+
The response is stored in the 'response' attribute.
|
216
|
+
|
217
|
+
Parameters
|
218
|
+
----------
|
219
|
+
data_raw : pd.DataFrame
|
220
|
+
The raw dataset to be processed.
|
221
|
+
user_instructions : str, optional
|
222
|
+
Instructions for feature engineering.
|
223
|
+
target_variable : str, optional
|
224
|
+
The name of the target variable (if any).
|
225
|
+
max_retries : int
|
226
|
+
Maximum retry attempts.
|
227
|
+
retry_count : int
|
228
|
+
Current retry attempt count.
|
229
|
+
**kwargs
|
230
|
+
Additional keyword arguments to pass to ainvoke().
|
231
|
+
|
232
|
+
Returns
|
233
|
+
-------
|
234
|
+
None
|
235
|
+
"""
|
236
|
+
response = self._compiled_graph.ainvoke({
|
237
|
+
"user_instructions": user_instructions,
|
238
|
+
"data_raw": data_raw.to_dict(),
|
239
|
+
"target_variable": target_variable,
|
240
|
+
"max_retries": max_retries,
|
241
|
+
"retry_count": retry_count
|
242
|
+
}, **kwargs)
|
243
|
+
self.response = response
|
244
|
+
return None
|
245
|
+
|
246
|
+
def invoke_agent(
|
247
|
+
self,
|
248
|
+
data_raw: pd.DataFrame,
|
249
|
+
user_instructions: str=None,
|
250
|
+
target_variable: str = None,
|
251
|
+
max_retries=3,
|
252
|
+
retry_count=0,
|
253
|
+
**kwargs
|
254
|
+
):
|
255
|
+
"""
|
256
|
+
Synchronously engineers features for the provided dataset.
|
257
|
+
The response is stored in the 'response' attribute.
|
258
|
+
|
259
|
+
Parameters
|
260
|
+
----------
|
261
|
+
data_raw : pd.DataFrame
|
262
|
+
The raw dataset to be processed.
|
263
|
+
user_instructions : str, optional
|
264
|
+
Instructions for feature engineering agent.
|
265
|
+
target_variable : str, optional
|
266
|
+
The name of the target variable (if any).
|
267
|
+
max_retries : int
|
268
|
+
Maximum retry attempts.
|
269
|
+
retry_count : int
|
270
|
+
Current retry attempt count.
|
271
|
+
**kwargs
|
272
|
+
Additional keyword arguments to pass to invoke().
|
273
|
+
|
274
|
+
Returns
|
275
|
+
-------
|
276
|
+
None
|
277
|
+
"""
|
278
|
+
response = self._compiled_graph.invoke({
|
279
|
+
"user_instructions": user_instructions,
|
280
|
+
"data_raw": data_raw.to_dict(),
|
281
|
+
"target_variable": target_variable,
|
282
|
+
"max_retries": max_retries,
|
283
|
+
"retry_count": retry_count
|
284
|
+
}, **kwargs)
|
285
|
+
self.response = response
|
286
|
+
return None
|
287
|
+
|
288
|
+
def explain_feature_engineering_steps(self):
|
289
|
+
"""
|
290
|
+
Provides an explanation of the feature engineering steps performed by the agent.
|
291
|
+
|
292
|
+
Returns
|
293
|
+
-------
|
294
|
+
str or list
|
295
|
+
Explanation of the feature engineering steps.
|
296
|
+
"""
|
297
|
+
if self.response:
|
298
|
+
return self.response.get("messages", [])
|
299
|
+
return []
|
300
|
+
|
301
|
+
def get_log_summary(self, markdown=False):
|
302
|
+
"""
|
303
|
+
Returns a summary of the agent's logging (the path of the logged function file), if logging is enabled.
|
304
|
+
|
305
|
+
Parameters
|
306
|
+
----------
|
307
|
+
markdown : bool, optional
|
308
|
+
If True, returns Markdown-formatted output.
|
309
|
+
|
310
|
+
Returns
|
311
|
+
-------
|
312
|
+
str or None
|
313
|
+
Summary of logs, or None if not available.
|
314
|
+
"""
|
315
|
+
if self.response and self.response.get("feature_engineer_function_path"):
|
316
|
+
log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
|
317
|
+
if markdown:
|
318
|
+
return Markdown(log_details)
|
319
|
+
else:
|
320
|
+
return log_details
|
321
|
+
return None
|
322
|
+
|
323
|
+
def get_data_engineered(self):
|
324
|
+
"""
|
325
|
+
Retrieves the engineered data stored after running invoke/ainvoke.
|
326
|
+
|
327
|
+
Returns
|
328
|
+
-------
|
329
|
+
pd.DataFrame or None
|
330
|
+
The engineered dataset as a pandas DataFrame.
|
331
|
+
"""
|
332
|
+
if self.response and "data_engineered" in self.response:
|
333
|
+
return pd.DataFrame(self.response["data_engineered"])
|
334
|
+
return None
|
335
|
+
|
336
|
+
def get_data_raw(self):
|
337
|
+
"""
|
338
|
+
Retrieves the raw data.
|
339
|
+
|
340
|
+
Returns
|
341
|
+
-------
|
342
|
+
pd.DataFrame or None
|
343
|
+
The raw dataset as a pandas DataFrame if available.
|
344
|
+
"""
|
345
|
+
if self.response and "data_raw" in self.response:
|
346
|
+
return pd.DataFrame(self.response["data_raw"])
|
347
|
+
return None
|
348
|
+
|
349
|
+
def get_feature_engineer_function(self, markdown=False):
|
350
|
+
"""
|
351
|
+
Retrieves the feature engineering function generated by the agent.
|
352
|
+
|
353
|
+
Parameters
|
354
|
+
----------
|
355
|
+
markdown : bool, optional
|
356
|
+
If True, returns the function in Markdown code block format.
|
357
|
+
|
358
|
+
Returns
|
359
|
+
-------
|
360
|
+
str or None
|
361
|
+
The Python function code, or None if unavailable.
|
362
|
+
"""
|
363
|
+
if self.response and "feature_engineer_function" in self.response:
|
364
|
+
code = self.response["feature_engineer_function"]
|
365
|
+
if markdown:
|
366
|
+
return Markdown(f"```python\n{code}\n```")
|
367
|
+
return code
|
368
|
+
return None
|
369
|
+
|
370
|
+
def get_recommended_feature_engineering_steps(self, markdown=False):
|
371
|
+
"""
|
372
|
+
Retrieves the agent's recommended feature engineering steps.
|
373
|
+
|
374
|
+
Parameters
|
375
|
+
----------
|
376
|
+
markdown : bool, optional
|
377
|
+
If True, returns the steps in Markdown format.
|
378
|
+
|
379
|
+
Returns
|
380
|
+
-------
|
381
|
+
str or None
|
382
|
+
The recommended steps, or None if not available.
|
383
|
+
"""
|
384
|
+
if self.response and "recommended_steps" in self.response:
|
385
|
+
steps = self.response["recommended_steps"]
|
386
|
+
if markdown:
|
387
|
+
return Markdown(steps)
|
388
|
+
return steps
|
389
|
+
return None
|
390
|
+
|
391
|
+
def get_response(self):
|
392
|
+
"""
|
393
|
+
Returns the agent's full response dictionary.
|
394
|
+
|
395
|
+
Returns
|
396
|
+
-------
|
397
|
+
dict or None
|
398
|
+
The response dictionary if available, otherwise None.
|
399
|
+
"""
|
400
|
+
return self.response
|
401
|
+
|
402
|
+
def show(self):
|
403
|
+
"""
|
404
|
+
Displays the agent's mermaid diagram for visual inspection of the compiled graph.
|
405
|
+
"""
|
406
|
+
return self._compiled_graph.show()
|
407
|
+
|
408
|
+
|
36
409
|
# * Feature Engineering Agent
|
37
410
|
|
38
411
|
def make_feature_engineering_agent(
|
@@ -41,6 +414,7 @@ def make_feature_engineering_agent(
|
|
41
414
|
log=False,
|
42
415
|
log_path=None,
|
43
416
|
file_name="feature_engineer.py",
|
417
|
+
function_name="feature_engineer",
|
44
418
|
overwrite = True,
|
45
419
|
human_in_the_loop=False,
|
46
420
|
bypass_recommended_steps=False,
|
@@ -82,6 +456,8 @@ def make_feature_engineering_agent(
|
|
82
456
|
The path to the directory where the log files should be stored. Defaults to "logs/".
|
83
457
|
file_name : str, optional
|
84
458
|
The name of the file to save the log to. Defaults to "feature_engineer.py".
|
459
|
+
function_name : str, optional
|
460
|
+
The name of the function that will be generated. Defaults to "feature_engineer".
|
85
461
|
overwrite : bool, optional
|
86
462
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
87
463
|
Defaults to True.
|
@@ -122,6 +498,11 @@ def make_feature_engineering_agent(
|
|
122
498
|
The feature engineering agent as a state graph.
|
123
499
|
"""
|
124
500
|
llm = model
|
501
|
+
|
502
|
+
# Human in the loop requires recommended steps
|
503
|
+
if bypass_recommended_steps and human_in_the_loop:
|
504
|
+
bypass_recommended_steps = False
|
505
|
+
print("Bypass recommended steps set to False to enable human in the loop.")
|
125
506
|
|
126
507
|
# Setup Log Directory
|
127
508
|
if log:
|
@@ -141,6 +522,7 @@ def make_feature_engineering_agent(
|
|
141
522
|
all_datasets_summary: str
|
142
523
|
feature_engineer_function: str
|
143
524
|
feature_engineer_function_path: str
|
525
|
+
feature_engineer_file_name: str
|
144
526
|
feature_engineer_function_name: str
|
145
527
|
feature_engineer_error: str
|
146
528
|
max_retries: int
|
@@ -218,19 +600,36 @@ def make_feature_engineering_agent(
|
|
218
600
|
})
|
219
601
|
|
220
602
|
return {
|
221
|
-
"recommended_steps": "
|
603
|
+
"recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
|
222
604
|
"all_datasets_summary": all_datasets_summary_str
|
223
605
|
}
|
224
606
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
607
|
+
# Human Review
|
608
|
+
|
609
|
+
prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
610
|
+
|
611
|
+
if not bypass_explain_code:
|
612
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
|
613
|
+
return node_func_human_review(
|
614
|
+
state=state,
|
615
|
+
prompt_text=prompt_text_human_review,
|
616
|
+
yes_goto= 'explain_feature_engineering_code',
|
617
|
+
no_goto="recommend_feature_engineering_steps",
|
618
|
+
user_instructions_key="user_instructions",
|
619
|
+
recommended_steps_key="recommended_steps",
|
620
|
+
code_snippet_key="feature_engineer_function",
|
621
|
+
)
|
622
|
+
else:
|
623
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
|
624
|
+
return node_func_human_review(
|
625
|
+
state=state,
|
626
|
+
prompt_text=prompt_text_human_review,
|
627
|
+
yes_goto= '__end__',
|
628
|
+
no_goto="recommend_feature_engineering_steps",
|
629
|
+
user_instructions_key="user_instructions",
|
630
|
+
recommended_steps_key="recommended_steps",
|
631
|
+
code_snippet_key="feature_engineer_function",
|
632
|
+
)
|
234
633
|
|
235
634
|
def create_feature_engineering_code(state: GraphState):
|
236
635
|
if bypass_recommended_steps:
|
@@ -251,7 +650,7 @@ def make_feature_engineering_agent(
|
|
251
650
|
feature_engineering_prompt = PromptTemplate(
|
252
651
|
template="""
|
253
652
|
|
254
|
-
You are a Feature Engineering Agent. Your job is to create a
|
653
|
+
You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
|
255
654
|
|
256
655
|
Recommended Steps:
|
257
656
|
{recommended_steps}
|
@@ -265,11 +664,11 @@ def make_feature_engineering_agent(
|
|
265
664
|
|
266
665
|
You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
|
267
666
|
|
268
|
-
Return Python code in ```python``` format with a single function definition,
|
667
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
|
269
668
|
|
270
669
|
Return code to provide the feature engineering function:
|
271
670
|
|
272
|
-
def
|
671
|
+
def {function_name}(data_raw):
|
273
672
|
import pandas as pd
|
274
673
|
import numpy as np
|
275
674
|
...
|
@@ -292,7 +691,7 @@ def make_feature_engineering_agent(
|
|
292
691
|
|
293
692
|
|
294
693
|
""",
|
295
|
-
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
|
694
|
+
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
|
296
695
|
)
|
297
696
|
|
298
697
|
feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
|
@@ -301,6 +700,7 @@ def make_feature_engineering_agent(
|
|
301
700
|
"recommended_steps": state.get("recommended_steps"),
|
302
701
|
"target_variable": state.get("target_variable"),
|
303
702
|
"all_datasets_summary": all_datasets_summary_str,
|
703
|
+
"function_name": function_name
|
304
704
|
})
|
305
705
|
|
306
706
|
response = relocate_imports_inside_function(response)
|
@@ -318,12 +718,11 @@ def make_feature_engineering_agent(
|
|
318
718
|
return {
|
319
719
|
"feature_engineer_function": response,
|
320
720
|
"feature_engineer_function_path": file_path,
|
321
|
-
"
|
721
|
+
"feature_engineer_file_name": file_name_2,
|
722
|
+
"feature_engineer_function_name": function_name,
|
322
723
|
"all_datasets_summary": all_datasets_summary_str
|
323
724
|
}
|
324
725
|
|
325
|
-
|
326
|
-
|
327
726
|
def execute_feature_engineering_code(state):
|
328
727
|
return node_func_execute_agent_code_on_data(
|
329
728
|
state=state,
|
@@ -331,7 +730,7 @@ def make_feature_engineering_agent(
|
|
331
730
|
result_key="data_engineered",
|
332
731
|
error_key="feature_engineer_error",
|
333
732
|
code_snippet_key="feature_engineer_function",
|
334
|
-
agent_function_name="
|
733
|
+
agent_function_name=state.get("feature_engineer_function_name"),
|
335
734
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
336
735
|
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
337
736
|
error_message_prefix="An error occurred during feature engineering: "
|
@@ -339,11 +738,13 @@ def make_feature_engineering_agent(
|
|
339
738
|
|
340
739
|
def fix_feature_engineering_code(state: GraphState):
|
341
740
|
feature_engineer_prompt = """
|
342
|
-
You are a Feature Engineering Agent. Your job is to fix the
|
741
|
+
You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
|
742
|
+
|
743
|
+
Provide only the corrected function definition for {function_name}().
|
343
744
|
|
344
|
-
|
745
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
345
746
|
|
346
|
-
|
747
|
+
This is the broken code (please fix):
|
347
748
|
{code_snippet}
|
348
749
|
|
349
750
|
Last Known Error:
|
@@ -359,6 +760,7 @@ def make_feature_engineering_agent(
|
|
359
760
|
agent_name=AGENT_NAME,
|
360
761
|
log=log,
|
361
762
|
file_path=state.get("feature_engineer_function_path"),
|
763
|
+
function_name=state.get("feature_engineer_function_name"),
|
362
764
|
)
|
363
765
|
|
364
766
|
def explain_feature_engineering_code(state: GraphState):
|
@@ -395,9 +797,11 @@ def make_feature_engineering_agent(
|
|
395
797
|
fix_code_node_name="fix_feature_engineering_code",
|
396
798
|
explain_code_node_name="explain_feature_engineering_code",
|
397
799
|
error_key="feature_engineer_error",
|
800
|
+
max_retries_key = "max_retries",
|
801
|
+
retry_count_key = "retry_count",
|
398
802
|
human_in_the_loop=human_in_the_loop,
|
399
803
|
human_review_node_name="human_review",
|
400
|
-
checkpointer=MemorySaver()
|
804
|
+
checkpointer=MemorySaver(),
|
401
805
|
bypass_recommended_steps=bypass_recommended_steps,
|
402
806
|
bypass_explain_code=bypass_explain_code,
|
403
807
|
)
|