ai-data-science-team 0.0.0.9006__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +5 -4
- ai_data_science_team/agents/data_cleaning_agent.py +371 -45
- ai_data_science_team/agents/data_visualization_agent.py +764 -0
- ai_data_science_team/agents/data_wrangling_agent.py +507 -23
- ai_data_science_team/agents/feature_engineering_agent.py +467 -34
- ai_data_science_team/agents/sql_database_agent.py +394 -30
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +286 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +9 -0
- ai_data_science_team/templates/agent_templates.py +247 -42
- ai_data_science_team/tools/metadata.py +110 -47
- ai_data_science_team/tools/regex.py +33 -0
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- ai_data_science_team-0.0.0.9008.dist-info/METADATA +231 -0
- ai_data_science_team-0.0.0.9008.dist-info/RECORD +26 -0
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/WHEEL +1 -1
- ai_data_science_team-0.0.0.9006.dist-info/METADATA +0 -165
- ai_data_science_team-0.0.0.9006.dist-info/RECORD +0 -20
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9008.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,25 @@ from langgraph.types import Command
|
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
16
|
import os
|
17
|
-
import io
|
18
17
|
import pandas as pd
|
19
18
|
|
20
|
-
from
|
19
|
+
from IPython.display import Markdown
|
20
|
+
|
21
|
+
from ai_data_science_team.templates import(
|
21
22
|
node_func_execute_agent_code_on_data,
|
22
23
|
node_func_human_review,
|
23
24
|
node_func_fix_agent_code,
|
24
25
|
node_func_explain_agent_code,
|
25
|
-
create_coding_agent_graph
|
26
|
+
create_coding_agent_graph,
|
27
|
+
BaseAgent,
|
26
28
|
)
|
27
29
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import
|
30
|
+
from ai_data_science_team.tools.regex import (
|
31
|
+
relocate_imports_inside_function,
|
32
|
+
add_comments_to_top,
|
33
|
+
format_agent_name,
|
34
|
+
format_recommended_steps
|
35
|
+
)
|
29
36
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
37
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
38
|
|
@@ -33,9 +40,386 @@ from ai_data_science_team.tools.logging import log_ai_function
|
|
33
40
|
AGENT_NAME = "feature_engineering_agent"
|
34
41
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
42
|
|
43
|
+
# Class
|
44
|
+
|
45
|
+
class FeatureEngineeringAgent(BaseAgent):
    """
    Creates a feature engineering agent that can process datasets based on user-defined instructions or
    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
    and logs the process, including code and errors. It is designed to facilitate reproducible and
    customizable feature engineering workflows.

    The agent can perform the following default feature engineering steps unless instructed otherwise:
    - Convert features to appropriate data types
    - Remove features that have unique values for each row
    - Remove constant features
    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
    - One-hot-encode categorical variables
    - Convert booleans to integer (1/0)
    - Create datetime-based features (if applicable)
    - Handle target variable encoding if specified
    - Any user-provided instructions to add, remove, or modify steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the feature engineering function.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
    function_name : str, optional
        Name of the generated feature engineering function. Defaults to "feature_engineer".
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of feature engineering instructions. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the default recommended steps. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the step that provides code explanations. Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    )
        Coroutine. Engineers features from the provided dataset asynchronously based on user instructions.
    invoke_agent(
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    )
        Engineers features from the provided dataset synchronously based on user instructions.
    explain_feature_engineering_steps()
        Returns an explanation of the feature engineering steps performed by the agent.
    get_log_summary()
        Retrieves a summary of logged operations if logging is enabled.
    get_data_engineered()
        Retrieves the feature-engineered dataset as a pandas DataFrame.
    get_data_raw()
        Retrieves the raw dataset as a pandas DataFrame.
    get_feature_engineer_function()
        Retrieves the generated Python function used for feature engineering.
    get_recommended_feature_engineering_steps()
        Retrieves the agent's recommended feature engineering steps.
    get_response()
        Returns the response from the agent as a dictionary.
    show()
        Displays the agent's mermaid diagram.

    Examples
    --------
    ```python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import FeatureEngineeringAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_agent = FeatureEngineeringAgent(
        model=llm,
        n_samples=30,
        log=True,
        log_path="logs",
        human_in_the_loop=True
    )

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    feature_agent.invoke_agent(
        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
        data_raw=df,
        target_variable="Churn",
        max_retries=3,
        retry_count=0
    )

    engineered_data = feature_agent.get_data_engineered()
    response = feature_agent.get_response()
    ```

    Returns
    -------
    FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
        A feature engineering agent implemented as a compiled state graph.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="feature_engineer.py",
        function_name="feature_engineer",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # The parameters are kept verbatim so update_params() can patch
        # individual entries and rebuild the graph from the same dictionary.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Create the compiled graph for the feature engineering agent.
        Running this method will reset the response to None.
        """
        # A response produced by a previous graph is discarded on rebuild.
        self.response = None
        return make_feature_engineering_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values. They are merged into the
            stored parameters before the graph is rebuilt.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        This is a coroutine and must be awaited (``await agent.ainvoke_agent(...)``).

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to ainvoke().

        Returns
        -------
        None
        """
        # The graph's ainvoke() result must be awaited; storing it un-awaited
        # would leave a coroutine object in self.response and break every
        # accessor that treats the response as a dictionary.
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously engineers features for the provided dataset.
        The response is stored in the 'response' attribute.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The raw dataset to be processed.
        user_instructions : str, optional
            Instructions for feature engineering agent.
        target_variable : str, optional
            The name of the target variable (if any).
        max_retries : int
            Maximum retry attempts.
        retry_count : int
            Current retry attempt count.
        **kwargs
            Additional keyword arguments to pass to invoke().

        Returns
        -------
        None
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def explain_feature_engineering_steps(self):
        """
        Provides an explanation of the feature engineering steps performed by the agent.

        Returns
        -------
        str or list
            The "messages" entry of the stored response, or an empty list
            when there is no response or no such entry.
        """
        if self.response:
            return self.response.get("messages", [])
        return []

    def get_log_summary(self, markdown=False):
        """
        Returns a summary of the agent's logged operations, if a log path is available.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns Markdown-formatted output.

        Returns
        -------
        str or None
            Summary of logs, or None if not available.
        """
        if self.response and self.response.get("feature_engineer_function_path"):
            log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
            if markdown:
                return Markdown(log_details)
            else:
                return log_details
        return None

    def get_data_engineered(self):
        """
        Retrieves the engineered data stored after running invoke/ainvoke.

        Returns
        -------
        pd.DataFrame or None
            The engineered dataset as a pandas DataFrame.
        """
        if self.response and "data_engineered" in self.response:
            return pd.DataFrame(self.response["data_engineered"])
        return None

    def get_data_raw(self):
        """
        Retrieves the raw data.

        Returns
        -------
        pd.DataFrame or None
            The raw dataset as a pandas DataFrame if available.
        """
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_feature_engineer_function(self, markdown=False):
        """
        Retrieves the feature engineering function generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the function in Markdown code block format.

        Returns
        -------
        str or None
            The Python function code, or None if unavailable.
        """
        if self.response and "feature_engineer_function" in self.response:
            code = self.response["feature_engineer_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_feature_engineering_steps(self, markdown=False):
        """
        Retrieves the agent's recommended feature engineering steps.

        Parameters
        ----------
        markdown : bool, optional
            If True, returns the steps in Markdown format.

        Returns
        -------
        str or None
            The recommended steps, or None if not available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_response(self):
        """
        Returns the agent's full response dictionary.

        Returns
        -------
        dict or None
            The response dictionary if available, otherwise None.
        """
        return self.response

    def show(self):
        """
        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
        """
        # NOTE(review): this relies on the compiled graph object exposing a
        # show() method -- confirm against what make_feature_engineering_agent
        # returns.
        return self._compiled_graph.show()
|
407
|
+
|
408
|
+
|
36
409
|
# * Feature Engineering Agent
|
37
410
|
|
38
|
-
def make_feature_engineering_agent(
|
411
|
+
def make_feature_engineering_agent(
|
412
|
+
model,
|
413
|
+
n_samples=30,
|
414
|
+
log=False,
|
415
|
+
log_path=None,
|
416
|
+
file_name="feature_engineer.py",
|
417
|
+
function_name="feature_engineer",
|
418
|
+
overwrite = True,
|
419
|
+
human_in_the_loop=False,
|
420
|
+
bypass_recommended_steps=False,
|
421
|
+
bypass_explain_code=False,
|
422
|
+
):
|
39
423
|
"""
|
40
424
|
Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
|
41
425
|
techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
|
@@ -61,11 +445,19 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
61
445
|
----------
|
62
446
|
model : langchain.llms.base.LLM
|
63
447
|
The language model to use to generate code.
|
448
|
+
n_samples : int, optional
|
449
|
+
The number of data samples to use for generating the feature engineering code. Defaults to 30.
|
450
|
+
If you get an error due to maximum tokens, try reducing this number.
|
451
|
+
> "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
|
64
452
|
log : bool, optional
|
65
453
|
Whether or not to log the code generated and any errors that occur.
|
66
454
|
Defaults to False.
|
67
455
|
log_path : str, optional
|
68
456
|
The path to the directory where the log files should be stored. Defaults to "logs/".
|
457
|
+
file_name : str, optional
|
458
|
+
The name of the file to save the log to. Defaults to "feature_engineer.py".
|
459
|
+
function_name : str, optional
|
460
|
+
The name of the function that will be generated. Defaults to "feature_engineer".
|
69
461
|
overwrite : bool, optional
|
70
462
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
71
463
|
Defaults to True.
|
@@ -102,10 +494,15 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
102
494
|
|
103
495
|
Returns
|
104
496
|
-------
|
105
|
-
app : langchain.graphs.
|
497
|
+
app : langchain.graphs.CompiledStateGraph
|
106
498
|
The feature engineering agent as a state graph.
|
107
499
|
"""
|
108
500
|
llm = model
|
501
|
+
|
502
|
+
# Human in the loop requires recommended steps
|
503
|
+
if bypass_recommended_steps and human_in_the_loop:
|
504
|
+
bypass_recommended_steps = False
|
505
|
+
print("Bypass recommended steps set to False to enable human in the loop.")
|
109
506
|
|
110
507
|
# Setup Log Directory
|
111
508
|
if log:
|
@@ -125,6 +522,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
125
522
|
all_datasets_summary: str
|
126
523
|
feature_engineer_function: str
|
127
524
|
feature_engineer_function_path: str
|
525
|
+
feature_engineer_file_name: str
|
128
526
|
feature_engineer_function_name: str
|
129
527
|
feature_engineer_error: str
|
130
528
|
max_retries: int
|
@@ -135,7 +533,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
135
533
|
Recommend a series of feature engineering steps based on the input data.
|
136
534
|
These recommended steps will be appended to the user_instructions.
|
137
535
|
"""
|
138
|
-
print(
|
536
|
+
print(format_agent_name(AGENT_NAME))
|
139
537
|
print(" * RECOMMEND FEATURE ENGINEERING STEPS")
|
140
538
|
|
141
539
|
# Prompt to get recommended steps from the LLM
|
@@ -182,6 +580,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
182
580
|
|
183
581
|
Avoid these:
|
184
582
|
1. Do not include steps to save files.
|
583
|
+
2. Do not include unrelated user instructions that are not related to the feature engineering.
|
185
584
|
""",
|
186
585
|
input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
|
187
586
|
)
|
@@ -189,7 +588,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
189
588
|
data_raw = state.get("data_raw")
|
190
589
|
df = pd.DataFrame.from_dict(data_raw)
|
191
590
|
|
192
|
-
all_datasets_summary = get_dataframe_summary([df])
|
591
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
193
592
|
|
194
593
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
195
594
|
|
@@ -201,29 +600,57 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
201
600
|
})
|
202
601
|
|
203
602
|
return {
|
204
|
-
"recommended_steps": "
|
603
|
+
"recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
|
205
604
|
"all_datasets_summary": all_datasets_summary_str
|
206
605
|
}
|
207
606
|
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
607
|
+
# Human Review
|
608
|
+
|
609
|
+
prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
610
|
+
|
611
|
+
if not bypass_explain_code:
|
612
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
|
613
|
+
return node_func_human_review(
|
614
|
+
state=state,
|
615
|
+
prompt_text=prompt_text_human_review,
|
616
|
+
yes_goto= 'explain_feature_engineering_code',
|
617
|
+
no_goto="recommend_feature_engineering_steps",
|
618
|
+
user_instructions_key="user_instructions",
|
619
|
+
recommended_steps_key="recommended_steps",
|
620
|
+
code_snippet_key="feature_engineer_function",
|
621
|
+
)
|
622
|
+
else:
|
623
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
|
624
|
+
return node_func_human_review(
|
625
|
+
state=state,
|
626
|
+
prompt_text=prompt_text_human_review,
|
627
|
+
yes_goto= '__end__',
|
628
|
+
no_goto="recommend_feature_engineering_steps",
|
629
|
+
user_instructions_key="user_instructions",
|
630
|
+
recommended_steps_key="recommended_steps",
|
631
|
+
code_snippet_key="feature_engineer_function",
|
632
|
+
)
|
217
633
|
|
218
634
|
def create_feature_engineering_code(state: GraphState):
|
219
635
|
if bypass_recommended_steps:
|
220
|
-
print(
|
636
|
+
print(format_agent_name(AGENT_NAME))
|
637
|
+
|
638
|
+
data_raw = state.get("data_raw")
|
639
|
+
df = pd.DataFrame.from_dict(data_raw)
|
640
|
+
|
641
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
642
|
+
|
643
|
+
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
644
|
+
|
645
|
+
else:
|
646
|
+
all_datasets_summary_str = state.get("all_datasets_summary")
|
647
|
+
|
221
648
|
print(" * CREATE FEATURE ENGINEERING CODE")
|
222
649
|
|
223
650
|
feature_engineering_prompt = PromptTemplate(
|
224
651
|
template="""
|
225
652
|
|
226
|
-
You are a Feature Engineering Agent. Your job is to create a
|
653
|
+
You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
|
227
654
|
|
228
655
|
Recommended Steps:
|
229
656
|
{recommended_steps}
|
@@ -237,11 +664,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
237
664
|
|
238
665
|
You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
|
239
666
|
|
240
|
-
Return Python code in ```python``` format with a single function definition,
|
667
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
|
241
668
|
|
242
669
|
Return code to provide the feature engineering function:
|
243
670
|
|
244
|
-
def
|
671
|
+
def {function_name}(data_raw):
|
245
672
|
import pandas as pd
|
246
673
|
import numpy as np
|
247
674
|
...
|
@@ -264,7 +691,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
264
691
|
|
265
692
|
|
266
693
|
""",
|
267
|
-
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
|
694
|
+
input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
|
268
695
|
)
|
269
696
|
|
270
697
|
feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
|
@@ -272,16 +699,17 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
272
699
|
response = feature_engineering_agent.invoke({
|
273
700
|
"recommended_steps": state.get("recommended_steps"),
|
274
701
|
"target_variable": state.get("target_variable"),
|
275
|
-
"all_datasets_summary":
|
702
|
+
"all_datasets_summary": all_datasets_summary_str,
|
703
|
+
"function_name": function_name
|
276
704
|
})
|
277
705
|
|
278
706
|
response = relocate_imports_inside_function(response)
|
279
707
|
response = add_comments_to_top(response, agent_name=AGENT_NAME)
|
280
708
|
|
281
709
|
# For logging: store the code generated
|
282
|
-
file_path,
|
710
|
+
file_path, file_name_2 = log_ai_function(
|
283
711
|
response=response,
|
284
|
-
file_name=
|
712
|
+
file_name=file_name,
|
285
713
|
log=log,
|
286
714
|
log_path=log_path,
|
287
715
|
overwrite=overwrite
|
@@ -290,11 +718,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
290
718
|
return {
|
291
719
|
"feature_engineer_function": response,
|
292
720
|
"feature_engineer_function_path": file_path,
|
293
|
-
"
|
721
|
+
"feature_engineer_file_name": file_name_2,
|
722
|
+
"feature_engineer_function_name": function_name,
|
723
|
+
"all_datasets_summary": all_datasets_summary_str
|
294
724
|
}
|
295
725
|
|
296
|
-
|
297
|
-
|
298
726
|
def execute_feature_engineering_code(state):
|
299
727
|
return node_func_execute_agent_code_on_data(
|
300
728
|
state=state,
|
@@ -302,7 +730,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
302
730
|
result_key="data_engineered",
|
303
731
|
error_key="feature_engineer_error",
|
304
732
|
code_snippet_key="feature_engineer_function",
|
305
|
-
agent_function_name="
|
733
|
+
agent_function_name=state.get("feature_engineer_function_name"),
|
306
734
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
307
735
|
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
308
736
|
error_message_prefix="An error occurred during feature engineering: "
|
@@ -310,11 +738,13 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
310
738
|
|
311
739
|
def fix_feature_engineering_code(state: GraphState):
|
312
740
|
feature_engineer_prompt = """
|
313
|
-
You are a Feature Engineering Agent. Your job is to fix the
|
741
|
+
You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
|
742
|
+
|
743
|
+
Provide only the corrected function definition for {function_name}().
|
314
744
|
|
315
|
-
|
745
|
+
Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
|
316
746
|
|
317
|
-
|
747
|
+
This is the broken code (please fix):
|
318
748
|
{code_snippet}
|
319
749
|
|
320
750
|
Last Known Error:
|
@@ -330,6 +760,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
330
760
|
agent_name=AGENT_NAME,
|
331
761
|
log=log,
|
332
762
|
file_path=state.get("feature_engineer_function_path"),
|
763
|
+
function_name=state.get("feature_engineer_function_name"),
|
333
764
|
)
|
334
765
|
|
335
766
|
def explain_feature_engineering_code(state: GraphState):
|
@@ -366,9 +797,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
366
797
|
fix_code_node_name="fix_feature_engineering_code",
|
367
798
|
explain_code_node_name="explain_feature_engineering_code",
|
368
799
|
error_key="feature_engineer_error",
|
800
|
+
max_retries_key = "max_retries",
|
801
|
+
retry_count_key = "retry_count",
|
369
802
|
human_in_the_loop=human_in_the_loop,
|
370
803
|
human_review_node_name="human_review",
|
371
|
-
checkpointer=MemorySaver()
|
804
|
+
checkpointer=MemorySaver(),
|
372
805
|
bypass_recommended_steps=bypass_recommended_steps,
|
373
806
|
bypass_explain_code=bypass_explain_code,
|
374
807
|
)
|