ai-data-science-team 0.0.0.9000__py3-none-any.whl → 0.0.0.9005__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -0
- ai_data_science_team/agents/data_cleaning_agent.py +347 -0
- ai_data_science_team/agents/data_wrangling_agent.py +365 -0
- ai_data_science_team/agents/feature_engineering_agent.py +368 -0
- ai_data_science_team/templates/__init__.py +0 -0
- ai_data_science_team/templates/agent_templates.py +409 -0
- ai_data_science_team/tools/__init__.py +0 -0
- ai_data_science_team/tools/data_analysis.py +116 -0
- ai_data_science_team/tools/logging.py +61 -0
- ai_data_science_team/tools/parsers.py +57 -0
- ai_data_science_team/tools/regex.py +73 -0
- ai_data_science_team-0.0.0.9005.dist-info/METADATA +162 -0
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +19 -0
- ai_data_science_team/agents.py +0 -325
- ai_data_science_team-0.0.0.9000.dist-info/METADATA +0 -131
- ai_data_science_team-0.0.0.9000.dist-info/RECORD +0 -9
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,368 @@
|
|
1
|
+
# BUSINESS SCIENCE UNIVERSITY
|
2
|
+
# AI DATA SCIENCE TEAM
|
3
|
+
# ***
|
4
|
+
# * Agents: Feature Engineering Agent
|
5
|
+
|
6
|
+
# Libraries
|
7
|
+
from typing import TypedDict, Annotated, Sequence, Literal
|
8
|
+
import operator
|
9
|
+
|
10
|
+
from langchain.prompts import PromptTemplate
|
11
|
+
from langchain_core.messages import BaseMessage
|
12
|
+
|
13
|
+
from langgraph.types import Command
|
14
|
+
from langgraph.checkpoint.memory import MemorySaver
|
15
|
+
|
16
|
+
import os
|
17
|
+
import io
|
18
|
+
import pandas as pd
|
19
|
+
|
20
|
+
from ai_data_science_team.templates.agent_templates import(
|
21
|
+
node_func_execute_agent_code_on_data,
|
22
|
+
node_func_human_review,
|
23
|
+
node_func_fix_agent_code,
|
24
|
+
node_func_explain_agent_code,
|
25
|
+
create_coding_agent_graph
|
26
|
+
)
|
27
|
+
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
29
|
+
from ai_data_science_team.tools.data_analysis import summarize_dataframes
|
30
|
+
from ai_data_science_team.tools.logging import log_ai_function
|
31
|
+
|
32
|
+
# Setup
|
33
|
+
# Identifier for this agent. Within this module it is passed as `agent_name` to
# add_comments_to_top() and node_func_fix_agent_code(), and as `role` to
# node_func_explain_agent_code().
AGENT_NAME = "feature_engineering_agent"
|
34
|
+
# Absolute "logs/" directory under the working directory at import time.
# NOTE(review): not referenced anywhere in the visible part of this module;
# make_feature_engineering_agent() falls back to the relative "logs/" instead.
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
|
+
|
36
|
+
# * Feature Engineering Agent
|
37
|
+
|
38
|
+
def make_feature_engineering_agent(model, log=False, log_path=None, overwrite=True, human_in_the_loop=False):
    """
    Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
    techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
    and generating polynomial features. The agent takes in a dataset and user instructions and outputs a Python
    function for feature engineering. It also logs the code generated and any errors that occur.

    The agent is instructed to apply the following feature engineering techniques:

    - Remove string or categorical features with unique values equal to the size of the dataset
    - Remove constant features with the same value in all rows
    - High cardinality categorical features should be encoded by a threshold <= 5 percent of the dataset, by converting infrequent values to "other"
    - Encoding categorical variables using OneHotEncoding
    - Numeric features should be left untransformed
    - Create datetime-based features if datetime columns are present
    - If a target variable is provided:
        - If a categorical target variable is provided, encode it using LabelEncoding
        - All other target variables should be converted to numeric and unscaled
    - Convert any boolean True/False values to 1/0
    - Return a single data frame containing the transformed features and target variable, if one is provided.
    - Any specific instructions provided by the user

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model to use to generate code.
    log : bool, optional
        Whether or not to log the code generated and any errors that occur.
        Defaults to False.
    log_path : str, optional
        The path to the directory where the log files should be stored. Only used when
        `log` is True, in which case it defaults to "logs/" and is created if missing.
    overwrite : bool, optional
        Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
        Defaults to True.
    human_in_the_loop : bool, optional
        Whether or not to use human in the loop. If True, adds an interrupt and human in the loop step that asks
        the user to review the feature engineering instructions. Defaults to False.

    Examples
    --------
    ``` python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_engineering_agent = make_feature_engineering_agent(llm)

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    response = feature_engineering_agent.invoke({
        "user_instructions": None,
        "target_variable": "Churn",
        "data_raw": df.to_dict(),
        "max_retries": 3,
        "retry_count": 0
    })

    pd.DataFrame(response['data_engineered'])
    ```

    Returns
    -------
    app
        The feature engineering agent graph, as returned by `create_coding_agent_graph`.
        The example above drives it through `.invoke()`.
    """
    llm = model

    # Setup Log Directory
    if log:
        if log_path is None:
            log_path = "logs/"
        # exist_ok=True replaces a separate exists() check, which could race
        # with another process creating the same directory.
        os.makedirs(log_path, exist_ok=True)

    # Define GraphState for the router
    class GraphState(TypedDict):
        # Message history; operator.add is the reducer attached to this key.
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        # Input data in DataFrame.to_dict() form.
        data_raw: dict
        # Output data in DataFrame.to_dict() form.
        data_engineered: dict
        target_variable: str
        all_datasets_summary: str
        # Generated source code of feature_engineer() and where it was logged.
        feature_engineer_function: str
        feature_engineer_function_path: str
        feature_engineer_function_name: str
        feature_engineer_error: str
        max_retries: int
        retry_count: int

    def recommend_feature_engineering_steps(state: GraphState):
        """
        Recommend a series of feature engineering steps based on the input data.
        These recommended steps will be appended to the user_instructions.
        """
        print("---FEATURE ENGINEERING AGENT----")
        print("    * RECOMMEND FEATURE ENGINEERING STEPS")

        # Prompt to get recommended steps from the LLM
        recommend_steps_prompt = PromptTemplate(
            template="""
            You are a Feature Engineering Expert. Given the following information about the data,
            recommend a series of numbered steps to take to engineer features.
            The steps should be tailored to the data characteristics and should be helpful
            for a feature engineering agent that will be implemented.

            General Steps:
            Things that should be considered in the feature engineering steps:

            * Convert features to the appropriate data types based on their sample data values
            * Remove string or categorical features with unique values equal to the size of the dataset
            * Remove constant features with the same value in all rows
            * High cardinality categorical features should be encoded by a threshold <= 5 percent of the dataset, by converting infrequent values to "other"
            * Encoding categorical variables using OneHotEncoding
            * Numeric features should be left untransformed
            * Create datetime-based features if datetime columns are present
            * If a target variable is provided:
                * If a categorical target variable is provided, encode it using LabelEncoding
                * All other target variables should be converted to numeric and unscaled
            * Convert any Boolean (True/False) values to integer (1/0) values. This should be performed after one-hot encoding.

            Custom Steps:
            * Analyze the data to determine if any additional feature engineering steps are needed.
            * Recommend steps that are specific to the data provided. Include why these steps are necessary or beneficial.
            * If no additional steps are needed, simply state that no additional steps are required.

            IMPORTANT:
            Make sure to take into account any additional user instructions that may add, remove or modify some of these steps. Include comments in your code to explain your reasoning for each step. Include comments if something is not done because a user requested. Include comments if something is done because a user requested.

            User instructions:
            {user_instructions}

            Previously Recommended Steps (if any):
            {recommended_steps}

            Below are summaries of all datasets provided:
            {all_datasets_summary}

            Return the steps as a numbered list (no code, just the steps).

            Avoid these:
            1. Do not include steps to save files.
            """,
            input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
        )

        # Rebuild the DataFrame from its dict form so it can be summarized.
        data_raw = state.get("data_raw")
        df = pd.DataFrame.from_dict(data_raw)

        all_datasets_summary = summarize_dataframes([df])

        all_datasets_summary_str = "\n\n".join(all_datasets_summary)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "recommended_steps": state.get("recommended_steps"),
            "all_datasets_summary": all_datasets_summary_str
        })

        # The summary is stored in state so the code-generation node can reuse it.
        return {
            "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
            "all_datasets_summary": all_datasets_summary_str
        }

    def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
        """
        Ask the user to approve or modify the recommended steps, delegating to
        node_func_human_review with the node names to route to for each answer.
        """
        return node_func_human_review(
            state=state,
            prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
            yes_goto="create_feature_engineering_code",
            no_goto="recommend_feature_engineering_steps",
            user_instructions_key="user_instructions",
            recommended_steps_key="recommended_steps"
        )

    def create_feature_engineering_code(state: GraphState):
        """
        Ask the LLM for a feature_engineer(data_raw) function that follows the
        recommended steps, post-process the generated code, and log it.
        """
        print("    * CREATE FEATURE ENGINEERING CODE")

        feature_engineering_prompt = PromptTemplate(
            template="""

            You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.

            Recommended Steps:
            {recommended_steps}

            Use this information about the data to help determine how to feature engineer the data:

            Target Variable (if provided): {target_variable}

            Below are summaries of all datasets provided. Use this information about the data to help determine how to feature engineer the data:
            {all_datasets_summary}

            You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.

            Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.

            Return code to provide the feature engineering function:

            def feature_engineer(data_raw):
                import pandas as pd
                import numpy as np
                ...
                return data_engineered

            Best Practices and Error Preventions:
            - Handle missing values in numeric and categorical features before transformations.
            - Avoid creating highly correlated features unless explicitly instructed.
            - Convert Boolean to integer values (0/1) after one-hot encoding unless otherwise instructed.

            Avoid the following errors:

            - name 'OneHotEncoder' is not defined

            - Shape of passed values is (7043, 48), indices imply (7043, 47)

            - name 'numeric_features' is not defined

            - name 'categorical_features' is not defined


            """,
            # Must match the {placeholders} in the template and the keys passed
            # to invoke() below (was misspelled "recommeded_steps").
            input_variables=["recommended_steps", "target_variable", "all_datasets_summary"]
        )

        feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()

        response = feature_engineering_agent.invoke({
            "recommended_steps": state.get("recommended_steps"),
            "target_variable": state.get("target_variable"),
            "all_datasets_summary": state.get("all_datasets_summary"),
        })

        # Post-process the generated code with the project's regex helpers.
        response = relocate_imports_inside_function(response)
        response = add_comments_to_top(response, agent_name=AGENT_NAME)

        # For logging: store the code generated
        file_path, file_name = log_ai_function(
            response=response,
            file_name="feature_engineer.py",
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "feature_engineer_function": response,
            "feature_engineer_function_path": file_path,
            "feature_engineer_function_name": file_name
        }

    def execute_feature_engineering_code(state: GraphState):
        """
        Run the generated feature_engineer() code on `data_raw` via
        node_func_execute_agent_code_on_data, storing the result under
        `data_engineered` and any failure under `feature_engineer_error`.
        """
        return node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            result_key="data_engineered",
            error_key="feature_engineer_error",
            code_snippet_key="feature_engineer_function",
            agent_function_name="feature_engineer",
            # State holds data as plain dicts; convert to and from DataFrame
            # around the generated function.
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda df: df.to_dict(),
            error_message_prefix="An error occurred during feature engineering: "
        )

    def fix_feature_engineering_code(state: GraphState):
        """
        Ask the LLM to repair the generated feature_engineer() function using
        the broken code and the last recorded error.
        """
        feature_engineer_prompt = """
        You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.

        Provide only the corrected function definition.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """

        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="feature_engineer_function",
            error_key="feature_engineer_error",
            llm=llm,
            prompt_template=feature_engineer_prompt,
            agent_name=AGENT_NAME,
            log=log,
            file_path=state.get("feature_engineer_function_path"),
        )

    def explain_feature_engineering_code(state: GraphState):
        """
        Ask the LLM to explain the generated function, delegating to
        node_func_explain_agent_code with `messages` as the result key.
        """
        return node_func_explain_agent_code(
            state=state,
            code_snippet_key="feature_engineer_function",
            result_key="messages",
            error_key="feature_engineer_error",
            llm=llm,
            role=AGENT_NAME,
            explanation_prompt_template="""
            Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
            """,
            success_prefix="# Feature Engineering Agent:\n\n ",
            error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
        )

    # Create the graph
    node_functions = {
        "recommend_feature_engineering_steps": recommend_feature_engineering_steps,
        "human_review": human_review,
        "create_feature_engineering_code": create_feature_engineering_code,
        "execute_feature_engineering_code": execute_feature_engineering_code,
        "fix_feature_engineering_code": fix_feature_engineering_code,
        "explain_feature_engineering_code": explain_feature_engineering_code
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_feature_engineering_steps",
        create_code_node_name="create_feature_engineering_code",
        execute_code_node_name="execute_feature_engineering_code",
        fix_code_node_name="fix_feature_engineering_code",
        explain_code_node_name="explain_feature_engineering_code",
        error_key="feature_engineer_error",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        # A checkpointer is only supplied when the human review step is enabled.
        checkpointer=MemorySaver() if human_in_the_loop else None
    )

    return app
|
File without changes
|