ai-data-science-team 0.0.0.9000__py3-none-any.whl → 0.0.0.9005__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,368 @@
1
+ # BUSINESS SCIENCE UNIVERSITY
2
+ # AI DATA SCIENCE TEAM
3
+ # ***
4
+ # * Agents: Feature Engineering Agent
5
+
6
+ # Libraries
7
+ from typing import TypedDict, Annotated, Sequence, Literal
8
+ import operator
9
+
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain_core.messages import BaseMessage
12
+
13
+ from langgraph.types import Command
14
+ from langgraph.checkpoint.memory import MemorySaver
15
+
16
+ import os
17
+ import io
18
+ import pandas as pd
19
+
20
+ from ai_data_science_team.templates.agent_templates import(
21
+ node_func_execute_agent_code_on_data,
22
+ node_func_human_review,
23
+ node_func_fix_agent_code,
24
+ node_func_explain_agent_code,
25
+ create_coding_agent_graph
26
+ )
27
+ from ai_data_science_team.tools.parsers import PythonOutputParser
28
+ from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
29
+ from ai_data_science_team.tools.data_analysis import summarize_dataframes
30
+ from ai_data_science_team.tools.logging import log_ai_function
31
+
32
+ # Setup
33
# Identifier for this agent; handed to the shared node helpers as the
# agent name / role when generated code is annotated, fixed, or explained.
AGENT_NAME = "feature_engineering_agent"

# "logs/" directory under the working directory, resolved once at import time.
# NOTE(review): not referenced by the code visible in this module section.
LOG_PATH = os.path.join(os.getcwd(), "logs/")
35
+
36
+ # * Feature Engineering Agent
37
+
38
def make_feature_engineering_agent(model, log=False, log_path=None, overwrite=True, human_in_the_loop=False):
    """
    Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
    techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
    and generating polynomial features. The agent takes in a dataset and user instructions and outputs a Python
    function for feature engineering. When logging is enabled, the generated code is passed to the logging helper.

    The agent is instructed to apply the following feature engineering techniques:

    - Remove string or categorical features with unique values equal to the size of the dataset
    - Remove constant features with the same value in all rows
    - High cardinality categorical features should be encoded by a threshold <= 5 percent of the dataset, by converting infrequent values to "other"
    - Encoding categorical variables using OneHotEncoding
    - Numeric features should be left untransformed
    - Create datetime-based features if datetime columns are present
    - If a target variable is provided:
        - If a categorical target variable is provided, encode it using LabelEncoding
        - All other target variables should be converted to numeric and unscaled
    - Convert any boolean True/False values to 1/0
    - Return a single data frame containing the transformed features and target variable, if one is provided.
    - Any specific instructions provided by the user

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model to use to generate code.
    log : bool, optional
        Whether or not to log the code generated and any errors that occur.
        Defaults to False.
    log_path : str, optional
        The path to the directory where the log files should be stored. Defaults to "logs/".
        Only resolved (and created if missing) when ``log`` is True.
    overwrite : bool, optional
        Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
        Defaults to True.
    human_in_the_loop : bool, optional
        Whether or not to use human in the loop. If True, adds an interrupt and human in the loop step that asks
        the user to review the feature engineering instructions. Defaults to False.

    Examples
    --------
    ``` python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import make_feature_engineering_agent

    llm = ChatOpenAI(model="gpt-4o-mini")

    feature_engineering_agent = make_feature_engineering_agent(llm)

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    response = feature_engineering_agent.invoke({
        "user_instructions": None,
        "target_variable": "Churn",
        "data_raw": df.to_dict(),
        "max_retries": 3,
        "retry_count": 0
    })

    pd.DataFrame(response['data_engineered'])
    ```

    Returns
    -------
    app
        The object returned by ``create_coding_agent_graph`` (expected to be the
        feature engineering agent as a compiled state graph exposing ``invoke``).
    """
    llm = model

    # Setup Log Directory
    if log:
        if log_path is None:
            log_path = "logs/"
        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(log_path, exist_ok=True)

    # Define GraphState for the router
    class GraphState(TypedDict):
        # Conversation messages; operator.add makes node outputs append rather than replace.
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        # Datasets travel through the graph as dicts (DataFrame.to_dict() form).
        data_raw: dict
        data_engineered: dict
        target_variable: str
        all_datasets_summary: str
        # Generated code plus where/under which name it was logged.
        feature_engineer_function: str
        feature_engineer_function_path: str
        feature_engineer_function_name: str
        feature_engineer_error: str
        max_retries: int
        retry_count: int

    def recommend_feature_engineering_steps(state: GraphState):
        """
        Recommend a series of feature engineering steps based on the input data.
        These recommended steps will be appended to the user_instructions.

        Returns a partial state update with ``recommended_steps`` (the LLM answer
        prefixed with a heading) and ``all_datasets_summary`` (the text summary
        of the raw data that was shown to the LLM).
        """
        print("---FEATURE ENGINEERING AGENT----")
        print(" * RECOMMEND FEATURE ENGINEERING STEPS")

        # Prompt to get recommended steps from the LLM
        recommend_steps_prompt = PromptTemplate(
            template="""
            You are a Feature Engineering Expert. Given the following information about the data,
            recommend a series of numbered steps to take to engineer features.
            The steps should be tailored to the data characteristics and should be helpful
            for a feature engineering agent that will be implemented.

            General Steps:
            Things that should be considered in the feature engineering steps:

            * Convert features to the appropriate data types based on their sample data values
            * Remove string or categorical features with unique values equal to the size of the dataset
            * Remove constant features with the same value in all rows
            * High cardinality categorical features should be encoded by a threshold <= 5 percent of the dataset, by converting infrequent values to "other"
            * Encoding categorical variables using OneHotEncoding
            * Numeric features should be left untransformed
            * Create datetime-based features if datetime columns are present
            * If a target variable is provided:
                * If a categorical target variable is provided, encode it using LabelEncoding
                * All other target variables should be converted to numeric and unscaled
            * Convert any Boolean (True/False) values to integer (1/0) values. This should be performed after one-hot encoding.

            Custom Steps:
            * Analyze the data to determine if any additional feature engineering steps are needed.
            * Recommend steps that are specific to the data provided. Include why these steps are necessary or beneficial.
            * If no additional steps are needed, simply state that no additional steps are required.

            IMPORTANT:
            Make sure to take into account any additional user instructions that may add, remove or modify some of these steps. Include comments in your code to explain your reasoning for each step. Include comments if something is not done because a user requested. Include comments if something is done because a user requested.

            User instructions:
            {user_instructions}

            Previously Recommended Steps (if any):
            {recommended_steps}

            Below are summaries of all datasets provided:
            {all_datasets_summary}

            Return the steps as a numbered list (no code, just the steps).

            Avoid these:
            1. Do not include steps to save files.
            """,
            input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
        )

        # Rebuild the DataFrame from its dict form so it can be summarized.
        data_raw = state.get("data_raw")
        df = pd.DataFrame.from_dict(data_raw)

        all_datasets_summary = summarize_dataframes([df])

        all_datasets_summary_str = "\n\n".join(all_datasets_summary)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "recommended_steps": state.get("recommended_steps"),
            "all_datasets_summary": all_datasets_summary_str
        })

        return {
            "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
            "all_datasets_summary": all_datasets_summary_str
        }

    def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
        """
        Ask a human to confirm or amend the recommended steps.

        Delegates to ``node_func_human_review``, routing to code creation on
        approval and back to step recommendation otherwise.
        """
        return node_func_human_review(
            state=state,
            prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
            yes_goto="create_feature_engineering_code",
            no_goto="recommend_feature_engineering_steps",
            user_instructions_key="user_instructions",
            recommended_steps_key="recommended_steps"
        )

    def create_feature_engineering_code(state: GraphState):
        """
        Ask the LLM for a ``feature_engineer(data_raw)`` function.

        The parsed code is post-processed (imports relocated, header comments
        added), handed to ``log_ai_function``, and returned together with the
        file path and file name reported by that helper.
        """
        print(" * CREATE FEATURE ENGINEERING CODE")

        feature_engineering_prompt = PromptTemplate(
            template="""

            You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.

            Recommended Steps:
            {recommended_steps}

            Use this information about the data to help determine how to feature engineer the data:

            Target Variable (if provided): {target_variable}

            Below are summaries of all datasets provided. Use this information about the data to help determine how to feature engineer the data:
            {all_datasets_summary}

            You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.

            Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.

            Return code to provide the feature engineering function:

            def feature_engineer(data_raw):
                import pandas as pd
                import numpy as np
                ...
                return data_engineered

            Best Practices and Error Preventions:
            - Handle missing values in numeric and categorical features before transformations.
            - Avoid creating highly correlated features unless explicitly instructed.
            - Convert Boolean to integer values (0/1) after one-hot encoding unless otherwise instructed.

            Avoid the following errors:

            - name 'OneHotEncoder' is not defined

            - Shape of passed values is (7043, 48), indices imply (7043, 47)

            - name 'numeric_features' is not defined

            - name 'categorical_features' is not defined


            """,
            # Must match the {placeholders} above; previously misspelled as "recommeded_steps".
            input_variables=["recommended_steps", "target_variable", "all_datasets_summary"]
        )

        feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()

        response = feature_engineering_agent.invoke({
            "recommended_steps": state.get("recommended_steps"),
            "target_variable": state.get("target_variable"),
            "all_datasets_summary": state.get("all_datasets_summary"),
        })

        response = relocate_imports_inside_function(response)
        response = add_comments_to_top(response, agent_name=AGENT_NAME)

        # For logging: store the code generated
        file_path, file_name = log_ai_function(
            response=response,
            file_name="feature_engineer.py",
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "feature_engineer_function": response,
            "feature_engineer_function_path": file_path,
            "feature_engineer_function_name": file_name
        }

    def execute_feature_engineering_code(state: GraphState):
        """
        Run the generated ``feature_engineer`` code on ``data_raw``.

        Delegates to ``node_func_execute_agent_code_on_data``; the raw dict is
        converted to a DataFrame before the call and the result back to a dict
        afterwards. Results go under ``data_engineered`` and errors under
        ``feature_engineer_error``.
        """
        return node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            result_key="data_engineered",
            error_key="feature_engineer_error",
            code_snippet_key="feature_engineer_function",
            agent_function_name="feature_engineer",
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda df: df.to_dict(),
            error_message_prefix="An error occurred during feature engineering: "
        )

    def fix_feature_engineering_code(state: GraphState):
        """
        Ask the LLM to repair the generated function after a failed execution.

        Delegates to ``node_func_fix_agent_code`` with the broken code and the
        last error read from the state.
        """
        feature_engineer_prompt = """
        You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.

        Provide only the corrected function definition.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """

        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="feature_engineer_function",
            error_key="feature_engineer_error",
            llm=llm,
            prompt_template=feature_engineer_prompt,
            agent_name=AGENT_NAME,
            log=log,
            file_path=state.get("feature_engineer_function_path"),
        )

    def explain_feature_engineering_code(state: GraphState):
        """
        Produce an explanation of the generated function.

        Delegates to ``node_func_explain_agent_code``, which is configured to
        write its result under ``messages``.
        """
        return node_func_explain_agent_code(
            state=state,
            code_snippet_key="feature_engineer_function",
            result_key="messages",
            error_key="feature_engineer_error",
            llm=llm,
            role=AGENT_NAME,
            explanation_prompt_template="""
            Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
            """,
            success_prefix="# Feature Engineering Agent:\n\n ",
            error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
        )

    # Create the graph
    node_functions = {
        "recommend_feature_engineering_steps": recommend_feature_engineering_steps,
        "human_review": human_review,
        "create_feature_engineering_code": create_feature_engineering_code,
        "execute_feature_engineering_code": execute_feature_engineering_code,
        "fix_feature_engineering_code": fix_feature_engineering_code,
        "explain_feature_engineering_code": explain_feature_engineering_code
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_feature_engineering_steps",
        create_code_node_name="create_feature_engineering_code",
        execute_code_node_name="execute_feature_engineering_code",
        fix_code_node_name="fix_feature_engineering_code",
        explain_code_node_name="explain_feature_engineering_code",
        error_key="feature_engineer_error",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        # A checkpointer is only supplied when the human-review step is enabled.
        checkpointer=MemorySaver() if human_in_the_loop else None
    )

    return app
File without changes