ai-data-science-team 0.0.0.9000__py3-none-any.whl → 0.0.0.9005__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -0
- ai_data_science_team/agents/data_cleaning_agent.py +347 -0
- ai_data_science_team/agents/data_wrangling_agent.py +365 -0
- ai_data_science_team/agents/feature_engineering_agent.py +368 -0
- ai_data_science_team/templates/__init__.py +0 -0
- ai_data_science_team/templates/agent_templates.py +409 -0
- ai_data_science_team/tools/__init__.py +0 -0
- ai_data_science_team/tools/data_analysis.py +116 -0
- ai_data_science_team/tools/logging.py +61 -0
- ai_data_science_team/tools/parsers.py +57 -0
- ai_data_science_team/tools/regex.py +73 -0
- ai_data_science_team-0.0.0.9005.dist-info/METADATA +162 -0
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +19 -0
- ai_data_science_team/agents.py +0 -325
- ai_data_science_team-0.0.0.9000.dist-info/METADATA +0 -131
- ai_data_science_team-0.0.0.9000.dist-info/RECORD +0 -9
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/top_level.txt +0 -0
# Package version (PEP 440 developmental release segment).
__version__ = "0.0.0.9005"
@@ -0,0 +1,4 @@
|
|
1
|
+
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
|
2
|
+
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
|
3
|
+
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
|
4
|
+
|
@@ -0,0 +1,347 @@
|
|
1
|
+
# BUSINESS SCIENCE UNIVERSITY
|
2
|
+
# AI DATA SCIENCE TEAM
|
3
|
+
# ***
|
4
|
+
# * Agents: Data Cleaning Agent
|
5
|
+
|
6
|
+
# Libraries
|
7
|
+
from typing import TypedDict, Annotated, Sequence, Literal
|
8
|
+
import operator
|
9
|
+
|
10
|
+
from langchain.prompts import PromptTemplate
|
11
|
+
from langchain_core.messages import BaseMessage
|
12
|
+
|
13
|
+
from langgraph.types import Command
|
14
|
+
from langgraph.checkpoint.memory import MemorySaver
|
15
|
+
|
16
|
+
import os
|
17
|
+
import io
|
18
|
+
import pandas as pd
|
19
|
+
|
20
|
+
from ai_data_science_team.templates.agent_templates import(
|
21
|
+
node_func_execute_agent_code_on_data,
|
22
|
+
node_func_human_review,
|
23
|
+
node_func_fix_agent_code,
|
24
|
+
node_func_explain_agent_code,
|
25
|
+
create_coding_agent_graph
|
26
|
+
)
|
27
|
+
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
29
|
+
from ai_data_science_team.tools.data_analysis import summarize_dataframes
|
30
|
+
from ai_data_science_team.tools.logging import log_ai_function
|
31
|
+
|
32
|
+
# Setup
# Name tag stamped into the header comment of every generated code file.
AGENT_NAME = "data_cleaning_agent"
# Default directory for logging AI-generated functions (relative to the CWD).
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
+
def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
    """
    Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
    ways, such as removing columns with more than 40% missing values, imputing missing
    values with the mean of the column if the column is numeric, or imputing missing
    values with the mode of the column if the column is categorical.
    The agent takes in a dataset and some user instructions, and outputs a python
    function that can be used to clean the dataset. The agent also logs the code
    generated and any errors that occur.

    The agent is instructed to perform the following data cleaning steps:

    - Removing columns if more than 40 percent of the data is missing
    - Imputing missing values with the mean of the column if the column is numeric
    - Imputing missing values with the mode of the column if the column is categorical
    - Converting columns to the correct data type
    - Removing duplicate rows
    - Removing rows with missing values
    - Removing rows with extreme outliers (3X the interquartile range)
    - User instructions can modify, add, or remove any of the above steps

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model to use to generate code.
    log : bool, optional
        Whether or not to log the code generated and any errors that occur.
        Defaults to False.
    log_path : str, optional
        The path to the directory where the log files should be stored. Defaults to
        "logs/".
    overwrite : bool, optional
        Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
        Defaults to True.
    human_in_the_loop : bool, optional
        Whether or not to use human in the loop. If True, adds an interrupt and human-in-the-loop step that asks the user to review the data cleaning instructions. Defaults to False.

    Examples
    --------
    ``` python
    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import data_cleaning_agent

    llm = ChatOpenAI(model = "gpt-4o-mini")

    data_cleaning_agent = make_data_cleaning_agent(llm)

    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

    response = data_cleaning_agent.invoke({
        "user_instructions": "Don't remove outliers when cleaning the data.",
        "data_raw": df.to_dict(),
        "max_retries":3,
        "retry_count":0
    })

    pd.DataFrame(response['data_cleaned'])
    ```

    Returns
    -------
    app : langchain.graphs.StateGraph
        The data cleaning agent as a state graph.
    """
    llm = model

    # Setup Log Directory.
    # exist_ok=True avoids the check-then-create race of os.path.exists + makedirs.
    if log:
        if log_path is None:
            log_path = LOG_PATH
        os.makedirs(log_path, exist_ok=True)

    # Define GraphState for the router.
    # This is the shared state threaded through every node in the graph.
    class GraphState(TypedDict):
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        data_raw: dict
        data_cleaned: dict
        all_datasets_summary: str
        data_cleaner_function: str
        data_cleaner_function_path: str
        data_cleaner_function_name: str
        data_cleaner_error: str
        max_retries: int
        retry_count: int


    def recommend_cleaning_steps(state: GraphState):
        """
        Recommend a series of data cleaning steps based on the input data.
        These recommended steps will be appended to the user_instructions.
        """
        print("---DATA CLEANING AGENT----")
        print("    * RECOMMEND CLEANING STEPS")

        # Prompt to get recommended steps from the LLM
        recommend_steps_prompt = PromptTemplate(
            template="""
            You are a Data Cleaning Expert. Given the following information about the data,
            recommend a series of numbered steps to take to clean and preprocess it.
            The steps should be tailored to the data characteristics and should be helpful
            for a data cleaning agent that will be implemented.

            General Steps:
            Things that should be considered in the data cleaning steps:

            * Removing columns if more than 40 percent of the data is missing
            * Imputing missing values with the mean of the column if the column is numeric
            * Imputing missing values with the mode of the column if the column is categorical
            * Converting columns to the correct data type
            * Removing duplicate rows
            * Removing rows with missing values
            * Removing rows with extreme outliers (3X the interquartile range)

            Custom Steps:
            * Analyze the data to determine if any additional data cleaning steps are needed.
            * Recommend steps that are specific to the data provided. Include why these steps are necessary or beneficial.
            * If no additional steps are needed, simply state that no additional steps are required.

            IMPORTANT:
            Make sure to take into account any additional user instructions that may add, remove or modify some of these steps. Include comments in your code to explain your reasoning for each step. Include comments if something is not done because a user requested. Include comments if something is done because a user requested.

            User instructions:
            {user_instructions}

            Previously Recommended Steps (if any):
            {recommended_steps}

            Below are summaries of all datasets provided:
            {all_datasets_summary}

            Return the steps as a bullet point list (no code, just the steps).

            Avoid these:
            1. Do not include steps to save files.
            """,
            input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
        )

        data_raw = state.get("data_raw")
        df = pd.DataFrame.from_dict(data_raw)

        all_datasets_summary = summarize_dataframes([df])

        all_datasets_summary_str = "\n\n".join(all_datasets_summary)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "recommended_steps": state.get("recommended_steps"),
            "all_datasets_summary": all_datasets_summary_str
        })

        return {
            "recommended_steps": "\n\n# Recommended Data Cleaning Steps:\n" + recommended_steps.content.strip(),
            "all_datasets_summary": all_datasets_summary_str
        }

    def create_data_cleaner_code(state: GraphState):
        """Generate the data_cleaner() function source from the recommended steps."""
        print("    * CREATE DATA CLEANER CODE")

        data_cleaning_prompt = PromptTemplate(
            template="""
            You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.

            Recommended Steps:
            {recommended_steps}

            You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.

            Below are summaries of all datasets provided. Use this information about the data to help determine how to clean the data:

            {all_datasets_summary}

            Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that incldues all imports inside the function.

            Return code to provide the data cleaning function:

            def data_cleaner(data_raw):
                import pandas as pd
                import numpy as np
                ...
                return data_cleaned

            Best Practices and Error Preventions:

            Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.

            """,
            input_variables=["recommended_steps", "all_datasets_summary"]
        )

        data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()

        response = data_cleaning_agent.invoke({
            "recommended_steps": state.get("recommended_steps"),
            "all_datasets_summary": state.get("all_datasets_summary")
        })

        # Normalize the generated code: imports moved inside the function,
        # provenance comments added at the top.
        response = relocate_imports_inside_function(response)
        response = add_comments_to_top(response, agent_name=AGENT_NAME)

        # For logging: store the code generated:
        file_path, file_name = log_ai_function(
            response=response,
            file_name="data_cleaner.py",
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "data_cleaner_function" : response,
            "data_cleaner_function_path": file_path,
            "data_cleaner_function_name": file_name
        }

    def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
        """Interrupt and ask the user to approve or revise the recommended steps."""
        return node_func_human_review(
            state=state,
            prompt_text="Is the following data cleaning instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
            yes_goto="create_data_cleaner_code",
            no_goto="recommend_cleaning_steps",
            user_instructions_key="user_instructions",
            recommended_steps_key="recommended_steps"
        )

    def execute_data_cleaner_code(state):
        """Run the generated data_cleaner() on data_raw; capture result or error."""
        return node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            result_key="data_cleaned",
            error_key="data_cleaner_error",
            code_snippet_key="data_cleaner_function",
            agent_function_name="data_cleaner",
            # State stores data as dicts; convert to DataFrame for the function
            # and back to a dict for the state.
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda df: df.to_dict(),
            error_message_prefix="An error occurred during data cleaning: "
        )

    def fix_data_cleaner_code(state: GraphState):
        """Ask the LLM to repair a broken data_cleaner() using the last error."""
        data_cleaner_prompt = """
        You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.

        Make sure to only return the function definition for data_cleaner().

        Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.

        This is the broken code (please fix):
        {code_snippet}

        Last Known Error:
        {error}
        """

        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="data_cleaner_function",
            error_key="data_cleaner_error",
            llm=llm,
            prompt_template=data_cleaner_prompt,
            agent_name=AGENT_NAME,
            log=log,
            file_path=state.get("data_cleaner_function_path"),
        )

    def explain_data_cleaner_code(state: GraphState):
        """Summarize the generated cleaning function for the message stream."""
        return node_func_explain_agent_code(
            state=state,
            code_snippet_key="data_cleaner_function",
            result_key="messages",
            error_key="data_cleaner_error",
            llm=llm,
            role=AGENT_NAME,
            explanation_prompt_template="""
            Explain the data cleaning steps that the data cleaning agent performed in this function.
            Keep the summary succinct and to the point.\n\n# Data Cleaning Agent:\n\n{code}
            """,
            success_prefix="# Data Cleaning Agent:\n\n ",
            error_message="The Data Cleaning Agent encountered an error during data cleaning. Data could not be explained."
        )

    # Define the graph
    node_functions = {
        "recommend_cleaning_steps": recommend_cleaning_steps,
        "human_review": human_review,
        "create_data_cleaner_code": create_data_cleaner_code,
        "execute_data_cleaner_code": execute_data_cleaner_code,
        "fix_data_cleaner_code": fix_data_cleaner_code,
        "explain_data_cleaner_code": explain_data_cleaner_code
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_cleaning_steps",
        create_code_node_name="create_data_cleaner_code",
        execute_code_node_name="execute_data_cleaner_code",
        fix_code_node_name="fix_data_cleaner_code",
        explain_code_node_name="explain_data_cleaner_code",
        error_key="data_cleaner_error",
        human_in_the_loop=human_in_the_loop,  # or False
        human_review_node_name="human_review",
        # A checkpointer is only required when the graph interrupts for review.
        checkpointer=MemorySaver() if human_in_the_loop else None
    )

    return app