ai-data-science-team 0.0.0.9005__py3-none-any.whl → 0.0.0.90061__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +1 -0
- ai_data_science_team/agents/data_cleaning_agent.py +13 -5
- ai_data_science_team/agents/data_wrangling_agent.py +13 -16
- ai_data_science_team/agents/feature_engineering_agent.py +13 -5
- ai_data_science_team/agents/sql_database_agent.py +379 -0
- ai_data_science_team/templates/agent_templates.py +154 -37
- ai_data_science_team/tools/logging.py +1 -1
- ai_data_science_team/tools/{data_analysis.py → metadata.py} +53 -2
- ai_data_science_team/tools/regex.py +1 -1
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/METADATA +4 -1
- ai_data_science_team-0.0.0.90061.dist-info/RECORD +20 -0
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +0 -19
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/top_level.txt +0 -0
ai_data_science_team/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.0.
|
1
|
+
__version__ = "0.0.0.90061"
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
|
2
2
|
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
|
3
3
|
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
|
4
|
+
from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
|
4
5
|
|
@@ -26,7 +26,7 @@ from ai_data_science_team.templates.agent_templates import(
|
|
26
26
|
)
|
27
27
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
28
|
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
29
|
-
from ai_data_science_team.tools.
|
29
|
+
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
30
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
31
|
|
32
32
|
# Setup
|
@@ -35,7 +35,7 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
|
35
35
|
|
36
36
|
# Agent
|
37
37
|
|
38
|
-
def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
|
38
|
+
def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
|
39
39
|
"""
|
40
40
|
Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
|
41
41
|
ways, such as removing columns with more than 40% missing values, imputing missing
|
@@ -71,6 +71,10 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
71
71
|
Defaults to True.
|
72
72
|
human_in_the_loop : bool, optional
|
73
73
|
Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the data cleaning instructions. Defaults to False.
|
74
|
+
bypass_recommended_steps : bool, optional
|
75
|
+
Bypass the recommendation step, by default False
|
76
|
+
bypass_explain_code : bool, optional
|
77
|
+
Bypass the code explanation step, by default False.
|
74
78
|
|
75
79
|
Examples
|
76
80
|
-------
|
@@ -180,7 +184,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
180
184
|
data_raw = state.get("data_raw")
|
181
185
|
df = pd.DataFrame.from_dict(data_raw)
|
182
186
|
|
183
|
-
all_datasets_summary =
|
187
|
+
all_datasets_summary = get_dataframe_summary([df])
|
184
188
|
|
185
189
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
186
190
|
|
@@ -197,6 +201,8 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
197
201
|
}
|
198
202
|
|
199
203
|
def create_data_cleaner_code(state: GraphState):
|
204
|
+
if bypass_recommended_steps:
|
205
|
+
print("---DATA CLEANING AGENT----")
|
200
206
|
print(" * CREATE DATA CLEANER CODE")
|
201
207
|
|
202
208
|
data_cleaning_prompt = PromptTemplate(
|
@@ -274,7 +280,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
274
280
|
code_snippet_key="data_cleaner_function",
|
275
281
|
agent_function_name="data_cleaner",
|
276
282
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
277
|
-
post_processing=lambda df: df.to_dict(),
|
283
|
+
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
278
284
|
error_message_prefix="An error occurred during data cleaning: "
|
279
285
|
)
|
280
286
|
|
@@ -341,7 +347,9 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
341
347
|
error_key="data_cleaner_error",
|
342
348
|
human_in_the_loop=human_in_the_loop, # or False
|
343
349
|
human_review_node_name="human_review",
|
344
|
-
checkpointer=MemorySaver() if human_in_the_loop else None
|
350
|
+
checkpointer=MemorySaver() if human_in_the_loop else None,
|
351
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
352
|
+
bypass_explain_code=bypass_explain_code,
|
345
353
|
)
|
346
354
|
|
347
355
|
return app
|
@@ -24,14 +24,14 @@ from ai_data_science_team.templates.agent_templates import(
|
|
24
24
|
)
|
25
25
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
26
26
|
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
27
|
-
from ai_data_science_team.tools.
|
27
|
+
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
28
28
|
from ai_data_science_team.tools.logging import log_ai_function
|
29
29
|
|
30
30
|
# Setup Logging Path
|
31
31
|
AGENT_NAME = "data_wrangling_agent"
|
32
32
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
33
33
|
|
34
|
-
def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
|
34
|
+
def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
|
35
35
|
"""
|
36
36
|
Creates a data wrangling agent that can be run on one or more datasets. The agent can be
|
37
37
|
instructed to perform common data wrangling steps such as:
|
@@ -63,6 +63,10 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
63
63
|
human_in_the_loop : bool, optional
|
64
64
|
Whether or not to use human in the loop. If True, adds an interrupt and human-in-the-loop
|
65
65
|
step that asks the user to review the data wrangling instructions. Defaults to False.
|
66
|
+
bypass_recommended_steps : bool, optional
|
67
|
+
Bypass the recommendation step, by default False
|
68
|
+
bypass_explain_code : bool, optional
|
69
|
+
Bypass the code explanation step, by default False.
|
66
70
|
|
67
71
|
Example
|
68
72
|
-------
|
@@ -139,7 +143,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
139
143
|
|
140
144
|
# Create a summary for all datasets
|
141
145
|
# We'll include a short sample and info for each dataset
|
142
|
-
all_datasets_summary =
|
146
|
+
all_datasets_summary = get_dataframe_summary(dataframes)
|
143
147
|
|
144
148
|
# Join all datasets summaries into one big text block
|
145
149
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
@@ -190,6 +194,8 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
190
194
|
|
191
195
|
|
192
196
|
def create_data_wrangler_code(state: GraphState):
|
197
|
+
if bypass_recommended_steps:
|
198
|
+
print("---DATA WRANGLING AGENT----")
|
193
199
|
print(" * CREATE DATA WRANGLER CODE")
|
194
200
|
|
195
201
|
data_wrangling_prompt = PromptTemplate(
|
@@ -269,17 +275,6 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
269
275
|
)
|
270
276
|
|
271
277
|
def execute_data_wrangler_code(state: GraphState):
|
272
|
-
|
273
|
-
# Handle multiple datasets as lists
|
274
|
-
# def pre_processing(data):
|
275
|
-
# df = []
|
276
|
-
# for i in range(len(data)):
|
277
|
-
# df[i] = pd.DataFrame.from_dict(data[i])
|
278
|
-
# return df
|
279
|
-
|
280
|
-
# def post_processing(df):
|
281
|
-
# return df.to_dict()
|
282
|
-
|
283
278
|
return node_func_execute_agent_code_on_data(
|
284
279
|
state=state,
|
285
280
|
data_key="data_raw",
|
@@ -288,7 +283,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
288
283
|
code_snippet_key="data_wrangler_function",
|
289
284
|
agent_function_name="data_wrangler",
|
290
285
|
# pre_processing=pre_processing,
|
291
|
-
|
286
|
+
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
292
287
|
error_message_prefix="An error occurred during data wrangling: "
|
293
288
|
)
|
294
289
|
|
@@ -355,7 +350,9 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
|
|
355
350
|
error_key="data_wrangler_error",
|
356
351
|
human_in_the_loop=human_in_the_loop,
|
357
352
|
human_review_node_name="human_review",
|
358
|
-
checkpointer=MemorySaver() if human_in_the_loop else None
|
353
|
+
checkpointer=MemorySaver() if human_in_the_loop else None,
|
354
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
355
|
+
bypass_explain_code=bypass_explain_code,
|
359
356
|
)
|
360
357
|
|
361
358
|
return app
|
@@ -26,7 +26,7 @@ from ai_data_science_team.templates.agent_templates import(
|
|
26
26
|
)
|
27
27
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
28
|
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
29
|
-
from ai_data_science_team.tools.
|
29
|
+
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
30
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
31
|
|
32
32
|
# Setup
|
@@ -35,7 +35,7 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
|
35
35
|
|
36
36
|
# * Feature Engineering Agent
|
37
37
|
|
38
|
-
def make_feature_engineering_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
|
38
|
+
def make_feature_engineering_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
|
39
39
|
"""
|
40
40
|
Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
|
41
41
|
techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
|
@@ -71,6 +71,10 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
71
71
|
Defaults to True.
|
72
72
|
human_in_the_loop : bool, optional
|
73
73
|
Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the feature engineering instructions. Defaults to False.
|
74
|
+
bypass_recommended_steps : bool, optional
|
75
|
+
Bypass the recommendation step, by default False
|
76
|
+
bypass_explain_code : bool, optional
|
77
|
+
Bypass the code explanation step, by default False.
|
74
78
|
|
75
79
|
Examples
|
76
80
|
-------
|
@@ -185,7 +189,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
185
189
|
data_raw = state.get("data_raw")
|
186
190
|
df = pd.DataFrame.from_dict(data_raw)
|
187
191
|
|
188
|
-
all_datasets_summary =
|
192
|
+
all_datasets_summary = get_dataframe_summary([df])
|
189
193
|
|
190
194
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
191
195
|
|
@@ -212,6 +216,8 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
212
216
|
)
|
213
217
|
|
214
218
|
def create_feature_engineering_code(state: GraphState):
|
219
|
+
if bypass_recommended_steps:
|
220
|
+
print("---FEATURE ENGINEERING AGENT----")
|
215
221
|
print(" * CREATE FEATURE ENGINEERING CODE")
|
216
222
|
|
217
223
|
feature_engineering_prompt = PromptTemplate(
|
@@ -298,7 +304,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
298
304
|
code_snippet_key="feature_engineer_function",
|
299
305
|
agent_function_name="feature_engineer",
|
300
306
|
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
301
|
-
post_processing=lambda df: df.to_dict(),
|
307
|
+
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
302
308
|
error_message_prefix="An error occurred during feature engineering: "
|
303
309
|
)
|
304
310
|
|
@@ -362,7 +368,9 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
|
|
362
368
|
error_key="feature_engineer_error",
|
363
369
|
human_in_the_loop=human_in_the_loop,
|
364
370
|
human_review_node_name="human_review",
|
365
|
-
checkpointer=MemorySaver() if human_in_the_loop else None
|
371
|
+
checkpointer=MemorySaver() if human_in_the_loop else None,
|
372
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
373
|
+
bypass_explain_code=bypass_explain_code,
|
366
374
|
)
|
367
375
|
|
368
376
|
return app
|
@@ -0,0 +1,379 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
from typing import TypedDict, Annotated, Sequence, Literal
|
4
|
+
import operator
|
5
|
+
|
6
|
+
from langchain.prompts import PromptTemplate
|
7
|
+
from langchain_core.messages import BaseMessage
|
8
|
+
|
9
|
+
from langgraph.types import Command
|
10
|
+
from langgraph.checkpoint.memory import MemorySaver
|
11
|
+
|
12
|
+
import os
|
13
|
+
import io
|
14
|
+
import pandas as pd
|
15
|
+
import sqlalchemy as sql
|
16
|
+
|
17
|
+
from ai_data_science_team.templates.agent_templates import(
|
18
|
+
node_func_execute_agent_from_sql_connection,
|
19
|
+
node_func_human_review,
|
20
|
+
node_func_fix_agent_code,
|
21
|
+
node_func_explain_agent_code,
|
22
|
+
create_coding_agent_graph
|
23
|
+
)
|
24
|
+
from ai_data_science_team.tools.parsers import PythonOutputParser, SQLOutputParser
|
25
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
26
|
+
from ai_data_science_team.tools.metadata import get_database_metadata
|
27
|
+
from ai_data_science_team.tools.logging import log_ai_function
|
28
|
+
|
29
|
+
# Setup
|
30
|
+
AGENT_NAME = "sql_database_agent"
|
31
|
+
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
32
|
+
|
33
|
+
|
34
|
+
def make_sql_database_agent(model, connection, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
|
35
|
+
"""
|
36
|
+
Creates a SQL Database Agent that can recommend SQL steps and generate SQL code to query a database.
|
37
|
+
|
38
|
+
Parameters
|
39
|
+
----------
|
40
|
+
model : ChatOpenAI
|
41
|
+
The language model to use for the agent.
|
42
|
+
connection : sqlalchemy.engine.base.Engine
|
43
|
+
The connection to the SQL database.
|
44
|
+
log : bool, optional
|
45
|
+
Whether to log the generated code, by default False
|
46
|
+
log_path : str, optional
|
47
|
+
The path to the log directory, by default None
|
48
|
+
overwrite : bool, optional
|
49
|
+
Whether to overwrite the existing log file, by default True
|
50
|
+
human_in_the_loop : bool, optional
|
51
|
+
Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the feature engineering instructions. Defaults to False.
|
52
|
+
bypass_recommended_steps : bool, optional
|
53
|
+
Bypass the recommendation step, by default False
|
54
|
+
bypass_explain_code : bool, optional
|
55
|
+
Bypass the code explanation step, by default False.
|
56
|
+
|
57
|
+
Returns
|
58
|
+
-------
|
59
|
+
app : langchain.graphs.StateGraph
|
60
|
+
The data cleaning agent as a state graph.
|
61
|
+
|
62
|
+
Examples
|
63
|
+
--------
|
64
|
+
```python
|
65
|
+
from ai_data_science_team.agents import make_sql_database_agent
|
66
|
+
import sqlalchemy as sql
|
67
|
+
from langchain_openai import ChatOpenAI
|
68
|
+
|
69
|
+
sql_engine = sql.create_engine("sqlite:///data/leads_scored.db")
|
70
|
+
|
71
|
+
conn = sql_engine.connect()
|
72
|
+
|
73
|
+
llm = ChatOpenAI(model="gpt-4o-mini")
|
74
|
+
|
75
|
+
sql_agent = make_sql_database_agent(
|
76
|
+
model=llm,
|
77
|
+
connection=conn
|
78
|
+
)
|
79
|
+
|
80
|
+
sql_agent
|
81
|
+
|
82
|
+
response = sql_agent.invoke({
|
83
|
+
"user_instructions": "List the tables in the database",
|
84
|
+
"max_retries":3,
|
85
|
+
"retry_count":0
|
86
|
+
})
|
87
|
+
```
|
88
|
+
|
89
|
+
"""
|
90
|
+
|
91
|
+
is_engine = isinstance(connection, sql.engine.base.Engine)
|
92
|
+
conn = connection.connect() if is_engine else connection
|
93
|
+
|
94
|
+
llm = model
|
95
|
+
|
96
|
+
# Setup Log Directory
|
97
|
+
if log:
|
98
|
+
if log_path is None:
|
99
|
+
log_path = LOG_PATH
|
100
|
+
if not os.path.exists(log_path):
|
101
|
+
os.makedirs(log_path)
|
102
|
+
|
103
|
+
class GraphState(TypedDict):
|
104
|
+
messages: Annotated[Sequence[BaseMessage], operator.add]
|
105
|
+
user_instructions: str
|
106
|
+
recommended_steps: str
|
107
|
+
data_sql: dict
|
108
|
+
all_sql_database_summary: str
|
109
|
+
sql_query_code: str
|
110
|
+
sql_database_function: str
|
111
|
+
sql_database_function_path: str
|
112
|
+
sql_database_function_name: str
|
113
|
+
sql_database_error: str
|
114
|
+
max_retries: int
|
115
|
+
retry_count: int
|
116
|
+
|
117
|
+
def recommend_sql_steps(state: GraphState):
|
118
|
+
|
119
|
+
print("---SQL DATABASE AGENT---")
|
120
|
+
print(" * RECOMMEND SQL QUERY STEPS")
|
121
|
+
|
122
|
+
|
123
|
+
# Prompt to get recommended steps from the LLM
|
124
|
+
recommend_steps_prompt = PromptTemplate(
|
125
|
+
template="""
|
126
|
+
You are a SQL Database Instructions Expert. Given the following information about the SQL database,
|
127
|
+
recommend a series of numbered steps to take to collect the data and process it according to user instructions.
|
128
|
+
The steps should be tailored to the SQL database characteristics and should be helpful
|
129
|
+
for a sql database coding agent that will write the SQL code.
|
130
|
+
|
131
|
+
IMPORTANT INSTRUCTIONS:
|
132
|
+
- Take into account the user instructions and the previously recommended steps.
|
133
|
+
- If no user instructions are provided, just return the steps needed to understand the database.
|
134
|
+
- Take into account the database dialect and the tables and columns in the database.
|
135
|
+
- Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
|
136
|
+
|
137
|
+
|
138
|
+
User instructions / Question:
|
139
|
+
{user_instructions}
|
140
|
+
|
141
|
+
Previously Recommended Steps (if any):
|
142
|
+
{recommended_steps}
|
143
|
+
|
144
|
+
Below are summaries of the database metadata and the SQL tables:
|
145
|
+
{all_sql_database_summary}
|
146
|
+
|
147
|
+
Return the steps as a numbered point list (no code, just the steps).
|
148
|
+
|
149
|
+
Consider these:
|
150
|
+
|
151
|
+
1. Consider the database dialect and the tables and columns in the database.
|
152
|
+
|
153
|
+
|
154
|
+
Avoid these:
|
155
|
+
1. Do not include steps to save files.
|
156
|
+
2. Do not include steps to modify existing tables, create new tables or modify the database schema.
|
157
|
+
3. Do not include steps that alter the existing data in the database.
|
158
|
+
4. Make sure not to include unsafe code that could cause data loss or corruption or SQL injections.
|
159
|
+
|
160
|
+
""",
|
161
|
+
input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
|
162
|
+
)
|
163
|
+
|
164
|
+
# Create a connection if needed
|
165
|
+
is_engine = isinstance(connection, sql.engine.base.Engine)
|
166
|
+
conn = connection.connect() if is_engine else connection
|
167
|
+
|
168
|
+
# Get the database metadata
|
169
|
+
all_sql_database_summary = get_database_metadata(conn, n_values=10)
|
170
|
+
|
171
|
+
steps_agent = recommend_steps_prompt | llm
|
172
|
+
|
173
|
+
recommended_steps = steps_agent.invoke({
|
174
|
+
"user_instructions": state.get("user_instructions"),
|
175
|
+
"recommended_steps": state.get("recommended_steps"),
|
176
|
+
"all_sql_database_summary": all_sql_database_summary
|
177
|
+
})
|
178
|
+
|
179
|
+
return {
|
180
|
+
"recommended_steps": "\n\n# Recommended SQL Database Steps:\n" + recommended_steps.content.strip(),
|
181
|
+
"all_sql_database_summary": all_sql_database_summary
|
182
|
+
}
|
183
|
+
|
184
|
+
def create_sql_query_code(state: GraphState):
|
185
|
+
if bypass_recommended_steps:
|
186
|
+
print("---SQL DATABASE AGENT---")
|
187
|
+
print(" * CREATE SQL QUERY CODE")
|
188
|
+
|
189
|
+
# Prompt to get the SQL code from the LLM
|
190
|
+
sql_query_code_prompt = PromptTemplate(
|
191
|
+
template="""
|
192
|
+
You are a SQL Database Coding Expert. Given the following information about the SQL database,
|
193
|
+
write the SQL code to collect the data and process it according to user instructions.
|
194
|
+
The code should be tailored to the SQL database characteristics and should take into account user instructions, recommended steps, database and table characteristics.
|
195
|
+
|
196
|
+
IMPORTANT INSTRUCTIONS:
|
197
|
+
- Do not use a LIMIT clause unless a user specifies a limit to be returned.
|
198
|
+
- Return SQL in ```sql ``` format.
|
199
|
+
- Only return a single query if possible.
|
200
|
+
- Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
|
201
|
+
- Pay attention to the SQL dialect from the database summary metadata. Write the SQL code according to the dialect specified.
|
202
|
+
|
203
|
+
|
204
|
+
User instructions / Question:
|
205
|
+
{user_instructions}
|
206
|
+
|
207
|
+
Recommended Steps:
|
208
|
+
{recommended_steps}
|
209
|
+
|
210
|
+
Below are summaries of the database metadata and the SQL tables:
|
211
|
+
{all_sql_database_summary}
|
212
|
+
|
213
|
+
Return:
|
214
|
+
- The SQL code in ```sql ``` format to collect the data and process it according to the user instructions.
|
215
|
+
|
216
|
+
Avoid these:
|
217
|
+
- Do not include steps to save files.
|
218
|
+
- Do not include steps to modify existing tables, create new tables or modify the database schema.
|
219
|
+
- Make sure not to alter the existing data in the database.
|
220
|
+
- Make sure not to include unsafe code that could cause data loss or corruption.
|
221
|
+
|
222
|
+
""",
|
223
|
+
input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
|
224
|
+
)
|
225
|
+
|
226
|
+
# Create a connection if needed
|
227
|
+
is_engine = isinstance(connection, sql.engine.base.Engine)
|
228
|
+
conn = connection.connect() if is_engine else connection
|
229
|
+
|
230
|
+
# Get the database metadata
|
231
|
+
all_sql_database_summary = get_database_metadata(conn, n_values=10)
|
232
|
+
|
233
|
+
sql_query_code_agent = sql_query_code_prompt | llm | SQLOutputParser()
|
234
|
+
|
235
|
+
sql_query_code = sql_query_code_agent.invoke({
|
236
|
+
"user_instructions": state.get("user_instructions"),
|
237
|
+
"recommended_steps": state.get("recommended_steps"),
|
238
|
+
"all_sql_database_summary": all_sql_database_summary
|
239
|
+
})
|
240
|
+
|
241
|
+
print(" * CREATE PYTHON FUNCTION TO RUN SQL CODE")
|
242
|
+
|
243
|
+
response = f"""
|
244
|
+
def sql_database_pipeline(connection):
|
245
|
+
import pandas as pd
|
246
|
+
import sqlalchemy as sql
|
247
|
+
|
248
|
+
# Create a connection if needed
|
249
|
+
is_engine = isinstance(connection, sql.engine.base.Engine)
|
250
|
+
conn = connection.connect() if is_engine else connection
|
251
|
+
|
252
|
+
sql_query = '''
|
253
|
+
{sql_query_code}
|
254
|
+
'''
|
255
|
+
|
256
|
+
return pd.read_sql(sql_query, connection)
|
257
|
+
"""
|
258
|
+
|
259
|
+
response = add_comments_to_top(response, AGENT_NAME)
|
260
|
+
|
261
|
+
# For logging: store the code generated
|
262
|
+
file_path, file_name = log_ai_function(
|
263
|
+
response=response,
|
264
|
+
file_name="sql_database.py",
|
265
|
+
log=log,
|
266
|
+
log_path=log_path,
|
267
|
+
overwrite=overwrite
|
268
|
+
)
|
269
|
+
|
270
|
+
return {
|
271
|
+
"sql_query_code": sql_query_code,
|
272
|
+
"sql_database_function": response,
|
273
|
+
"sql_database_function_path": file_path,
|
274
|
+
"sql_database_function_name": file_name
|
275
|
+
}
|
276
|
+
|
277
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_sql_steps", "create_sql_query_code"]]:
|
278
|
+
return node_func_human_review(
|
279
|
+
state=state,
|
280
|
+
prompt_text="Are the following SQL database querying steps correct? (Answer 'yes' or provide modifications)\n{steps}",
|
281
|
+
yes_goto="create_sql_query_code",
|
282
|
+
no_goto="recommend_sql_steps",
|
283
|
+
user_instructions_key="user_instructions",
|
284
|
+
recommended_steps_key="recommended_steps"
|
285
|
+
)
|
286
|
+
|
287
|
+
def execute_sql_database_code(state: GraphState):
|
288
|
+
|
289
|
+
is_engine = isinstance(connection, sql.engine.base.Engine)
|
290
|
+
conn = connection.connect() if is_engine else connection
|
291
|
+
|
292
|
+
return node_func_execute_agent_from_sql_connection(
|
293
|
+
state=state,
|
294
|
+
connection=conn,
|
295
|
+
result_key="data_sql",
|
296
|
+
error_key="sql_database_error",
|
297
|
+
code_snippet_key="sql_database_function",
|
298
|
+
agent_function_name="sql_database_pipeline",
|
299
|
+
post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
|
300
|
+
error_message_prefix="An error occurred during executing the sql database pipeline: "
|
301
|
+
)
|
302
|
+
|
303
|
+
def fix_sql_database_code(state: GraphState):
|
304
|
+
prompt = """
|
305
|
+
You are a SQL Database Agent code fixer. Your job is to create a sql_database_pipeline(connection) function that can be run on a sql connection. The function is currently broken and needs to be fixed.
|
306
|
+
|
307
|
+
Make sure to only return the function definition for sql_database_pipeline().
|
308
|
+
|
309
|
+
Return Python code in ```python``` format with a single function definition, sql_database_pipeline(connection), that includes all imports inside the function. The connection object is a SQLAlchemy connection object. Don't specify the class of the connection object, just use it as an argument to the function.
|
310
|
+
|
311
|
+
This is the broken code (please fix):
|
312
|
+
{code_snippet}
|
313
|
+
|
314
|
+
Last Known Error:
|
315
|
+
{error}
|
316
|
+
"""
|
317
|
+
|
318
|
+
return node_func_fix_agent_code(
|
319
|
+
state=state,
|
320
|
+
code_snippet_key="sql_database_function",
|
321
|
+
error_key="sql_database_error",
|
322
|
+
llm=llm,
|
323
|
+
prompt_template=prompt,
|
324
|
+
agent_name=AGENT_NAME,
|
325
|
+
log=log,
|
326
|
+
file_path=state.get("sql_database_function_path", None),
|
327
|
+
)
|
328
|
+
|
329
|
+
def explain_sql_database_code(state: GraphState):
|
330
|
+
return node_func_explain_agent_code(
|
331
|
+
state=state,
|
332
|
+
code_snippet_key="sql_database_function",
|
333
|
+
result_key="messages",
|
334
|
+
error_key="sql_database_error",
|
335
|
+
llm=llm,
|
336
|
+
role=AGENT_NAME,
|
337
|
+
explanation_prompt_template="""
|
338
|
+
Explain the SQL steps that the SQL Database agent performed in this function.
|
339
|
+
Keep the summary succinct and to the point.\n\n# SQL Database Agent:\n\n{code}
|
340
|
+
""",
|
341
|
+
success_prefix="# SQL Database Agent:\n\n",
|
342
|
+
error_message="The SQL Database Agent encountered an error during SQL Query Analysis. No SQL function explanation is returned."
|
343
|
+
)
|
344
|
+
|
345
|
+
# Create the graph
|
346
|
+
node_functions = {
|
347
|
+
"recommend_sql_steps": recommend_sql_steps,
|
348
|
+
"human_review": human_review,
|
349
|
+
"create_sql_query_code": create_sql_query_code,
|
350
|
+
"execute_sql_database_code": execute_sql_database_code,
|
351
|
+
"fix_sql_database_code": fix_sql_database_code,
|
352
|
+
"explain_sql_database_code": explain_sql_database_code
|
353
|
+
}
|
354
|
+
|
355
|
+
app = create_coding_agent_graph(
|
356
|
+
GraphState=GraphState,
|
357
|
+
node_functions=node_functions,
|
358
|
+
recommended_steps_node_name="recommend_sql_steps",
|
359
|
+
create_code_node_name="create_sql_query_code",
|
360
|
+
execute_code_node_name="execute_sql_database_code",
|
361
|
+
fix_code_node_name="fix_sql_database_code",
|
362
|
+
explain_code_node_name="explain_sql_database_code",
|
363
|
+
error_key="sql_database_error",
|
364
|
+
human_in_the_loop=human_in_the_loop,
|
365
|
+
human_review_node_name="human_review",
|
366
|
+
checkpointer=MemorySaver() if human_in_the_loop else None,
|
367
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
368
|
+
bypass_explain_code=bypass_explain_code,
|
369
|
+
)
|
370
|
+
|
371
|
+
return app
|
372
|
+
|
373
|
+
|
374
|
+
|
375
|
+
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
|
@@ -3,6 +3,7 @@ from langgraph.graph import StateGraph, END
|
|
3
3
|
from langgraph.types import interrupt, Command
|
4
4
|
|
5
5
|
import pandas as pd
|
6
|
+
import sqlalchemy as sql
|
6
7
|
|
7
8
|
from typing import Any, Callable, Dict, Type, Optional
|
8
9
|
|
@@ -22,7 +23,9 @@ def create_coding_agent_graph(
|
|
22
23
|
retry_count_key: str = "retry_count",
|
23
24
|
human_in_the_loop: bool = False,
|
24
25
|
human_review_node_name: str = "human_review",
|
25
|
-
checkpointer: Optional[Callable] = None
|
26
|
+
checkpointer: Optional[Callable] = None,
|
27
|
+
bypass_recommended_steps: bool = False,
|
28
|
+
bypass_explain_code: bool = False,
|
26
29
|
):
|
27
30
|
"""
|
28
31
|
Creates a generic agent graph using the provided node functions and node names.
|
@@ -63,7 +66,11 @@ def create_coding_agent_graph(
|
|
63
66
|
The node name for human review if human_in_the_loop is True.
|
64
67
|
checkpointer : callable, optional
|
65
68
|
A checkpointer callable if desired.
|
66
|
-
|
69
|
+
bypass_recommended_steps : bool, optional
|
70
|
+
Whether to skip the recommended steps node.
|
71
|
+
bypass_explain_code : bool, optional
|
72
|
+
Whether to skip the final explain code node.
|
73
|
+
|
67
74
|
Returns
|
68
75
|
-------
|
69
76
|
app : langchain.graphs.StateGraph
|
@@ -72,50 +79,76 @@ def create_coding_agent_graph(
|
|
72
79
|
|
73
80
|
workflow = StateGraph(GraphState)
|
74
81
|
|
75
|
-
#
|
76
|
-
|
82
|
+
# Conditionally add the recommended-steps node
|
83
|
+
if not bypass_recommended_steps:
|
84
|
+
workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])
|
77
85
|
|
78
|
-
#
|
79
|
-
if human_in_the_loop:
|
80
|
-
workflow.add_node(human_review_node_name, node_functions[human_review_node_name])
|
81
|
-
|
82
|
-
# Add main nodes
|
86
|
+
# Always add create, execute, and fix nodes
|
83
87
|
workflow.add_node(create_code_node_name, node_functions[create_code_node_name])
|
84
88
|
workflow.add_node(execute_code_node_name, node_functions[execute_code_node_name])
|
85
89
|
workflow.add_node(fix_code_node_name, node_functions[fix_code_node_name])
|
86
|
-
|
90
|
+
|
91
|
+
# Conditionally add the explanation node
|
92
|
+
if not bypass_explain_code:
|
93
|
+
workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])
|
87
94
|
|
88
95
|
# Set the entry point
|
89
|
-
|
96
|
+
entry_point = create_code_node_name if bypass_recommended_steps else recommended_steps_node_name
|
97
|
+
workflow.set_entry_point(entry_point)
|
90
98
|
|
91
|
-
# Add edges
|
92
|
-
if
|
93
|
-
|
94
|
-
|
95
|
-
|
99
|
+
# Add edges for recommended steps
|
100
|
+
if not bypass_recommended_steps:
|
101
|
+
if human_in_the_loop:
|
102
|
+
workflow.add_edge(recommended_steps_node_name, human_review_node_name)
|
103
|
+
else:
|
104
|
+
workflow.add_edge(recommended_steps_node_name, create_code_node_name)
|
105
|
+
elif human_in_the_loop:
|
106
|
+
# Skip recommended steps but still include human review
|
107
|
+
workflow.add_edge(create_code_node_name, human_review_node_name)
|
96
108
|
|
97
|
-
#
|
109
|
+
# Create -> Execute
|
98
110
|
workflow.add_edge(create_code_node_name, execute_code_node_name)
|
99
111
|
|
100
|
-
#
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
state.get(
|
105
|
-
state.get(
|
106
|
-
state
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
112
|
+
# Define a helper to check if we have an error & can still retry
|
113
|
+
def error_and_can_retry(state):
|
114
|
+
return (
|
115
|
+
state.get(error_key) is not None
|
116
|
+
and state.get(retry_count_key) is not None
|
117
|
+
and state.get(max_retries_key) is not None
|
118
|
+
and state[retry_count_key] < state[max_retries_key]
|
119
|
+
)
|
120
|
+
|
121
|
+
# ---- Split into two branches for bypass_explain_code ----
|
122
|
+
if not bypass_explain_code:
|
123
|
+
# If we are NOT bypassing explain, the next node is fix_code if error,
|
124
|
+
# else explain_code. Then we wire explain_code -> END afterward.
|
125
|
+
workflow.add_conditional_edges(
|
126
|
+
execute_code_node_name,
|
127
|
+
lambda s: "fix_code" if error_and_can_retry(s) else "explain_code",
|
128
|
+
{
|
129
|
+
"fix_code": fix_code_node_name,
|
130
|
+
"explain_code": explain_code_node_name,
|
131
|
+
},
|
132
|
+
)
|
133
|
+
# Fix code -> Execute again
|
134
|
+
workflow.add_edge(fix_code_node_name, execute_code_node_name)
|
135
|
+
# explain_code -> END
|
136
|
+
workflow.add_edge(explain_code_node_name, END)
|
137
|
+
else:
|
138
|
+
# If we ARE bypassing explain_code, the next node is fix_code if error,
|
139
|
+
# else straight to END.
|
140
|
+
workflow.add_conditional_edges(
|
141
|
+
execute_code_node_name,
|
142
|
+
lambda s: "fix_code" if error_and_can_retry(s) else "END",
|
143
|
+
{
|
144
|
+
"fix_code": fix_code_node_name,
|
145
|
+
"END": END,
|
146
|
+
},
|
147
|
+
)
|
148
|
+
# Fix code -> Execute again
|
149
|
+
workflow.add_edge(fix_code_node_name, execute_code_node_name)
|
150
|
+
|
151
|
+
# Finally, compile
|
119
152
|
if human_in_the_loop and checkpointer is not None:
|
120
153
|
app = workflow.compile(checkpointer=checkpointer)
|
121
154
|
else:
|
@@ -124,6 +157,7 @@ def create_coding_agent_graph(
|
|
124
157
|
return app
|
125
158
|
|
126
159
|
|
160
|
+
|
127
161
|
def node_func_human_review(
|
128
162
|
state: Any,
|
129
163
|
prompt_text: str,
|
@@ -256,6 +290,88 @@ def node_func_execute_agent_code_on_data(
|
|
256
290
|
# if state.get("retry_count") == 0:
|
257
291
|
# 10/0
|
258
292
|
|
293
|
+
# Apply post-processing if provided
|
294
|
+
if post_processing is not None:
|
295
|
+
result = post_processing(result)
|
296
|
+
else:
|
297
|
+
if isinstance(result, pd.DataFrame):
|
298
|
+
result = result.to_dict()
|
299
|
+
|
300
|
+
except Exception as e:
|
301
|
+
print(e)
|
302
|
+
agent_error = f"{error_message_prefix}{str(e)}"
|
303
|
+
|
304
|
+
# Return results
|
305
|
+
output = {result_key: result, error_key: agent_error}
|
306
|
+
return output
|
307
|
+
|
308
|
+
def node_func_execute_agent_from_sql_connection(
    state: Any,
    connection: Any,
    code_snippet_key: str,
    result_key: str,
    error_key: str,
    agent_function_name: str,
    post_processing: Optional[Callable[[Any], Any]] = None,
    error_message_prefix: str = "An error occurred during agent execution: "
) -> Dict[str, Any]:
    """
    Execute a generic agent code defined in a code snippet retrieved from the state on a SQLAlchemy connection object
    and return the result.

    Parameters
    ----------
    state : Any
        A state object that supports `get(key: str)` method to retrieve values.
    connection : Any
        The SQLAlchemy connection or engine to use for executing the agent function.
        If an engine is passed, a connection is opened here and closed before returning.
    code_snippet_key : str
        The key in the state used to retrieve the Python code snippet defining the agent function.
    result_key : str
        The key in the state used to store the result of the agent function.
    error_key : str
        The key in the state used to store the error message if any.
    agent_function_name : str
        The name of the function (e.g., 'sql_database_agent') expected to be defined in the code snippet.
    post_processing : Callable[[Any], Any], optional
        A function to postprocess the output of the agent function before returning it.
    error_message_prefix : str, optional
        A prefix or full message to use in the error output if an exception occurs.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing the result and/or error messages. Keys are arbitrary,
        but typically include something like "result" or "error".

    Raises
    ------
    ValueError
        If `connection` is None, or if the expected agent function is not defined
        (or not callable) in the executed code snippet.
    """

    print(" * EXECUTING AGENT CODE ON SQL CONNECTION")

    # Validate the connection *before* touching it. (Previously this check ran
    # after `connection.connect()` had already been attempted, so a None
    # connection raised AttributeError instead of the intended ValueError.)
    if connection is None:
        raise ValueError("Connection object not found.")

    # Accept either an Engine (open a connection that we own and must close)
    # or an already-open Connection supplied by the caller.
    is_engine = isinstance(connection, sql.engine.base.Engine)
    conn = connection.connect() if is_engine else connection

    agent_code = state.get(code_snippet_key)

    agent_error = None
    result = None
    try:
        # Execute the code snippet to define the agent function.
        # NOTE(review): exec() of generated code is inherently unsafe; kept
        # as-is to preserve existing behavior, flagged for awareness.
        local_vars = {}
        global_vars = {}
        exec(agent_code, global_vars, local_vars)

        # Retrieve the agent function from the executed code.
        agent_function = local_vars.get(agent_function_name, None)
        if agent_function is None or not callable(agent_function):
            raise ValueError(f"Agent function '{agent_function_name}' not found or not callable in the provided code.")

        # Execute the agent function on the (possibly freshly opened)
        # connection. Previously the raw `connection` argument was passed,
        # leaving the opened `conn` unused and leaked when an Engine was given.
        try:
            result = agent_function(conn)

            # Apply post-processing if provided
            if post_processing is not None:
                result = post_processing(result)
            else:
                if isinstance(result, pd.DataFrame):
                    result = result.to_dict()

        except Exception as e:
            print(e)
            agent_error = f"{error_message_prefix}{str(e)}"
    finally:
        # Close the connection only if we opened it from an Engine here;
        # caller-supplied connections remain the caller's responsibility.
        if is_engine:
            conn.close()

    # Return results
    output = {result_key: result, error_key: agent_error}
    return output
|
269
385
|
|
386
|
+
|
270
387
|
def node_func_fix_agent_code(
|
271
388
|
state: Any,
|
272
389
|
code_snippet_key: str,
|
@@ -326,7 +443,7 @@ def node_func_fix_agent_code(
|
|
326
443
|
response = (llm | PythonOutputParser()).invoke(prompt)
|
327
444
|
|
328
445
|
response = relocate_imports_inside_function(response)
|
329
|
-
response = add_comments_to_top(response, agent_name=
|
446
|
+
response = add_comments_to_top(response, agent_name=agent_name)
|
330
447
|
|
331
448
|
# Log the response if requested
|
332
449
|
if log:
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import io
|
2
2
|
import pandas as pd
|
3
|
+
import sqlalchemy as sql
|
3
4
|
from typing import Union, List, Dict
|
4
5
|
|
5
|
-
def
|
6
|
+
def get_dataframe_summary(
|
6
7
|
dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
|
7
8
|
) -> List[str]:
|
8
9
|
"""
|
@@ -26,7 +27,7 @@ def summarize_dataframes(
|
|
26
27
|
"iris": data.frame,
|
27
28
|
"iris_target": data.target,
|
28
29
|
}
|
29
|
-
summaries =
|
30
|
+
summaries = get_dataframe_summary(dataframes)
|
30
31
|
print(summaries[0])
|
31
32
|
```
|
32
33
|
|
@@ -114,3 +115,53 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
|
|
114
115
|
{info_text}
|
115
116
|
"""
|
116
117
|
return summary_text.strip()
|
118
|
+
|
119
|
+
|
120
|
+
def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine], n_values: int = 10) -> str:
    """
    Collects metadata and sample data from a database.

    Parameters:
    -----------
    connection (sqlalchemy.engine.base.Connection or sqlalchemy.engine.base.Engine):
        An active SQLAlchemy connection or engine. If an engine is passed, a
        connection is opened here and closed before returning.
    n_values (int):
        Number of sample values to retrieve for each column.

    Returns:
    --------
    str: Formatted text with database metadata.
    """
    # If a connection is passed, use it; if an engine is passed, connect to it
    is_engine = isinstance(connection, sql.engine.base.Engine)
    conn = connection.connect() if is_engine else connection
    output = []

    try:
        # Engine metadata
        sql_engine = conn.engine
        output.append(f"Database Dialect: {sql_engine.dialect.name}")
        output.append(f"Driver: {sql_engine.driver}")
        output.append(f"Connection URL: {sql_engine.url}")

        # Inspect the database
        inspector = sql.inspect(sql_engine)
        output.append(f"Tables: {inspector.get_table_names()}")
        output.append(f"Schemas: {inspector.get_schema_names()}")

        # Quote identifiers with the dialect's own rules so that table/column
        # names that are reserved words or contain special characters still
        # produce valid SQL (previously they were interpolated raw).
        quote = sql_engine.dialect.identifier_preparer.quote

        # For each table, get the columns and their metadata
        for table_name in inspector.get_table_names():
            output.append(f"\nTable: {table_name}")
            for column in inspector.get_columns(table_name):
                output.append(f"  Column: {column['name']} Type: {column['type']}")
                # Fetch sample values for the column. Reads go through the
                # connection we already hold instead of opening fresh ones
                # from the engine for every query.
                # NOTE(review): LIMIT is not supported by every dialect
                # (e.g. MSSQL/Oracle) — same limitation as before.
                query = f"SELECT {quote(column['name'])} FROM {quote(table_name)} LIMIT {n_values}"
                data = pd.read_sql(query, conn)
                output.append(f"    First {n_values} Values: {data.values.flatten().tolist()}")
    finally:
        # Close connection if it was created inside this function
        if is_engine:
            conn.close()

    # Join all collected information into a single string
    return "\n".join(output)
|
@@ -64,7 +64,7 @@ def add_comments_to_top(code_text, agent_name="data_wrangler"):
|
|
64
64
|
header_comments = [
|
65
65
|
"# Disclaimer: This function was generated by AI. Please review before using.",
|
66
66
|
f"# Agent Name: {agent_name}",
|
67
|
-
f"# Time Created: {time_created}",
|
67
|
+
f"# Time Created: {time_created}\n",
|
68
68
|
""
|
69
69
|
]
|
70
70
|
|
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: ai-data-science-team
|
3
|
-
Version: 0.0.0.
|
3
|
+
Version: 0.0.0.90061
|
4
4
|
Summary: Build and run an AI-powered data science team.
|
5
5
|
Home-page: https://github.com/business-science/ai-data-science-team
|
6
6
|
Author: Matt Dancho
|
@@ -58,6 +58,7 @@ This project is a work in progress. New data science agents will be released soo
|
|
58
58
|
1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
|
59
59
|
2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
|
60
60
|
3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
|
61
|
+
4. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
|
61
62
|
|
62
63
|
### Agents Coming Soon
|
63
64
|
|
@@ -103,6 +104,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
|
|
103
104
|
|
104
105
|
## Usage
|
105
106
|
|
107
|
+
[See all examples here.](/examples)
|
108
|
+
|
106
109
|
### Example 1: Feature Engineering with the Feature Engineering Agent
|
107
110
|
|
108
111
|
[See the full example here.](/examples/feature_engineering_agent.ipynb)
|
@@ -0,0 +1,20 @@
|
|
1
|
+
ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
ai_data_science_team/_version.py,sha256=RhiY8R43RrUH4olT4XbUwFEraT-iuZ_dNsUfJV7DU5s,27
|
3
|
+
ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
|
4
|
+
ai_data_science_team/agents/__init__.py,sha256=1lrC6l-wJJNguHM14cDhX_QX7PLr1V9H_j33d-KouMc,353
|
5
|
+
ai_data_science_team/agents/data_cleaning_agent.py,sha256=-16zPLqtsvwxWID6KI9j9-9qxN1hMb9si_CTKmiyvss,14884
|
6
|
+
ai_data_science_team/agents/data_wrangling_agent.py,sha256=kt-jYEdZzSGtOnFh7KQA5DzkHFRp0MINp7UqfwqMlV8,14768
|
7
|
+
ai_data_science_team/agents/feature_engineering_agent.py,sha256=FguDb7fSLXDgyMSJIwxphCZ-PlfCXsaCXsxtFp_9mrQ,16421
|
8
|
+
ai_data_science_team/agents/sql_database_agent.py,sha256=AVgm9e4GZZiQ6C7-vANjEpI256STi6jmQI6MuYdOXmU,15181
|
9
|
+
ai_data_science_team/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
ai_data_science_team/templates/agent_templates.py,sha256=xohVgEfxPcVukPLpPfV7mZ0cpFgp-oJVLZRWCv2V-WU,19948
|
11
|
+
ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
ai_data_science_team/tools/logging.py,sha256=7wFOv6GGhXR_RPbh-8p0GyrS608XOnZtiaGK2IbDl_s,2081
|
13
|
+
ai_data_science_team/tools/metadata.py,sha256=Vd3gX4K31A4o5IiM4pr9Al_8jkzGOtDCxWMd264AoAM,5772
|
14
|
+
ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
|
15
|
+
ai_data_science_team/tools/regex.py,sha256=TLXSgYbSOL6e9IJt1BY3Is2O9MCjTVeXpdKR4CIMuQc,2330
|
16
|
+
ai_data_science_team-0.0.0.90061.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
|
17
|
+
ai_data_science_team-0.0.0.90061.dist-info/METADATA,sha256=FSVfHjARQ2z4ZjCMJtCi4xrQtN-qnfrZTceO6X9ov5Y,6087
|
18
|
+
ai_data_science_team-0.0.0.90061.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
19
|
+
ai_data_science_team-0.0.0.90061.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
|
20
|
+
ai_data_science_team-0.0.0.90061.dist-info/RECORD,,
|
@@ -1,19 +0,0 @@
|
|
1
|
-
ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
ai_data_science_team/_version.py,sha256=7tA8TocqCCzLkcB4ptV6bn3k5ni-0TGZvGnVBzmbeIc,26
|
3
|
-
ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
|
4
|
-
ai_data_science_team/agents/__init__.py,sha256=DtwQgyeG3Q4rQ-NrMbva-jshVQyULaWW1RrnETQGZOY,270
|
5
|
-
ai_data_science_team/agents/data_cleaning_agent.py,sha256=0K-CgngGjamRk_QzMqNkplrI-ddCbtruQ7kjGrsRIN8,14390
|
6
|
-
ai_data_science_team/agents/data_wrangling_agent.py,sha256=uQBJ8vQwrXubQgaI9_UoNZnVQjIEBUOh3dTmNdg326k,14581
|
7
|
-
ai_data_science_team/agents/feature_engineering_agent.py,sha256=QEqXTsfjllUj4Wgsw4nNGUT6r9Y6q629ZNgqGy3Dbbk,15921
|
8
|
-
ai_data_science_team/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
ai_data_science_team/templates/agent_templates.py,sha256=gT48Pq9KlrrrF0yigodGl_BdptmowTJ2rEWUqh7g5E0,15410
|
10
|
-
ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
ai_data_science_team/tools/data_analysis.py,sha256=V7e6_fZA01mosFf5VcLwBcpiMVf7fClZMjTrj-egK-o,3715
|
12
|
-
ai_data_science_team/tools/logging.py,sha256=EU5EMg4Y0-Yhqf1vAEFg0eRvSTx8uF0LTOAKss8-T2M,2073
|
13
|
-
ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
|
14
|
-
ai_data_science_team/tools/regex.py,sha256=KTH2SXPJT8Tzmj7CufyeET-FbA9BMhRzFlPKr4Tan3g,2320
|
15
|
-
ai_data_science_team-0.0.0.9005.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
|
16
|
-
ai_data_science_team-0.0.0.9005.dist-info/METADATA,sha256=PC6rJR965hPu02LtZrzHICkd3QeWzh2A35axTLjE9hM,5840
|
17
|
-
ai_data_science_team-0.0.0.9005.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
18
|
-
ai_data_science_team-0.0.0.9005.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
|
19
|
-
ai_data_science_team-0.0.0.9005.dist-info/RECORD,,
|
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/LICENSE
RENAMED
File without changes
|
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.90061.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|