PyPI - ai-data-science-team - Versions diffs - 0.0.0.9005__tar.gz → 0.0.0.90061__tar.gz - Mend

ai-data-science-team 0.0.0.9005tar.gz → 0.0.0.90061tar.gz

Files changed (26) hide show

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-data-science-team
-Version: 0.0.0.9005
+Version: 0.0.0.90061
 Summary: Build and run an AI-powered data science team.
 Home-page: https://github.com/business-science/ai-data-science-team
 Author: Matt Dancho
@@ -58,6 +58,7 @@ This project is a work in progress. New data science agents will be released soo
 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+4. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
 ### Agents Coming Soon
@@ -103,6 +104,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
 ## Usage
+[See all examples here.](/examples)
 ### Example 1: Feature Engineering with the Feature Engineering Agent
 [See the full example here.](/examples/feature_engineering_agent.ipynb)

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/README.md RENAMED Viewed

@@ -34,6 +34,7 @@ This project is a work in progress. New data science agents will be released soo
 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+4. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
 ### Agents Coming Soon
@@ -79,6 +80,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
 ## Usage
+[See all examples here.](/examples)
 ### Example 1: Feature Engineering with the Feature Engineering Agent
 [See the full example here.](/examples/feature_engineering_agent.ipynb)

ai_data_science_team-0.0.0.90061/ai_data_science_team/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.0.0.90061"

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/agents/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
 from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
 from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
+from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/agents/data_cleaning_agent.py RENAMED Viewed

@@ -26,7 +26,7 @@ from ai_data_science_team.templates.agent_templates import(
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
 from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
-from ai_data_science_team.tools.data_analysis import summarize_dataframes
+from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
 # Setup
@@ -35,7 +35,7 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
 # Agent
-def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
+def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
     """
     Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
     ways, such as removing columns with more than 40% missing values, imputing missing
@@ -71,6 +71,10 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
         Defaults to True.
     human_in_the_loop : bool, optional
         Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the data cleaning instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        Bypass the recommendation step, by default False
+    bypass_explain_code : bool, optional
+        Bypass the code explanation step, by default False.
     Examples
     -------
@@ -180,7 +184,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
         data_raw = state.get("data_raw")
         df = pd.DataFrame.from_dict(data_raw)
-        all_datasets_summary = summarize_dataframes([df])
+        all_datasets_summary = get_dataframe_summary([df])
         all_datasets_summary_str = "\n\n".join(all_datasets_summary)
@@ -197,6 +201,8 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
         }
     def create_data_cleaner_code(state: GraphState):
+        if bypass_recommended_steps:
+            print("---DATA CLEANING AGENT----")
         print("    * CREATE DATA CLEANER CODE")
         data_cleaning_prompt = PromptTemplate(
@@ -274,7 +280,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
             code_snippet_key="data_cleaner_function",
             agent_function_name="data_cleaner",
             pre_processing=lambda data: pd.DataFrame.from_dict(data),
-            post_processing=lambda df: df.to_dict(),
+            post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during data cleaning: "
         )
@@ -341,7 +347,9 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
         error_key="data_cleaner_error",
         human_in_the_loop=human_in_the_loop,  # or False
         human_review_node_name="human_review",
-        checkpointer=MemorySaver() if human_in_the_loop else None
+        checkpointer=MemorySaver() if human_in_the_loop else None,
+        bypass_recommended_steps=bypass_recommended_steps,
+        bypass_explain_code=bypass_explain_code,
     )
     return app

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/agents/data_wrangling_agent.py RENAMED Viewed

@@ -24,14 +24,14 @@ from ai_data_science_team.templates.agent_templates import(
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
 from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
-from ai_data_science_team.tools.data_analysis import summarize_dataframes
+from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
 # Setup Logging Path
 AGENT_NAME = "data_wrangling_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")
-def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
+def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
     """
     Creates a data wrangling agent that can be run on one or more datasets. The agent can be
     instructed to perform common data wrangling steps such as:
@@ -63,6 +63,10 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
     human_in_the_loop : bool, optional
         Whether or not to use human in the loop. If True, adds an interrupt and human-in-the-loop
         step that asks the user to review the data wrangling instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        Bypass the recommendation step, by default False
+    bypass_explain_code : bool, optional
+        Bypass the code explanation step, by default False.
     Example
     -------
@@ -139,7 +143,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
         # Create a summary for all datasets
         # We'll include a short sample and info for each dataset
-        all_datasets_summary = summarize_dataframes(dataframes)
+        all_datasets_summary = get_dataframe_summary(dataframes)
         # Join all datasets summaries into one big text block
         all_datasets_summary_str = "\n\n".join(all_datasets_summary)
@@ -190,6 +194,8 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
     def create_data_wrangler_code(state: GraphState):
+        if bypass_recommended_steps:
+            print("---DATA WRANGLING AGENT----")
         print("    * CREATE DATA WRANGLER CODE")
         data_wrangling_prompt = PromptTemplate(
@@ -269,17 +275,6 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
         )
     def execute_data_wrangler_code(state: GraphState):
-        # Handle multiple datasets as lists
-        # def pre_processing(data):
-        #     df = []
-        #     for i in range(len(data)):
-        #         df[i] = pd.DataFrame.from_dict(data[i])
-        #     return df
-        # def post_processing(df):
-        #     return df.to_dict()
         return node_func_execute_agent_code_on_data(
             state=state,
             data_key="data_raw",
@@ -288,7 +283,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
             code_snippet_key="data_wrangler_function",
             agent_function_name="data_wrangler",
             # pre_processing=pre_processing,
-            # post_processing=post_processing,
+            post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during data wrangling: "
         )
@@ -355,7 +350,9 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
         error_key="data_wrangler_error",
         human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
-        checkpointer=MemorySaver() if human_in_the_loop else None
+        checkpointer=MemorySaver() if human_in_the_loop else None,
+        bypass_recommended_steps=bypass_recommended_steps,
+        bypass_explain_code=bypass_explain_code,
     )
     return app

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/agents/feature_engineering_agent.py RENAMED Viewed

@@ -26,7 +26,7 @@ from ai_data_science_team.templates.agent_templates import(
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
 from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
-from ai_data_science_team.tools.data_analysis import summarize_dataframes
+from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
 # Setup
@@ -35,7 +35,7 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")
 # * Feature Engineering Agent
-def make_feature_engineering_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
+def make_feature_engineering_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
     """
     Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
     techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
@@ -71,6 +71,10 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         Defaults to True.
     human_in_the_loop : bool, optional
         Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the feature engineering instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        Bypass the recommendation step, by default False
+    bypass_explain_code : bool, optional
+        Bypass the code explanation step, by default False.
     Examples
     -------
@@ -185,7 +189,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         data_raw = state.get("data_raw")
         df = pd.DataFrame.from_dict(data_raw)
-        all_datasets_summary = summarize_dataframes([df])
+        all_datasets_summary = get_dataframe_summary([df])
         all_datasets_summary_str = "\n\n".join(all_datasets_summary)
@@ -212,6 +216,8 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         )
     def create_feature_engineering_code(state: GraphState):
+        if bypass_recommended_steps:
+            print("---FEATURE ENGINEERING AGENT----")
         print("    * CREATE FEATURE ENGINEERING CODE")
         feature_engineering_prompt = PromptTemplate(
@@ -298,7 +304,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             code_snippet_key="feature_engineer_function",
             agent_function_name="feature_engineer",
             pre_processing=lambda data: pd.DataFrame.from_dict(data),
-            post_processing=lambda df: df.to_dict(),
+            post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during feature engineering: "
         )
@@ -362,7 +368,9 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         error_key="feature_engineer_error",
         human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
-        checkpointer=MemorySaver() if human_in_the_loop else None
+        checkpointer=MemorySaver() if human_in_the_loop else None,
+        bypass_recommended_steps=bypass_recommended_steps,
+        bypass_explain_code=bypass_explain_code,
     )
     return app

ai_data_science_team-0.0.0.90061/ai_data_science_team/agents/sql_database_agent.py ADDED Viewed

@@ -0,0 +1,379 @@
+from typing import TypedDict, Annotated, Sequence, Literal
+import operator
+from langchain.prompts import PromptTemplate
+from langchain_core.messages import BaseMessage
+from langgraph.types import Command
+from langgraph.checkpoint.memory import MemorySaver
+import os
+import io
+import pandas as pd
+import sqlalchemy as sql
+from ai_data_science_team.templates.agent_templates import(
+    node_func_execute_agent_from_sql_connection,
+    node_func_human_review,
+    node_func_fix_agent_code,
+    node_func_explain_agent_code,
+    create_coding_agent_graph
+)
+from ai_data_science_team.tools.parsers import PythonOutputParser, SQLOutputParser
+from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.metadata import get_database_metadata
+from ai_data_science_team.tools.logging import log_ai_function
+# Setup
+AGENT_NAME = "sql_database_agent"
+LOG_PATH = os.path.join(os.getcwd(), "logs/")
+def make_sql_database_agent(model, connection, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
+    """
+    Creates a SQL Database Agent that can recommend SQL steps and generate SQL code to query a database.
+    Parameters
+    ----------
+    model : ChatOpenAI
+        The language model to use for the agent.
+    connection : sqlalchemy.engine.base.Engine or sqlalchemy connection
+        The connection to the SQL database. If an Engine is passed, a short-lived
+        connection is opened (and closed again) by each node that needs one;
+        any other object is used as-is and is never closed by the agent.
+    log : bool, optional
+        Whether to log the generated code, by default False
+    log_path : str, optional
+        The path to the log directory, by default None
+    overwrite : bool, optional
+        Whether to overwrite the existing log file, by default True
+    human_in_the_loop : bool, optional
+        Whether or not to use human in the loop. If True, adds an interrupt and human in the loop step that asks the user to review the recommended SQL querying steps. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        Bypass the recommendation step, by default False
+    bypass_explain_code : bool, optional
+        Bypass the code explanation step, by default False.
+    Returns
+    -------
+    app : langchain.graphs.StateGraph
+        The SQL database agent as a state graph.
+    Examples
+    --------
+    ```python
+    from ai_data_science_team.agents import make_sql_database_agent
+    import sqlalchemy as sql
+    from langchain_openai import ChatOpenAI
+    sql_engine = sql.create_engine("sqlite:///data/leads_scored.db")
+    conn = sql_engine.connect()
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    sql_agent = make_sql_database_agent(
+        model=llm,
+        connection=conn
+    )
+    sql_agent
+    response = sql_agent.invoke({
+        "user_instructions": "List the tables in the database",
+        "max_retries":3,
+        "retry_count":0
+    })
+    ```
+    """
+    llm = model
+    # Setup Log Directory
+    if log:
+        if log_path is None:
+            log_path = LOG_PATH
+        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
+        os.makedirs(log_path, exist_ok=True)
+    def _open_connection():
+        # Returns (conn, owned). `owned` is True only when the connection was opened
+        # here from an Engine, in which case the caller is responsible for closing it.
+        is_engine = isinstance(connection, sql.engine.base.Engine)
+        conn = connection.connect() if is_engine else connection
+        return conn, is_engine
+    def _get_database_summary():
+        # Collects the database metadata summary that is fed to the prompts.
+        # NOTE(review): assumes get_database_metadata returns a fully materialized
+        # summary, so the connection can be closed right after the call - confirm.
+        conn, owned = _open_connection()
+        try:
+            return get_database_metadata(conn, n_values=10)
+        finally:
+            if owned:
+                conn.close()
+    class GraphState(TypedDict):
+        messages: Annotated[Sequence[BaseMessage], operator.add]
+        user_instructions: str
+        recommended_steps: str
+        data_sql: dict
+        all_sql_database_summary: str
+        sql_query_code: str
+        sql_database_function: str
+        sql_database_function_path: str
+        sql_database_function_name: str
+        sql_database_error: str
+        max_retries: int
+        retry_count: int
+    def recommend_sql_steps(state: GraphState):
+        print("---SQL DATABASE AGENT---")
+        print("    * RECOMMEND SQL QUERY STEPS")
+        # Prompt to get recommended steps from the LLM
+        recommend_steps_prompt = PromptTemplate(
+            template="""
+            You are a SQL Database Instructions Expert. Given the following information about the SQL database,
+            recommend a series of numbered steps to take to collect the data and process it according to user instructions.
+            The steps should be tailored to the SQL database characteristics and should be helpful
+            for a sql database coding agent that will write the SQL code.
+            IMPORTANT INSTRUCTIONS:
+            - Take into account the user instructions and the previously recommended steps.
+            - If no user instructions are provided, just return the steps needed to understand the database.
+            - Take into account the database dialect and the tables and columns in the database.
+            - Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+            User instructions / Question:
+            {user_instructions}
+            Previously Recommended Steps (if any):
+            {recommended_steps}
+            Below are summaries of the database metadata and the SQL tables:
+            {all_sql_database_summary}
+            Return the steps as a numbered point list (no code, just the steps).
+            Consider these:
+            1. Consider the database dialect and the tables and columns in the database.
+            Avoid these:
+            1. Do not include steps to save files.
+            2. Do not include steps to modify existing tables, create new tables or modify the database schema.
+            3. Do not include steps that alter the existing data in the database.
+            4. Make sure not to include unsafe code that could cause data loss or corruption or SQL injections.
+            """,
+            input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
+        )
+        # Get the database metadata
+        all_sql_database_summary = _get_database_summary()
+        steps_agent = recommend_steps_prompt | llm
+        recommended_steps = steps_agent.invoke({
+            "user_instructions": state.get("user_instructions"),
+            "recommended_steps": state.get("recommended_steps"),
+            "all_sql_database_summary": all_sql_database_summary
+        })
+        return {
+            "recommended_steps": "\n\n# Recommended SQL Database Steps:\n" + recommended_steps.content.strip(),
+            "all_sql_database_summary": all_sql_database_summary
+        }
+    def create_sql_query_code(state: GraphState):
+        # When the recommendation node is bypassed this is the entry node, so it prints the banner instead
+        if bypass_recommended_steps:
+            print("---SQL DATABASE AGENT---")
+        print("    * CREATE SQL QUERY CODE")
+        # Prompt to get the SQL code from the LLM
+        sql_query_code_prompt = PromptTemplate(
+            template="""
+            You are a SQL Database Coding Expert. Given the following information about the SQL database,
+            write the SQL code to collect the data and process it according to user instructions.
+            The code should be tailored to the SQL database characteristics and should take into account user instructions, recommended steps, database and table characteristics.
+            IMPORTANT INSTRUCTIONS:
+            - Do not use a LIMIT clause unless a user specifies a limit to be returned.
+            - Return SQL in ```sql ``` format.
+            - Only return a single query if possible.
+            - Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+            - Pay attention to the SQL dialect from the database summary metadata. Write the SQL code according to the dialect specified.
+            User instructions / Question:
+            {user_instructions}
+            Recommended Steps:
+            {recommended_steps}
+            Below are summaries of the database metadata and the SQL tables:
+            {all_sql_database_summary}
+            Return:
+            - The SQL code in ```sql ``` format to collect the data and process it according to the user instructions.
+            Avoid these:
+            - Do not include steps to save files.
+            - Do not include steps to modify existing tables, create new tables or modify the database schema.
+            - Make sure not to alter the existing data in the database.
+            - Make sure not to include unsafe code that could cause data loss or corruption.
+            """,
+            input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
+        )
+        # Get the database metadata
+        all_sql_database_summary = _get_database_summary()
+        sql_query_code_agent = sql_query_code_prompt | llm | SQLOutputParser()
+        sql_query_code = sql_query_code_agent.invoke({
+            "user_instructions": state.get("user_instructions"),
+            "recommended_steps": state.get("recommended_steps"),
+            "all_sql_database_summary": all_sql_database_summary
+        })
+        print("    * CREATE PYTHON FUNCTION TO RUN SQL CODE")
+        # The generated pipeline runs the query on `conn` (not the raw argument) and
+        # closes the connection only when it opened it from an Engine itself.
+        response = f"""
+def sql_database_pipeline(connection):
+    import pandas as pd
+    import sqlalchemy as sql
+    # Create a connection if needed
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+    sql_query = '''
+    {sql_query_code}
+    '''
+    try:
+        return pd.read_sql(sql_query, conn)
+    finally:
+        # Close only a connection opened above; a caller-supplied connection stays open
+        if is_engine:
+            conn.close()
+        """
+        response = add_comments_to_top(response, AGENT_NAME)
+        # For logging: store the code generated
+        file_path, file_name = log_ai_function(
+            response=response,
+            file_name="sql_database.py",
+            log=log,
+            log_path=log_path,
+            overwrite=overwrite
+        )
+        return {
+            "sql_query_code": sql_query_code,
+            "sql_database_function": response,
+            "sql_database_function_path": file_path,
+            "sql_database_function_name": file_name
+        }
+    def human_review(state: GraphState) -> Command[Literal["recommend_sql_steps", "create_sql_query_code"]]:
+        return node_func_human_review(
+            state=state,
+            prompt_text="Are the following SQL database querying steps correct? (Answer 'yes' or provide modifications)\n{steps}",
+            yes_goto="create_sql_query_code",
+            no_goto="recommend_sql_steps",
+            user_instructions_key="user_instructions",
+            recommended_steps_key="recommended_steps"
+        )
+    def execute_sql_database_code(state: GraphState):
+        conn, owned = _open_connection()
+        try:
+            return node_func_execute_agent_from_sql_connection(
+                state=state,
+                connection=conn,
+                result_key="data_sql",
+                error_key="sql_database_error",
+                code_snippet_key="sql_database_function",
+                agent_function_name="sql_database_pipeline",
+                post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
+                error_message_prefix="An error occurred during executing the sql database pipeline: "
+            )
+        finally:
+            # Release a connection opened from an Engine; never close the caller's own connection
+            if owned:
+                conn.close()
+    def fix_sql_database_code(state: GraphState):
+        prompt = """
+        You are a SQL Database Agent code fixer. Your job is to create a sql_database_pipeline(connection) function that can be run on a sql connection. The function is currently broken and needs to be fixed.
+        Make sure to only return the function definition for sql_database_pipeline().
+        Return Python code in ```python``` format with a single function definition, sql_database_pipeline(connection), that includes all imports inside the function. The connection object is a SQLAlchemy connection object. Don't specify the class of the connection object, just use it as an argument to the function.
+        This is the broken code (please fix):
+        {code_snippet}
+        Last Known Error:
+        {error}
+        """
+        return node_func_fix_agent_code(
+            state=state,
+            code_snippet_key="sql_database_function",
+            error_key="sql_database_error",
+            llm=llm,
+            prompt_template=prompt,
+            agent_name=AGENT_NAME,
+            log=log,
+            file_path=state.get("sql_database_function_path", None),
+        )
+    def explain_sql_database_code(state: GraphState):
+        return node_func_explain_agent_code(
+            state=state,
+            code_snippet_key="sql_database_function",
+            result_key="messages",
+            error_key="sql_database_error",
+            llm=llm,
+            role=AGENT_NAME,
+            explanation_prompt_template="""
+            Explain the SQL steps that the SQL Database agent performed in this function.
+            Keep the summary succinct and to the point.\n\n# SQL Database Agent:\n\n{code}
+            """,
+            success_prefix="# SQL Database Agent:\n\n",
+            error_message="The SQL Database Agent encountered an error during SQL Query Analysis. No SQL function explanation is returned."
+        )
+    # Create the graph
+    node_functions = {
+        "recommend_sql_steps": recommend_sql_steps,
+        "human_review": human_review,
+        "create_sql_query_code": create_sql_query_code,
+        "execute_sql_database_code": execute_sql_database_code,
+        "fix_sql_database_code": fix_sql_database_code,
+        "explain_sql_database_code": explain_sql_database_code
+    }
+    app = create_coding_agent_graph(
+        GraphState=GraphState,
+        node_functions=node_functions,
+        recommended_steps_node_name="recommend_sql_steps",
+        create_code_node_name="create_sql_query_code",
+        execute_code_node_name="execute_sql_database_code",
+        fix_code_node_name="fix_sql_database_code",
+        explain_code_node_name="explain_sql_database_code",
+        error_key="sql_database_error",
+        human_in_the_loop=human_in_the_loop,
+        human_review_node_name="human_review",
+        checkpointer=MemorySaver() if human_in_the_loop else None,
+        bypass_recommended_steps=bypass_recommended_steps,
+        bypass_explain_code=bypass_explain_code,
+    )
+    return app

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/templates/agent_templates.py RENAMED Viewed

@@ -3,6 +3,7 @@ from langgraph.graph import StateGraph, END
 from langgraph.types import interrupt, Command
 import pandas as pd
+import sqlalchemy as sql
 from typing import Any, Callable, Dict, Type, Optional
@@ -22,7 +23,9 @@ def create_coding_agent_graph(
     retry_count_key: str = "retry_count",
     human_in_the_loop: bool = False,
     human_review_node_name: str = "human_review",
-    checkpointer: Optional[Callable] = None
+    checkpointer: Optional[Callable] = None,
+    bypass_recommended_steps: bool = False,
+    bypass_explain_code: bool = False,
 ):
     """
     Creates a generic agent graph using the provided node functions and node names.
@@ -63,7 +66,11 @@ def create_coding_agent_graph(
         The node name for human review if human_in_the_loop is True.
     checkpointer : callable, optional
         A checkpointer callable if desired.
+    bypass_recommended_steps : bool, optional
+        Whether to skip the recommended steps node.
+    bypass_explain_code : bool, optional
+        Whether to skip the final explain code node.
     Returns
     -------
     app : langchain.graphs.StateGraph
@@ -72,50 +79,76 @@ def create_coding_agent_graph(
     workflow = StateGraph(GraphState)
-    # Add the recommended steps node
-    workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])
+    # Conditionally add the recommended-steps node
+    if not bypass_recommended_steps:
+        workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])
-    # Optionally add the human review node
-    if human_in_the_loop:
-        workflow.add_node(human_review_node_name, node_functions[human_review_node_name])
-    # Add main nodes
+    # Always add create, execute, and fix nodes
     workflow.add_node(create_code_node_name, node_functions[create_code_node_name])
     workflow.add_node(execute_code_node_name, node_functions[execute_code_node_name])
     workflow.add_node(fix_code_node_name, node_functions[fix_code_node_name])
-    workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])
+    # Conditionally add the explanation node
+    if not bypass_explain_code:
+        workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])
     # Set the entry point
-    workflow.set_entry_point(recommended_steps_node_name)
+    entry_point = create_code_node_name if bypass_recommended_steps else recommended_steps_node_name
+    workflow.set_entry_point(entry_point)
-    # Add edges depending on human_in_the_loop
-    if human_in_the_loop:
-        workflow.add_edge(recommended_steps_node_name, human_review_node_name)
-    else:
-        workflow.add_edge(recommended_steps_node_name, create_code_node_name)
+    # Add edges for recommended steps
+    if not bypass_recommended_steps:
+        if human_in_the_loop:
+            workflow.add_edge(recommended_steps_node_name, human_review_node_name)
+        else:
+            workflow.add_edge(recommended_steps_node_name, create_code_node_name)
+    elif human_in_the_loop:
+        # Skip recommended steps but still include human review
+        workflow.add_edge(create_code_node_name, human_review_node_name)
-    # Connect create_code_node to execution node
+    # Create -> Execute
     workflow.add_edge(create_code_node_name, execute_code_node_name)
-    # Add conditional edges for error handling
-    workflow.add_conditional_edges(
-        execute_code_node_name,
-        lambda state: "fix_code" if (
-            state.get(error_key) is not None and
-            state.get(retry_count_key) is not None and
-            state.get(max_retries_key) is not None and
-            state.get(retry_count_key) < state.get(max_retries_key)
-        ) else "explain_code",
-        {"fix_code": fix_code_node_name, "explain_code": explain_code_node_name},
-    )
-    # From fix_code_node_name back to execution node
-    workflow.add_edge(fix_code_node_name, execute_code_node_name)
-    # explain_code_node_name leads to end
-    workflow.add_edge(explain_code_node_name, END)
-    # Compile workflow, optionally with checkpointer
+    # Define a helper to check if we have an error & can still retry
+    def error_and_can_retry(state):
+        return (
+            state.get(error_key) is not None
+            and state.get(retry_count_key) is not None
+            and state.get(max_retries_key) is not None
+            and state[retry_count_key] < state[max_retries_key]
+        )
+    # ---- Split into two branches for bypass_explain_code ----
+    if not bypass_explain_code:
+        # If we are NOT bypassing explain, the next node is fix_code if error,
+        # else explain_code. Then we wire explain_code -> END afterward.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "explain_code",
+            {
+                "fix_code": fix_code_node_name,
+                "explain_code": explain_code_node_name,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+        # explain_code -> END
+        workflow.add_edge(explain_code_node_name, END)
+    else:
+        # If we ARE bypassing explain_code, the next node is fix_code if error,
+        # else straight to END.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "END",
+            {
+                "fix_code": fix_code_node_name,
+                "END": END,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+    # Finally, compile
     if human_in_the_loop and checkpointer is not None:
         app = workflow.compile(checkpointer=checkpointer)
     else:
@@ -124,6 +157,7 @@ def create_coding_agent_graph(
     return app
 def node_func_human_review(
     state: Any,
     prompt_text: str,
@@ -256,6 +290,88 @@ def node_func_execute_agent_code_on_data(
         # if state.get("retry_count") == 0:
         #     10/0
+        # Apply post-processing if provided
+        if post_processing is not None:
+            result = post_processing(result)
+        else:
+            if isinstance(result, pd.DataFrame):
+                result = result.to_dict()
+    except Exception as e:
+        print(e)
+        agent_error = f"{error_message_prefix}{str(e)}"
+    # Return results
+    output = {result_key: result, error_key: agent_error}
+    return output
+def node_func_execute_agent_from_sql_connection(
+    state: Any,
+    connection: Any,
+    code_snippet_key: str,
+    result_key: str,
+    error_key: str,
+    agent_function_name: str,
+    post_processing: Optional[Callable[[Any], Any]] = None,
+    error_message_prefix: str = "An error occurred during agent execution: "
+) -> Dict[str, Any]:
+    """
+    Execute a generic agent code defined in a code snippet retrieved from the state on a SQLAlchemy connection object
+    and return the result.
+    Parameters
+    ----------
+    state : Any
+        A state object that supports `get(key: str)` method to retrieve values.
+    connection : str
+        The SQLAlchemy connection object to use for executing the agent function.
+    code_snippet_key : str
+        The key in the state used to retrieve the Python code snippet defining the agent function.
+    result_key : str
+        The key in the state used to store the result of the agent function.
+    error_key : str
+        The key in the state used to store the error message if any.
+    agent_function_name : str
+        The name of the function (e.g., 'sql_database_agent') expected to be defined in the code snippet.
+    post_processing : Callable[[Any], Any], optional
+        A function to postprocess the output of the agent function before returning it.
+    error_message_prefix : str, optional
+        A prefix or full message to use in the error output if an exception occurs.
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing the result and/or error messages. Keys are arbitrary,
+        but typically include something like "result" or "error".
+    """
+    print("    * EXECUTING AGENT CODE ON SQL CONNECTION")
+    # Retrieve SQLAlchemy connection and code snippet from the state
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+    agent_code = state.get(code_snippet_key)
+    # Ensure the connection object is provided
+    if connection is None:
+        raise ValueError(f"Connection object not found.")
+    # Execute the code snippet to define the agent function
+    local_vars = {}
+    global_vars = {}
+    exec(agent_code, global_vars, local_vars)
+    # Retrieve the agent function from the executed code
+    agent_function = local_vars.get(agent_function_name, None)
+    if agent_function is None or not callable(agent_function):
+        raise ValueError(f"Agent function '{agent_function_name}' not found or not callable in the provided code.")
+    # Execute the agent function
+    agent_error = None
+    result = None
+    try:
+        result = agent_function(connection)
         # Apply post-processing if provided
         if post_processing is not None:
             result = post_processing(result)
@@ -267,6 +383,7 @@ def node_func_execute_agent_code_on_data(
     output = {result_key: result, error_key: agent_error}
     return output
 def node_func_fix_agent_code(
     state: Any,
     code_snippet_key: str,
@@ -326,7 +443,7 @@ def node_func_fix_agent_code(
     response = (llm | PythonOutputParser()).invoke(prompt)
     response = relocate_imports_inside_function(response)
-    response = add_comments_to_top(response, agent_name="data_wrangler")
+    response = add_comments_to_top(response, agent_name=agent_name)
     # Log the response if requested
     if log:

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/tools/logging.py RENAMED Viewed

@@ -58,4 +58,4 @@ def log_ai_function(response: str, file_name: str, log: bool = True, log_path: s
         return (file_path, file_name)
     else:
-        return None
+        return (None, None)

ai_data_science_team-0.0.0.9005/ai_data_science_team/tools/data_analysis.py → ai_data_science_team-0.0.0.90061/ai_data_science_team/tools/metadata.py RENAMED Viewed

@@ -1,8 +1,9 @@
 import io
 import pandas as pd
+import sqlalchemy as sql
 from typing import Union, List, Dict
-def summarize_dataframes(
+def get_dataframe_summary(
     dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
 ) -> List[str]:
     """
@@ -26,7 +27,7 @@ def summarize_dataframes(
         "iris": data.frame,
         "iris_target": data.target,
     }
-    summaries = summarize_dataframes(dataframes)
+    summaries = get_dataframe_summary(dataframes)
     print(summaries[0])
     ```
@@ -114,3 +115,53 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
     {info_text}
     """
     return summary_text.strip()
+def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine], n_values: int=10) -> str:
+    """
+    Collects metadata and sample data from a database.
+    The report lists the dialect, driver and connection URL of the underlying
+    engine, the table and schema names returned by the SQLAlchemy inspector,
+    and, for every column of every table, its type and up to `n_values`
+    sample values.
+    Parameters:
+    -----------
+    connection (sqlalchemy.engine.base.Connection or sqlalchemy.engine.base.Engine):
+        An active SQLAlchemy connection or engine.
+    n_values (int):
+        Number of sample values to retrieve for each column.
+    Returns:
+    --------
+    str: Formatted text with database metadata.
+    """
+    # If a connection is passed, use it; if an engine is passed, connect to it
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+    output = []
+    try:
+        # Engine metadata
+        sql_engine = conn.engine
+        output.append(f"Database Dialect: {sql_engine.dialect.name}")
+        output.append(f"Driver: {sql_engine.driver}")
+        output.append(f"Connection URL: {sql_engine.url}")
+        # Inspect the database (table names are fetched once and reused below)
+        inspector = sql.inspect(sql_engine)
+        table_names = inspector.get_table_names()
+        output.append(f"Tables: {table_names}")
+        output.append(f"Schemas: {inspector.get_schema_names()}")
+        # Quote identifiers with the dialect's own rules so that names containing
+        # spaces, mixed case or reserved words still produce a valid query.
+        # Plain lower-case names are left unquoted, so the SQL is unchanged for them.
+        quote = sql_engine.dialect.identifier_preparer.quote
+        # For each table, get the columns and their metadata
+        for table_name in table_names:
+            output.append(f"\nTable: {table_name}")
+            quoted_table = quote(table_name)
+            for column in inspector.get_columns(table_name):
+                output.append(f"  Column: {column['name']} Type: {column['type']}")
+                # Fetch sample values for the column
+                query = f"SELECT {quote(column['name'])} FROM {quoted_table} LIMIT {n_values}"
+                data = pd.read_sql(query, sql_engine)
+                output.append(f"    First {n_values} Values: {data.values.flatten().tolist()}")
+    finally:
+        # Close connection if it was created inside this function
+        if is_engine:
+            conn.close()
+    # Join all collected information into a single string
+    return "\n".join(output)

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team/tools/regex.py RENAMED Viewed

@@ -64,7 +64,7 @@ def add_comments_to_top(code_text, agent_name="data_wrangler"):
     header_comments = [
         "# Disclaimer: This function was generated by AI. Please review before using.",
         f"# Agent Name: {agent_name}",
-        f"# Time Created: {time_created}",
+        f"# Time Created: {time_created}\n",
         ""
     ]

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-data-science-team
-Version: 0.0.0.9005
+Version: 0.0.0.90061
 Summary: Build and run an AI-powered data science team.
 Home-page: https://github.com/business-science/ai-data-science-team
 Author: Matt Dancho
@@ -58,6 +58,7 @@ This project is a work in progress. New data science agents will be released soo
 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+4. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
 ### Agents Coming Soon
@@ -103,6 +104,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
 ## Usage
+[See all examples here.](/examples)
 ### Example 1: Feature Engineering with the Feature Engineering Agent
 [See the full example here.](/examples/feature_engineering_agent.ipynb)

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.90061}/ai_data_science_team.egg-info/SOURCES.txt RENAMED Viewed

@@ -13,10 +13,11 @@ ai_data_science_team/agents/__init__.py
 ai_data_science_team/agents/data_cleaning_agent.py
 ai_data_science_team/agents/data_wrangling_agent.py
 ai_data_science_team/agents/feature_engineering_agent.py
+ai_data_science_team/agents/sql_database_agent.py
 ai_data_science_team/templates/__init__.py
 ai_data_science_team/templates/agent_templates.py
 ai_data_science_team/tools/__init__.py
-ai_data_science_team/tools/data_analysis.py
 ai_data_science_team/tools/logging.py
+ai_data_science_team/tools/metadata.py
 ai_data_science_team/tools/parsers.py
 ai_data_science_team/tools/regex.py