PyPI - ai-data-science-team - Versions diffs - 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl - Mend

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

ai_data_science_team/agents/feature_engineering_agent.py CHANGED Viewed

@@ -14,6 +14,7 @@ from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver
 import os
+import json
 import pandas as pd
 from IPython.display import Markdown
@@ -22,19 +23,20 @@ from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
-    node_func_explain_agent_code,
+    node_func_report_agent_outputs,
     create_coding_agent_graph,
     BaseAgent,
 )
-from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import (
+from ai_data_science_team.parsers.parsers import PythonOutputParser
+from ai_data_science_team.utils.regex import (
     relocate_imports_inside_function,
     add_comments_to_top,
     format_agent_name,
-    format_recommended_steps
+    format_recommended_steps,
+    get_generic_summary,
 )
-from ai_data_science_team.tools.metadata import get_dataframe_summary
-from ai_data_science_team.tools.logging import log_ai_function
+from ai_data_science_team.tools.dataframe import get_dataframe_summary
+from ai_data_science_team.utils.logging import log_ai_function
 # Setup
 AGENT_NAME = "feature_engineering_agent"
@@ -103,8 +105,8 @@ class FeatureEngineeringAgent(BaseAgent):
         retry_count=0
     )
         Engineers features from the provided dataset synchronously based on user instructions.
-    explain_feature_engineering_steps()
-        Returns an explanation of the feature engineering steps performed by the agent.
+    get_workflow_summary()
+        Retrieves a summary of the agent's workflow.
     get_log_summary()
         Retrieves a summary of logged operations if logging is enabled.
     get_data_engineered()
@@ -201,7 +203,7 @@ class FeatureEngineeringAgent(BaseAgent):
             self._params[k] = v
         self._compiled_graph = self._make_compiled_graph()
-    def ainvoke_agent(
+    async def ainvoke_agent(
         self,
         data_raw: pd.DataFrame,
         user_instructions: str=None,
@@ -233,7 +235,7 @@ class FeatureEngineeringAgent(BaseAgent):
         -------
         None
         """
-        response = self._compiled_graph.ainvoke({
+        response = await self._compiled_graph.ainvoke({
             "user_instructions": user_instructions,
             "data_raw": data_raw.to_dict(),
             "target_variable": target_variable,
@@ -285,40 +287,34 @@ class FeatureEngineeringAgent(BaseAgent):
         self.response = response
         return None
-    def explain_feature_engineering_steps(self):
+    def get_workflow_summary(self, markdown=False):
         """
-        Provides an explanation of the feature engineering steps performed by the agent.
-        Returns
-        -------
-        str or list
-            Explanation of the feature engineering steps.
+        Retrieves the agent's workflow summary, if logging is enabled.
         """
-        if self.response:
-            return self.response.get("messages", [])
-        return []
+        if self.response and self.response.get("messages"):
+            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
+            if markdown:
+                return Markdown(summary)
+            else:
+                return summary
     def get_log_summary(self, markdown=False):
         """
         Logs a summary of the agent's operations, if logging is enabled.
+        """
+        if self.response:
+            if self.response.get('feature_engineer_function_path'):
+                log_details = f"""
+## Featuring Engineering Agent Log Summary:
-        Parameters
-        ----------
-        markdown : bool, optional
-            If True, returns Markdown-formatted output.
+Function Path: {self.response.get('feature_engineer_function_path')}
-        Returns
-        -------
-        str or None
-            Summary of logs, or None if not available.
-        """
-        if self.response and self.response.get("feature_engineer_function_path"):
-            log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
-            if markdown:
-                return Markdown(log_details)
-            else:
-                return log_details
-        return None
+Function Name: {self.response.get('feature_engineer_function_name')}
+                """
+                if markdown:
+                    return Markdown(log_details)
+                else:
+                    return log_details
     def get_data_engineered(self):
         """
@@ -388,22 +384,7 @@ class FeatureEngineeringAgent(BaseAgent):
             return steps
         return None
-    def get_response(self):
-        """
-        Returns the agent's full response dictionary.
-        Returns
-        -------
-        dict or None
-            The response dictionary if available, otherwise None.
-        """
-        return self.response
-    def show(self):
-        """
-        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
-        """
-        return self._compiled_graph.show()
 # * Feature Engineering Agent
@@ -576,7 +557,7 @@ def make_feature_engineering_agent(
             Below are summaries of all datasets provided:
             {all_datasets_summary}
-            Return the steps as a numbered list (no code, just the steps).
+            Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
             Avoid these:
             1. Do not include steps to save files.
@@ -649,7 +630,6 @@ def make_feature_engineering_agent(
         feature_engineering_prompt = PromptTemplate(
             template="""
             You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
             Recommended Steps:
@@ -763,21 +743,22 @@ def make_feature_engineering_agent(
             function_name=state.get("feature_engineer_function_name"),
         )
-    def explain_feature_engineering_code(state: GraphState):
-        return node_func_explain_agent_code(
+    # Final reporting node
+    def report_agent_outputs(state: GraphState):
+        return node_func_report_agent_outputs(
             state=state,
-            code_snippet_key="feature_engineer_function",
+            keys_to_include=[
+                "recommended_steps",
+                "feature_engineer_function",
+                "feature_engineer_function_path",
+                "feature_engineer_function_name",
+                "feature_engineer_error",
+            ],
             result_key="messages",
-            error_key="feature_engineer_error",
-            llm=llm,
             role=AGENT_NAME,
-            explanation_prompt_template="""
-            Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
-            """,
-            success_prefix="# Feature Engineering Agent:\n\n ",
-            error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
+            custom_title="Feature Engineering Agent Outputs"
         )
     # Create the graph
     node_functions = {
         "recommend_feature_engineering_steps": recommend_feature_engineering_steps,
@@ -785,7 +766,7 @@ def make_feature_engineering_agent(
         "create_feature_engineering_code": create_feature_engineering_code,
         "execute_feature_engineering_code": execute_feature_engineering_code,
         "fix_feature_engineering_code": fix_feature_engineering_code,
-        "explain_feature_engineering_code": explain_feature_engineering_code
+        "report_agent_outputs": report_agent_outputs,
     }
     app = create_coding_agent_graph(
@@ -795,7 +776,7 @@ def make_feature_engineering_agent(
         create_code_node_name="create_feature_engineering_code",
         execute_code_node_name="execute_feature_engineering_code",
         fix_code_node_name="fix_feature_engineering_code",
-        explain_code_node_name="explain_feature_engineering_code",
+        explain_code_node_name="report_agent_outputs",
         error_key="feature_engineer_error",
         max_retries_key = "max_retries",
         retry_count_key = "retry_count",

ai_data_science_team/agents/sql_database_agent.py CHANGED Viewed

@@ -5,11 +5,13 @@ import operator
 from langchain.prompts import PromptTemplate
 from langchain_core.messages import BaseMessage
+from langchain_core.output_parsers import JsonOutputParser
 from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver
 import os
+import json
 import pandas as pd
 import sqlalchemy as sql
@@ -19,14 +21,19 @@ from ai_data_science_team.templates import(
     node_func_execute_agent_from_sql_connection,
     node_func_human_review,
     node_func_fix_agent_code,
-    node_func_explain_agent_code,
+    node_func_report_agent_outputs,
     create_coding_agent_graph,
     BaseAgent,
 )
-from ai_data_science_team.tools.parsers import SQLOutputParser
-from ai_data_science_team.tools.regex import add_comments_to_top, format_agent_name, format_recommended_steps
-from ai_data_science_team.tools.metadata import get_database_metadata
-from ai_data_science_team.tools.logging import log_ai_function
+from ai_data_science_team.parsers.parsers import SQLOutputParser
+from ai_data_science_team.utils.regex import (
+    add_comments_to_top,
+    format_agent_name,
+    format_recommended_steps,
+    get_generic_summary,
+)
+from ai_data_science_team.tools.sql import get_database_metadata
+from ai_data_science_team.utils.logging import log_ai_function
 # Setup
 AGENT_NAME = "sql_database_agent"
@@ -51,7 +58,7 @@ class SQLDatabaseAgent(BaseAgent):
     connection : sqlalchemy.engine.base.Engine or sqlalchemy.engine.base.Connection
         The SQLAlchemy connection (or engine) to the database.
     n_samples : int, optional
-        Number of sample rows (per column) to retrieve when summarizing database metadata. Defaults to 10.
+        Number of sample rows (per column) to retrieve when summarizing database metadata. Defaults to 1.
     log : bool, optional
         Whether to log the generated code and errors. Defaults to False.
     log_path : str, optional
@@ -68,6 +75,8 @@ class SQLDatabaseAgent(BaseAgent):
         If True, skips the step that generates recommended SQL steps. Defaults to False.
     bypass_explain_code : bool, optional
         If True, skips the step that provides code explanations. Defaults to False.
+    smart_schema_pruning : bool, optional
+        If True, filters the tables and columns based on the user instructions and recommended steps. Defaults to False.
     Methods
     -------
@@ -77,8 +86,8 @@ class SQLDatabaseAgent(BaseAgent):
         Asynchronously runs the agent to generate and execute a SQL query based on user instructions.
     invoke_agent(user_instructions: str, max_retries=3, retry_count=0)
         Synchronously runs the agent to generate and execute a SQL query based on user instructions.
-    explain_sql_steps()
-        Returns an explanation of the SQL steps performed by the agent.
+    get_workflow_summary()
+        Retrieves a summary of the agent's workflow.
     get_log_summary()
         Retrieves a summary of logged operations if logging is enabled.
     get_data_sql()
@@ -139,7 +148,7 @@ class SQLDatabaseAgent(BaseAgent):
         self,
         model,
         connection,
-        n_samples=10,
+        n_samples=1,
         log=False,
         log_path=None,
         file_name="sql_database.py",
@@ -147,7 +156,8 @@ class SQLDatabaseAgent(BaseAgent):
         overwrite=True,
         human_in_the_loop=False,
         bypass_recommended_steps=False,
-        bypass_explain_code=False
+        bypass_explain_code=False,
+        smart_schema_pruning=False,
     ):
         self._params = {
             "model": model,
@@ -160,7 +170,8 @@ class SQLDatabaseAgent(BaseAgent):
             "overwrite": overwrite,
             "human_in_the_loop": human_in_the_loop,
             "bypass_recommended_steps": bypass_recommended_steps,
-            "bypass_explain_code": bypass_explain_code
+            "bypass_explain_code": bypass_explain_code,
+            "smart_schema_pruning": smart_schema_pruning,
         }
         self._compiled_graph = self._make_compiled_graph()
         self.response = None
@@ -182,7 +193,7 @@ class SQLDatabaseAgent(BaseAgent):
             self._params[k] = v
         self._compiled_graph = self._make_compiled_graph()
-    def ainvoke_agent(self, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs):
+    async def ainvoke_agent(self, user_instructions: str=None, max_retries=3, retry_count=0, **kwargs):
         """
         Asynchronously runs the SQL Database Agent based on user instructions.
@@ -201,7 +212,7 @@ class SQLDatabaseAgent(BaseAgent):
         -------
         None
         """
-        response = self._compiled_graph.ainvoke({
+        response = await self._compiled_graph.ainvoke({
             "user_instructions": user_instructions,
             "max_retries": max_retries,
             "retry_count": retry_count
@@ -234,40 +245,34 @@ class SQLDatabaseAgent(BaseAgent):
         }, **kwargs)
         self.response = response
-    def explain_sql_steps(self):
+    def get_workflow_summary(self, markdown=False):
         """
-        Provides an explanation of the SQL steps performed by the agent
-        if the explain step is not bypassed.
-        Returns
-        -------
-        str or list
-            An explanation of the SQL steps.
+        Retrieves the agent's workflow summary, if logging is enabled.
         """
-        if self.response:
-            return self.response.get("messages", [])
-        return []
+        if self.response and self.response.get("messages"):
+            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
+            if markdown:
+                return Markdown(summary)
+            else:
+                return summary
     def get_log_summary(self, markdown=False):
         """
-        Retrieves a summary of the logging details if logging is enabled.
+        Logs a summary of the agent's operations, if logging is enabled.
+        """
+        if self.response:
+            if self.response.get('sql_database_function_path'):
+                log_details = f"""
+## SQL Database Agent Log Summary:
-        Parameters
-        ----------
-        markdown : bool, optional
-            If True, returns the summary in Markdown format.
+Function Path: {self.response.get('sql_database_function_path')}
-        Returns
-        -------
-        str or None
-            Log details or None if logging is not used or data is unavailable.
-        """
-        if self.response and self.response.get("sql_database_function_path"):
-            log_details = f"Log Path: {self.response['sql_database_function_path']}"
-            if markdown:
-                return Markdown(log_details)
-            return log_details
-        return None
+Function Name: {self.response.get('sql_database_function_name')}
+                """
+                if markdown:
+                    return Markdown(log_details)
+                else:
+                    return log_details
     def get_data_sql(self):
         """
@@ -351,14 +356,16 @@ class SQLDatabaseAgent(BaseAgent):
 def make_sql_database_agent(
     model,
     connection,
-    n_samples = 10,
+    n_samples=1,
     log=False,
     log_path=None,
     file_name="sql_database.py",
     function_name="sql_database_pipeline",
     overwrite = True,
-    human_in_the_loop=False, bypass_recommended_steps=False,
-    bypass_explain_code=False
+    human_in_the_loop=False,
+    bypass_recommended_steps=False,
+    bypass_explain_code=False,
+    smart_schema_pruning=False,
 ):
     """
     Creates a SQL Database Agent that can recommend SQL steps and generate SQL code to query a database.
@@ -370,7 +377,7 @@ def make_sql_database_agent(
     connection : sqlalchemy.engine.base.Engine
         The connection to the SQL database.
     n_samples : int, optional
-        The number of samples to retrieve for each column, by default 10.
+        The number of samples to retrieve for each column, by default 1.
         If you get an error due to maximum tokens, try reducing this number.
         > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
     log : bool, optional
@@ -387,6 +394,8 @@ def make_sql_database_agent(
         Bypass the recommendation step, by default False
     bypass_explain_code : bool, optional
         Bypass the code explanation step, by default False.
+    smart_schema_pruning : bool, optional
+        If True, filters the tables and columns with an extra LLM step to reduce tokens for large databases. Increases processing time but can avoid errors due to hitting max token limits with large databases. Defaults to False.
     Returns
     -------
@@ -419,12 +428,8 @@ def make_sql_database_agent(
         "retry_count":0
     })
     ```
     """
-    is_engine = isinstance(connection, sql.engine.base.Engine)
-    conn = connection.connect() if is_engine else connection
     llm = model
     # Human in th loop requires recommended steps
@@ -438,6 +443,11 @@ def make_sql_database_agent(
             log_path = LOG_PATH
         if not os.path.exists(log_path):
             os.makedirs(log_path)
+    # Get the database metadata
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
     class GraphState(TypedDict):
         messages: Annotated[Sequence[BaseMessage], operator.add]
@@ -457,6 +467,16 @@ def make_sql_database_agent(
     def recommend_sql_steps(state: GraphState):
         print(format_agent_name(AGENT_NAME))
+        all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)
+        all_sql_database_summary = smart_schema_filter(
+            llm,
+            state.get("user_instructions"),
+            all_sql_database_summary,
+            smart_filtering=smart_schema_pruning
+        )
         print("    * RECOMMEND STEPS")
@@ -485,7 +505,7 @@ def make_sql_database_agent(
             Below are summaries of the database metadata and the SQL tables:
             {all_sql_database_summary}
-            Return the steps as a numbered point list (no code, just the steps).
+            Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
             Consider these:
@@ -504,13 +524,6 @@ def make_sql_database_agent(
             input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
         )
-        # Create a connection if needed
-        is_engine = isinstance(connection, sql.engine.base.Engine)
-        conn = connection.connect() if is_engine else connection
-        # Get the database metadata
-        all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)
         steps_agent = recommend_steps_prompt | llm
         recommended_steps = steps_agent.invoke({
@@ -527,6 +540,15 @@ def make_sql_database_agent(
     def create_sql_query_code(state: GraphState):
         if bypass_recommended_steps:
             print(format_agent_name(AGENT_NAME))
+            all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)
+            all_sql_database_summary = smart_schema_filter(
+                llm,
+                state.get("user_instructions"),
+                all_sql_database_summary,
+                smart_filtering=smart_schema_pruning
+            )
+        else:
+            all_sql_database_summary = state.get("all_sql_database_summary")
         print("    * CREATE SQL QUERY CODE")
         # Prompt to get the SQL code from the LLM
@@ -567,13 +589,6 @@ def make_sql_database_agent(
             input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
         )
-        # Create a connection if needed
-        is_engine = isinstance(connection, sql.engine.base.Engine)
-        conn = connection.connect() if is_engine else connection
-        # Get the database metadata
-        all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)
         sql_query_code_agent = sql_query_code_prompt | llm | SQLOutputParser()
         sql_query_code = sql_query_code_agent.invoke({
@@ -690,20 +705,20 @@ def {function_name}(connection):
             function_name=state.get("sql_database_function_name"),
         )
-    def explain_sql_database_code(state: GraphState):
-        return node_func_explain_agent_code(
+    # Final reporting node
+    def report_agent_outputs(state: GraphState):
+        return node_func_report_agent_outputs(
             state=state,
-            code_snippet_key="sql_database_function",
+            keys_to_include=[
+                "recommended_steps",
+                "sql_database_function",
+                "sql_database_function_path",
+                "sql_database_function_name",
+                "sql_database_error",
+            ],
             result_key="messages",
-            error_key="sql_database_error",
-            llm=llm,
             role=AGENT_NAME,
-            explanation_prompt_template="""
-            Explain the SQL steps that the SQL Database agent performed in this function.
-            Keep the summary succinct and to the point.\n\n# SQL Database Agent:\n\n{code}
-            """,
-            success_prefix="# SQL Database Agent:\n\n",
-            error_message="The SQL Database Agent encountered an error during SQL Query Analysis. No SQL function explanation is returned."
+            custom_title="SQL Database Agent Outputs"
         )
     # Create the graph
@@ -713,7 +728,7 @@ def {function_name}(connection):
         "create_sql_query_code": create_sql_query_code,
         "execute_sql_database_code": execute_sql_database_code,
         "fix_sql_database_code": fix_sql_database_code,
-        "explain_sql_database_code": explain_sql_database_code
+        "report_agent_outputs": report_agent_outputs,
     }
     app = create_coding_agent_graph(
@@ -723,7 +738,7 @@ def {function_name}(connection):
         create_code_node_name="create_sql_query_code",
         execute_code_node_name="execute_sql_database_code",
         fix_code_node_name="fix_sql_database_code",
-        explain_code_node_name="explain_sql_database_code",
+        explain_code_node_name="report_agent_outputs",
         error_key="sql_database_error",
         human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
@@ -737,7 +752,46 @@ def {function_name}(connection):
+def smart_schema_filter(llm, user_instructions, all_sql_database_summary, smart_filtering = True):
+    """
+    Optionally prunes the database metadata down to what is relevant to the user's question.
+    When smart_filtering is False the metadata is returned unchanged and the LLM is not called.
+    Parameters
+    ----------
+    llm :
+        The language model piped between the filtering prompt and the JSON output parser.
+    user_instructions :
+        The user's question; inserted into the filtering prompt.
+    all_sql_database_summary :
+        The full database metadata; inserted into the filtering prompt.
+    smart_filtering : bool, optional
+        If True, ask the LLM for the relevant subset of the metadata. Defaults to True.
+    Returns
+    -------
+    The JSON parsed from the LLM response when smart_filtering is True,
+    otherwise all_sql_database_summary exactly as passed in.
+    """
+    # Guard clause: nothing to prune, so skip the extra LLM round trip.
+    if not smart_filtering:
+        return all_sql_database_summary
+    print("    * SMART FILTER SCHEMA")
+    filter_schema_prompt = PromptTemplate(
+        template="""
+            You are a highly skilled data engineer. The user question is:
+                "{user_instructions}"
+            You have the full database metadata in JSON format below:
+                {all_sql_database_summary}
+            Please return ONLY the subset of this metadata that is relevant to answering the user’s question.
+            - Preserve the same JSON structure for "schemas" -> "tables" -> "columns".
+            - If any schemas/tables are irrelevant, omit them entirely.
+            - If some columns in a relevant table are not needed, you can still keep them if you aren't sure.
+            - However, try to keep only the minimum amount of data required to answer the user’s question.
+            Return a valid JSON object. Do not include any additional explanation or text outside of the JSON.
+            """,
+        # Must name the placeholders actually used in the template above and
+        # supplied to invoke() below (previously declared "full_metadata_json").
+        input_variables=["user_instructions", "all_sql_database_summary"]
+    )
+    filter_schema_agent = filter_schema_prompt | llm | JsonOutputParser()
+    return filter_schema_agent.invoke({
+        "user_instructions": user_instructions,
+        "all_sql_database_summary": all_sql_database_summary
+    })

ai_data_science_team/ml_agents/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from ai_data_science_team.ml_agents.h2o_ml_agent import make_h2o_ml_agent, H2OMLAgent
2	+ from ai_data_science_team.ml_agents.mlflow_tools_agent import make_mlflow_tools_agent, MLflowToolsAgent

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl