PyPI - ai-data-science-team - Versions diffs - 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl - Mend

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

ai_data_science_team/_version.py +1 -1
ai_data_science_team/agents/__init__.py +4 -5
ai_data_science_team/agents/data_cleaning_agent.py +268 -116
ai_data_science_team/agents/data_visualization_agent.py +470 -41
ai_data_science_team/agents/data_wrangling_agent.py +471 -31
ai_data_science_team/agents/feature_engineering_agent.py +426 -41
ai_data_science_team/agents/sql_database_agent.py +458 -58
ai_data_science_team/ml_agents/__init__.py +1 -0
ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
ai_data_science_team/multiagents/__init__.py +1 -0
ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
ai_data_science_team/templates/__init__.py +3 -1
ai_data_science_team/templates/agent_templates.py +319 -43
ai_data_science_team/tools/metadata.py +94 -62
ai_data_science_team/tools/regex.py +86 -1
ai_data_science_team/utils/__init__.py +0 -0
ai_data_science_team/utils/plotly.py +24 -0
ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
{ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
{ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
{ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0

ai_data_science_team/agents/feature_engineering_agent.py CHANGED Viewed

@@ -14,18 +14,27 @@ from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver
 import os
-import io
+import json
 import pandas as pd
+from IPython.display import Markdown
 from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
-    node_func_explain_agent_code,
-    create_coding_agent_graph
+    node_func_report_agent_outputs,
+    create_coding_agent_graph,
+    BaseAgent,
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
+from ai_data_science_team.tools.regex import (
+    relocate_imports_inside_function,
+    add_comments_to_top,
+    format_agent_name,
+    format_recommended_steps,
+    get_generic_summary,
+)
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
@@ -33,6 +42,351 @@ from ai_data_science_team.tools.logging import log_ai_function
 AGENT_NAME = "feature_engineering_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")
+# Class
+class FeatureEngineeringAgent(BaseAgent):
+    """
+    Creates a feature engineering agent that can process datasets based on user-defined instructions or
+    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
+    and logs the process, including code and errors. It is designed to facilitate reproducible and
+    customizable feature engineering workflows.
+    The agent can perform the following default feature engineering steps unless instructed otherwise:
+    - Convert features to appropriate data types
+    - Remove features that have unique values for each row
+    - Remove constant features
+    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
+    - One-hot-encode categorical variables
+    - Convert booleans to integer (1/0)
+    - Create datetime-based features (if applicable)
+    - Handle target variable encoding if specified
+    - Any user-provided instructions to add, remove, or modify steps
+    Parameters
+    ----------
+    model : langchain.llms.base.LLM
+        The language model used to generate the feature engineering function.
+    n_samples : int, optional
+        Number of samples used when summarizing the dataset. Defaults to 30.
+    log : bool, optional
+        Whether to log the generated code and errors. Defaults to False.
+    log_path : str, optional
+        Directory path for storing log files. Defaults to None.
+    file_name : str, optional
+        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
+    function_name : str, optional
+        Name of the generated feature engineering function. Defaults to "feature_engineer".
+    overwrite : bool, optional
+        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
+    human_in_the_loop : bool, optional
+        Enables user review of feature engineering instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        If True, skips the default recommended steps. Defaults to False.
+    bypass_explain_code : bool, optional
+        If True, skips the step that provides code explanations. Defaults to False.
+    Methods
+    -------
+    update_params(**kwargs)
+        Updates the agent's parameters and rebuilds the compiled state graph.
+    ainvoke_agent(
+        data_raw: pd.DataFrame,
+        user_instructions: str = None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    )
+        Engineers features from the provided dataset asynchronously based on user instructions.
+    invoke_agent(
+        data_raw: pd.DataFrame,
+        user_instructions: str = None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    )
+        Engineers features from the provided dataset synchronously based on user instructions.
+    get_workflow_summary()
+        Retrieves a summary of the agent's workflow.
+    get_log_summary()
+        Retrieves a summary of logged operations if logging is enabled.
+    get_data_engineered()
+        Retrieves the feature-engineered dataset as a pandas DataFrame.
+    get_data_raw()
+        Retrieves the raw dataset as a pandas DataFrame.
+    get_feature_engineer_function()
+        Retrieves the generated Python function used for feature engineering.
+    get_recommended_feature_engineering_steps()
+        Retrieves the agent's recommended feature engineering steps.
+    get_response()
+        Returns the response from the agent as a dictionary.
+    show()
+        Displays the agent's mermaid diagram.
+    Examples
+    --------
+    ```python
+    import pandas as pd
+    from langchain_openai import ChatOpenAI
+    from ai_data_science_team.agents import FeatureEngineeringAgent
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    feature_agent = FeatureEngineeringAgent(
+        model=llm,
+        n_samples=30,
+        log=True,
+        log_path="logs",
+        human_in_the_loop=True
+    )
+    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
+    feature_agent.invoke_agent(
+        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
+        data_raw=df,
+        target_variable="Churn",
+        max_retries=3,
+        retry_count=0
+    )
+    engineered_data = feature_agent.get_data_engineered()
+    response = feature_agent.get_response()
+    ```
+    Returns
+    -------
+    FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
+        A feature engineering agent implemented as a compiled state graph.
+    """
+    def __init__(
+        self,
+        model,
+        n_samples=30,
+        log=False,
+        log_path=None,
+        file_name="feature_engineer.py",
+        function_name="feature_engineer",
+        overwrite=True,
+        human_in_the_loop=False,
+        bypass_recommended_steps=False,
+        bypass_explain_code=False
+    ):
+        self._params = {
+            "model": model,
+            "n_samples": n_samples,
+            "log": log,
+            "log_path": log_path,
+            "file_name": file_name,
+            "function_name": function_name,
+            "overwrite": overwrite,
+            "human_in_the_loop": human_in_the_loop,
+            "bypass_recommended_steps": bypass_recommended_steps,
+            "bypass_explain_code": bypass_explain_code
+        }
+        self._compiled_graph = self._make_compiled_graph()
+        self.response = None
+    def _make_compiled_graph(self):
+        """
+        Create the compiled graph for the feature engineering agent.
+        Running this method will reset the response to None.
+        """
+        self.response = None
+        return make_feature_engineering_agent(**self._params)
+    def update_params(self, **kwargs):
+        """
+        Updates the agent's parameters and rebuilds the compiled graph.
+        """
+        for k, v in kwargs.items():
+            self._params[k] = v
+        self._compiled_graph = self._make_compiled_graph()
+    def ainvoke_agent(
+        self,
+        data_raw: pd.DataFrame,
+        user_instructions: str=None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    ):
+        """
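+        Asynchronously engineers features for the provided dataset.
+        The value returned by the compiled graph's ainvoke() call is stored in the 'response' attribute.
+        NOTE(review): this method is a plain `def` and does not await ainvoke(), so 'response'
+        presumably holds an un-awaited awaitable rather than the final graph state - confirm.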
+        Parameters
+        ----------
+        data_raw : pd.DataFrame
+            The raw dataset to be processed.
+        user_instructions : str, optional
+            Instructions for feature engineering.
+        target_variable : str, optional
+            The name of the target variable (if any).
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to ainvoke().
+        Returns
+        -------
+        None
+        """
+        response = self._compiled_graph.ainvoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_raw.to_dict(),
+            "target_variable": target_variable,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def invoke_agent(
+        self,
+        data_raw: pd.DataFrame,
+        user_instructions: str=None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    ):
+        """
+        Synchronously engineers features for the provided dataset.
+        The response is stored in the 'response' attribute.
+        Parameters
+        ----------
+        data_raw : pd.DataFrame
+            The raw dataset to be processed.
+        user_instructions : str, optional
+            Instructions for feature engineering agent.
+        target_variable : str, optional
+            The name of the target variable (if any).
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to invoke().
+        Returns
+        -------
+        None
+        """
+        response = self._compiled_graph.invoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_raw.to_dict(),
+            "target_variable": target_variable,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def get_workflow_summary(self, markdown=False):
+        """
+        Retrieves the agent's workflow summary from the last message of the response, if one is available.
+        """
+        if self.response and self.response.get("messages"):
+            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
+            if markdown:
+                return Markdown(summary)
+            else:
+                return summary
+    def get_log_summary(self, markdown=False):
+        """
+        Returns a summary of the logged function path and name, if a function path was recorded.
+        """
+        if self.response:
+            if self.response.get('feature_engineer_function_path'):
+                log_details = f"""
+## Featuring Engineering Agent Log Summary:
+Function Path: {self.response.get('feature_engineer_function_path')}
+Function Name: {self.response.get('feature_engineer_function_name')}
+                """
+                if markdown:
+                    return Markdown(log_details)
+                else:
+                    return log_details
+    def get_data_engineered(self):
+        """
+        Retrieves the engineered data stored after running invoke/ainvoke.
+        Returns
+        -------
+        pd.DataFrame or None
+            The engineered dataset as a pandas DataFrame.
+        """
+        if self.response and "data_engineered" in self.response:
+            return pd.DataFrame(self.response["data_engineered"])
+        return None
+    def get_data_raw(self):
+        """
+        Retrieves the raw data.
+        Returns
+        -------
+        pd.DataFrame or None
+            The raw dataset as a pandas DataFrame if available.
+        """
+        if self.response and "data_raw" in self.response:
+            return pd.DataFrame(self.response["data_raw"])
+        return None
+    def get_feature_engineer_function(self, markdown=False):
+        """
+        Retrieves the feature engineering function generated by the agent.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the function in Markdown code block format.
+        Returns
+        -------
+        str or None
+            The Python function code, or None if unavailable.
+        """
+        if self.response and "feature_engineer_function" in self.response:
+            code = self.response["feature_engineer_function"]
+            if markdown:
+                return Markdown(f"```python\n{code}\n```")
+            return code
+        return None
+    def get_recommended_feature_engineering_steps(self, markdown=False):
+        """
+        Retrieves the agent's recommended feature engineering steps.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the steps in Markdown format.
+        Returns
+        -------
+        str or None
+            The recommended steps, or None if not available.
+        """
+        if self.response and "recommended_steps" in self.response:
+            steps = self.response["recommended_steps"]
+            if markdown:
+                return Markdown(steps)
+            return steps
+        return None
 # * Feature Engineering Agent
 def make_feature_engineering_agent(
@@ -41,6 +395,7 @@ def make_feature_engineering_agent(
     log=False,
     log_path=None,
     file_name="feature_engineer.py",
+    function_name="feature_engineer",
     overwrite = True,
     human_in_the_loop=False,
     bypass_recommended_steps=False,
@@ -82,6 +437,8 @@ def make_feature_engineering_agent(
         The path to the directory where the log files should be stored. Defaults to "logs/".
     file_name : str, optional
         The name of the file to save the log to. Defaults to "feature_engineer.py".
+    function_name : str, optional
+        The name of the function that will be generated. Defaults to "feature_engineer".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
@@ -122,6 +479,11 @@ def make_feature_engineering_agent(
         The feature engineering agent as a state graph.
     """
     llm = model
+    # Human in the loop requires recommended steps
+    if bypass_recommended_steps and human_in_the_loop:
+        bypass_recommended_steps = False
+        print("Bypass recommended steps set to False to enable human in the loop.")
     # Setup Log Directory
     if log:
@@ -141,6 +503,7 @@ def make_feature_engineering_agent(
         all_datasets_summary: str
         feature_engineer_function: str
         feature_engineer_function_path: str
+        feature_engineer_file_name: str
         feature_engineer_function_name: str
         feature_engineer_error: str
         max_retries: int
@@ -194,7 +557,7 @@ def make_feature_engineering_agent(
             Below are summaries of all datasets provided:
             {all_datasets_summary}
-            Return the steps as a numbered list (no code, just the steps).
+            Return steps as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The code will be generated separately by a Coding Agent.
             Avoid these:
             1. Do not include steps to save files.
@@ -218,19 +581,36 @@ def make_feature_engineering_agent(
         })
         return {
-            "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
+            "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
             "all_datasets_summary": all_datasets_summary_str
         }
-    def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
-        return node_func_human_review(
-            state=state,
-            prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
-            yes_goto="create_feature_engineering_code",
-            no_goto="recommend_feature_engineering_steps",
-            user_instructions_key="user_instructions",
-            recommended_steps_key="recommended_steps"
-        )
+    # Human Review
+    prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
+    if not bypass_explain_code:
+        def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= 'explain_feature_engineering_code',
+                no_goto="recommend_feature_engineering_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="feature_engineer_function",
+            )
+    else:
+        def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= '__end__',
+                no_goto="recommend_feature_engineering_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="feature_engineer_function",
+            )
     def create_feature_engineering_code(state: GraphState):
         if bypass_recommended_steps:
@@ -250,8 +630,7 @@ def make_feature_engineering_agent(
         feature_engineering_prompt = PromptTemplate(
             template="""
-            You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
+            You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
             Recommended Steps:
             {recommended_steps}
@@ -265,11 +644,11 @@ def make_feature_engineering_agent(
             You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
-            Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.
+            Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
             Return code to provide the feature engineering function:
-            def feature_engineer(data_raw):
+            def {function_name}(data_raw):
                 import pandas as pd
                 import numpy as np
                 ...
@@ -292,7 +671,7 @@ def make_feature_engineering_agent(
             """,
-            input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
+            input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
         )
         feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
@@ -301,6 +680,7 @@ def make_feature_engineering_agent(
             "recommended_steps": state.get("recommended_steps"),
             "target_variable": state.get("target_variable"),
             "all_datasets_summary": all_datasets_summary_str,
+            "function_name": function_name
         })
         response = relocate_imports_inside_function(response)
@@ -318,12 +698,11 @@ def make_feature_engineering_agent(
         return {
             "feature_engineer_function": response,
             "feature_engineer_function_path": file_path,
-            "feature_engineer_function_name": file_name_2,
+            "feature_engineer_file_name": file_name_2,
+            "feature_engineer_function_name": function_name,
             "all_datasets_summary": all_datasets_summary_str
         }
     def execute_feature_engineering_code(state):
         return node_func_execute_agent_code_on_data(
             state=state,
@@ -331,7 +710,7 @@ def make_feature_engineering_agent(
             result_key="data_engineered",
             error_key="feature_engineer_error",
             code_snippet_key="feature_engineer_function",
-            agent_function_name="feature_engineer",
+            agent_function_name=state.get("feature_engineer_function_name"),
             pre_processing=lambda data: pd.DataFrame.from_dict(data),
             post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during feature engineering: "
@@ -339,11 +718,13 @@ def make_feature_engineering_agent(
     def fix_feature_engineering_code(state: GraphState):
         feature_engineer_prompt = """
-        You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.
+        You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
+        Provide only the corrected function definition for {function_name}().
-        Provide only the corrected function definition.
+        Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
-        Broken code:
+        This is the broken code (please fix):
         {code_snippet}
         Last Known Error:
@@ -359,23 +740,25 @@ def make_feature_engineering_agent(
             agent_name=AGENT_NAME,
             log=log,
             file_path=state.get("feature_engineer_function_path"),
+            function_name=state.get("feature_engineer_function_name"),
         )
-    def explain_feature_engineering_code(state: GraphState):
-        return node_func_explain_agent_code(
+    # Final reporting node
+    def report_agent_outputs(state: GraphState):
+        return node_func_report_agent_outputs(
             state=state,
-            code_snippet_key="feature_engineer_function",
+            keys_to_include=[
+                "recommended_steps",
+                "feature_engineer_function",
+                "feature_engineer_function_path",
+                "feature_engineer_function_name",
+                "feature_engineer_error",
+            ],
             result_key="messages",
-            error_key="feature_engineer_error",
-            llm=llm,
             role=AGENT_NAME,
-            explanation_prompt_template="""
-            Explain the feature engineering steps performed by this function. Keep the explanation clear and concise.\n\n# Feature Engineering Agent:\n\n{code}
-            """,
-            success_prefix="# Feature Engineering Agent:\n\n ",
-            error_message="The Feature Engineering Agent encountered an error during feature engineering. Data could not be explained."
+            custom_title="Feature Engineering Agent Outputs"
         )
     # Create the graph
     node_functions = {
         "recommend_feature_engineering_steps": recommend_feature_engineering_steps,
@@ -383,7 +766,7 @@ def make_feature_engineering_agent(
         "create_feature_engineering_code": create_feature_engineering_code,
         "execute_feature_engineering_code": execute_feature_engineering_code,
         "fix_feature_engineering_code": fix_feature_engineering_code,
-        "explain_feature_engineering_code": explain_feature_engineering_code
+        "report_agent_outputs": report_agent_outputs,
     }
     app = create_coding_agent_graph(
@@ -393,11 +776,13 @@ def make_feature_engineering_agent(
         create_code_node_name="create_feature_engineering_code",
         execute_code_node_name="execute_feature_engineering_code",
         fix_code_node_name="fix_feature_engineering_code",
-        explain_code_node_name="explain_feature_engineering_code",
+        explain_code_node_name="report_agent_outputs",
         error_key="feature_engineer_error",
+        max_retries_key = "max_retries",
+        retry_count_key = "retry_count",
         human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
-        checkpointer=MemorySaver() if human_in_the_loop else None,
+        checkpointer=MemorySaver(),
         bypass_recommended_steps=bypass_recommended_steps,
         bypass_explain_code=bypass_explain_code,
     )

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl