PyPI - ai-data-science-team - Versions diffs - 0.0.0.9006__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl - Mend

ai-data-science-team 0.0.0.9006__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

ai_data_science_team/agents/feature_engineering_agent.py CHANGED Viewed

@@ -14,18 +14,25 @@ from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver
 import os
-import io
 import pandas as pd
-from ai_data_science_team.templates.agent_templates import(
+from IPython.display import Markdown
+from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
     node_func_explain_agent_code,
-    create_coding_agent_graph
+    create_coding_agent_graph,
+    BaseAgent,
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.regex import (
+    relocate_imports_inside_function,
+    add_comments_to_top,
+    format_agent_name,
+    format_recommended_steps
+)
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
@@ -33,9 +40,386 @@ from ai_data_science_team.tools.logging import log_ai_function
 AGENT_NAME = "feature_engineering_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")
+# Class
+class FeatureEngineeringAgent(BaseAgent):
+    """
+    Creates a feature engineering agent that can process datasets based on user-defined instructions or
+    default feature engineering steps. The agent generates a Python function to engineer features, executes it,
+    and logs the process, including code and errors. It is designed to facilitate reproducible and
+    customizable feature engineering workflows.
+    The agent can perform the following default feature engineering steps unless instructed otherwise:
+    - Convert features to appropriate data types
+    - Remove features that have unique values for each row
+    - Remove constant features
+    - Encode high-cardinality categoricals (threshold <= 5% of dataset) as 'other'
+    - One-hot-encode categorical variables
+    - Convert booleans to integer (1/0)
+    - Create datetime-based features (if applicable)
+    - Handle target variable encoding if specified
+    - Any user-provided instructions to add, remove, or modify steps
+    Parameters
+    ----------
+    model : langchain.llms.base.LLM
+        The language model used to generate the feature engineering function.
+    n_samples : int, optional
+        Number of samples used when summarizing the dataset. Defaults to 30.
+    log : bool, optional
+        Whether to log the generated code and errors. Defaults to False.
+    log_path : str, optional
+        Directory path for storing log files. Defaults to None.
+    file_name : str, optional
+        Name of the file for saving the generated response. Defaults to "feature_engineer.py".
+    function_name : str, optional
+        Name of the generated feature engineering function. Defaults to "feature_engineer".
+    overwrite : bool, optional
+        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
+    human_in_the_loop : bool, optional
+        Enables user review of feature engineering instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        If True, skips the default recommended steps. Defaults to False.
+    bypass_explain_code : bool, optional
+        If True, skips the step that provides code explanations. Defaults to False.
+    Methods
+    -------
+    update_params(**kwargs)
+        Updates the agent's parameters and rebuilds the compiled state graph.
+    ainvoke_agent(
+        data_raw: pd.DataFrame,
+        user_instructions: str = None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    )
+        Engineers features from the provided dataset asynchronously based on user instructions.
+    invoke_agent(
+        data_raw: pd.DataFrame,
+        user_instructions: str = None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    )
+        Engineers features from the provided dataset synchronously based on user instructions.
+    explain_feature_engineering_steps()
+        Returns an explanation of the feature engineering steps performed by the agent.
+    get_log_summary()
+        Retrieves a summary of logged operations if logging is enabled.
+    get_data_engineered()
+        Retrieves the feature-engineered dataset as a pandas DataFrame.
+    get_data_raw()
+        Retrieves the raw dataset as a pandas DataFrame.
+    get_feature_engineer_function()
+        Retrieves the generated Python function used for feature engineering.
+    get_recommended_feature_engineering_steps()
+        Retrieves the agent's recommended feature engineering steps.
+    get_response()
+        Returns the response from the agent as a dictionary.
+    show()
+        Displays the agent's mermaid diagram.
+    Examples
+    --------
+    ```python
+    import pandas as pd
+    from langchain_openai import ChatOpenAI
+    from ai_data_science_team.agents import FeatureEngineeringAgent
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    feature_agent = FeatureEngineeringAgent(
+        model=llm,
+        n_samples=30,
+        log=True,
+        log_path="logs",
+        human_in_the_loop=True
+    )
+    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
+    feature_agent.invoke_agent(
+        user_instructions="Also encode the 'PaymentMethod' column with one-hot encoding.",
+        data_raw=df,
+        target_variable="Churn",
+        max_retries=3,
+        retry_count=0
+    )
+    engineered_data = feature_agent.get_data_engineered()
+    response = feature_agent.get_response()
+    ```
+    Returns
+    -------
+    FeatureEngineeringAgent : langchain.graphs.CompiledStateGraph
+        A feature engineering agent implemented as a compiled state graph.
+    """
+    def __init__(
+        self,
+        model,
+        n_samples=30,
+        log=False,
+        log_path=None,
+        file_name="feature_engineer.py",
+        function_name="feature_engineer",
+        overwrite=True,
+        human_in_the_loop=False,
+        bypass_recommended_steps=False,
+        bypass_explain_code=False
+    ):
+        self._params = {
+            "model": model,
+            "n_samples": n_samples,
+            "log": log,
+            "log_path": log_path,
+            "file_name": file_name,
+            "function_name": function_name,
+            "overwrite": overwrite,
+            "human_in_the_loop": human_in_the_loop,
+            "bypass_recommended_steps": bypass_recommended_steps,
+            "bypass_explain_code": bypass_explain_code
+        }
+        self._compiled_graph = self._make_compiled_graph()
+        self.response = None
+    def _make_compiled_graph(self):
+        """
+        Create the compiled graph for the feature engineering agent.
+        Running this method will reset the response to None.
+        """
+        self.response = None
+        return make_feature_engineering_agent(**self._params)
+    def update_params(self, **kwargs):
+        """
+        Updates the agent's parameters and rebuilds the compiled graph.
+        """
+        for k, v in kwargs.items():
+            self._params[k] = v
+        self._compiled_graph = self._make_compiled_graph()
+    def ainvoke_agent(
+        self,
+        data_raw: pd.DataFrame,
+        user_instructions: str=None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    ):
+        """
+        Asynchronously engineers features for the provided dataset.
+        The response is stored in the 'response' attribute.
+        Parameters
+        ----------
+        data_raw : pd.DataFrame
+            The raw dataset to be processed.
+        user_instructions : str, optional
+            Instructions for feature engineering.
+        target_variable : str, optional
+            The name of the target variable (if any).
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to ainvoke().
+        Returns
+        -------
+        None
+        """
+        response = self._compiled_graph.ainvoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_raw.to_dict(),
+            "target_variable": target_variable,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def invoke_agent(
+        self,
+        data_raw: pd.DataFrame,
+        user_instructions: str=None,
+        target_variable: str = None,
+        max_retries=3,
+        retry_count=0,
+        **kwargs
+    ):
+        """
+        Synchronously engineers features for the provided dataset.
+        The response is stored in the 'response' attribute.
+        Parameters
+        ----------
+        data_raw : pd.DataFrame
+            The raw dataset to be processed.
+        user_instructions : str, optional
+            Instructions for feature engineering.
+        target_variable : str, optional
+            The name of the target variable (if any).
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to invoke().
+        Returns
+        -------
+        None
+        """
+        response = self._compiled_graph.invoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_raw.to_dict(),
+            "target_variable": target_variable,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def explain_feature_engineering_steps(self):
+        """
+        Provides an explanation of the feature engineering steps performed by the agent.
+        Returns
+        -------
+        str or list
+            Explanation of the feature engineering steps.
+        """
+        if self.response:
+            return self.response.get("messages", [])
+        return []
+    def get_log_summary(self, markdown=False):
+        """
+        Retrieves a summary of the agent's logged operations, if logging is enabled.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns Markdown-formatted output.
+        Returns
+        -------
+        str or None
+            Summary of logs, or None if not available.
+        """
+        if self.response and self.response.get("feature_engineer_function_path"):
+            log_details = f"Log Path: {self.response.get('feature_engineer_function_path')}"
+            if markdown:
+                return Markdown(log_details)
+            else:
+                return log_details
+        return None
+    def get_data_engineered(self):
+        """
+        Retrieves the engineered data stored after running invoke/ainvoke.
+        Returns
+        -------
+        pd.DataFrame or None
+            The engineered dataset as a pandas DataFrame.
+        """
+        if self.response and "data_engineered" in self.response:
+            return pd.DataFrame(self.response["data_engineered"])
+        return None
+    def get_data_raw(self):
+        """
+        Retrieves the raw data.
+        Returns
+        -------
+        pd.DataFrame or None
+            The raw dataset as a pandas DataFrame if available.
+        """
+        if self.response and "data_raw" in self.response:
+            return pd.DataFrame(self.response["data_raw"])
+        return None
+    def get_feature_engineer_function(self, markdown=False):
+        """
+        Retrieves the feature engineering function generated by the agent.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the function in Markdown code block format.
+        Returns
+        -------
+        str or None
+            The Python function code, or None if unavailable.
+        """
+        if self.response and "feature_engineer_function" in self.response:
+            code = self.response["feature_engineer_function"]
+            if markdown:
+                return Markdown(f"```python\n{code}\n```")
+            return code
+        return None
+    def get_recommended_feature_engineering_steps(self, markdown=False):
+        """
+        Retrieves the agent's recommended feature engineering steps.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the steps in Markdown format.
+        Returns
+        -------
+        str or None
+            The recommended steps, or None if not available.
+        """
+        if self.response and "recommended_steps" in self.response:
+            steps = self.response["recommended_steps"]
+            if markdown:
+                return Markdown(steps)
+            return steps
+        return None
+    def get_response(self):
+        """
+        Returns the agent's full response dictionary.
+        Returns
+        -------
+        dict or None
+            The response dictionary if available, otherwise None.
+        """
+        return self.response
+    def show(self):
+        """
+        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
+        """
+        return self._compiled_graph.show()
 # * Feature Engineering Agent
-def make_feature_engineering_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False, bypass_recommended_steps=False, bypass_explain_code=False):
+def make_feature_engineering_agent(
+    model,
+    n_samples=30,
+    log=False,
+    log_path=None,
+    file_name="feature_engineer.py",
+    function_name="feature_engineer",
+    overwrite = True,
+    human_in_the_loop=False,
+    bypass_recommended_steps=False,
+    bypass_explain_code=False,
+):
     """
     Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
     techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
@@ -61,11 +445,19 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
     ----------
     model : langchain.llms.base.LLM
         The language model to use to generate code.
+    n_samples : int, optional
+        The number of data samples to use for generating the feature engineering code. Defaults to 30.
+        If you get an error due to maximum tokens, try reducing this number.
+        > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
     log : bool, optional
         Whether or not to log the code generated and any errors that occur.
         Defaults to False.
     log_path : str, optional
         The path to the directory where the log files should be stored. Defaults to "logs/".
+    file_name : str, optional
+        The name of the file to save the log to. Defaults to "feature_engineer.py".
+    function_name : str, optional
+        The name of the function that will be generated. Defaults to "feature_engineer".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
@@ -102,10 +494,15 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
     Returns
     -------
-    app : langchain.graphs.StateGraph
+    app : langchain.graphs.CompiledStateGraph
         The feature engineering agent as a state graph.
     """
     llm = model
+    # Human in the loop requires recommended steps
+    if bypass_recommended_steps and human_in_the_loop:
+        bypass_recommended_steps = False
+        print("Bypass recommended steps set to False to enable human in the loop.")
     # Setup Log Directory
     if log:
@@ -125,6 +522,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         all_datasets_summary: str
         feature_engineer_function: str
         feature_engineer_function_path: str
+        feature_engineer_file_name: str
         feature_engineer_function_name: str
         feature_engineer_error: str
         max_retries: int
@@ -135,7 +533,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         Recommend a series of feature engineering steps based on the input data.
         These recommended steps will be appended to the user_instructions.
         """
-        print("---FEATURE ENGINEERING AGENT----")
+        print(format_agent_name(AGENT_NAME))
         print("    * RECOMMEND FEATURE ENGINEERING STEPS")
         # Prompt to get recommended steps from the LLM
@@ -182,6 +580,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             Avoid these:
             1. Do not include steps to save files.
+            2. Do not include unrelated user instructions that are not related to the feature engineering.
             """,
             input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
         )
@@ -189,7 +588,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         data_raw = state.get("data_raw")
         df = pd.DataFrame.from_dict(data_raw)
-        all_datasets_summary = get_dataframe_summary([df])
+        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
         all_datasets_summary_str = "\n\n".join(all_datasets_summary)
@@ -201,29 +600,57 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         })
         return {
-            "recommended_steps": "\n\n# Recommended Feature Engineering Steps:\n" + recommended_steps.content.strip(),
+            "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Feature Engineering Steps:"),
             "all_datasets_summary": all_datasets_summary_str
         }
-    def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "create_feature_engineering_code"]]:
-        return node_func_human_review(
-            state=state,
-            prompt_text="Is the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}",
-            yes_goto="create_feature_engineering_code",
-            no_goto="recommend_feature_engineering_steps",
-            user_instructions_key="user_instructions",
-            recommended_steps_key="recommended_steps"
-        )
+    # Human Review
+    prompt_text_human_review = "Are the following feature engineering instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
+    if not bypass_explain_code:
+        def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "explain_feature_engineering_code"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= 'explain_feature_engineering_code',
+                no_goto="recommend_feature_engineering_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="feature_engineer_function",
+            )
+    else:
+        def human_review(state: GraphState) -> Command[Literal["recommend_feature_engineering_steps", "__end__"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= '__end__',
+                no_goto="recommend_feature_engineering_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="feature_engineer_function",
+            )
     def create_feature_engineering_code(state: GraphState):
         if bypass_recommended_steps:
-            print("---FEATURE ENGINEERING AGENT----")
+            print(format_agent_name(AGENT_NAME))
+            data_raw = state.get("data_raw")
+            df = pd.DataFrame.from_dict(data_raw)
+            all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
+            all_datasets_summary_str = "\n\n".join(all_datasets_summary)
+        else:
+            all_datasets_summary_str = state.get("all_datasets_summary")
         print("    * CREATE FEATURE ENGINEERING CODE")
         feature_engineering_prompt = PromptTemplate(
             template="""
-            You are a Feature Engineering Agent. Your job is to create a feature_engineer() function that can be run on the data provided using the following recommended steps.
+            You are a Feature Engineering Agent. Your job is to create a {function_name}() function that can be run on the data provided using the following recommended steps.
             Recommended Steps:
             {recommended_steps}
@@ -237,11 +664,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             You can use Pandas, Numpy, and Scikit Learn libraries to feature engineer the data.
-            Return Python code in ```python``` format with a single function definition, feature_engineer(data_raw), including all imports inside the function.
+            Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), including all imports inside the function.
             Return code to provide the feature engineering function:
-            def feature_engineer(data_raw):
+            def {function_name}(data_raw):
                 import pandas as pd
                 import numpy as np
                 ...
@@ -264,7 +691,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             """,
-            input_variables=["recommeded_steps", "target_variable", "all_datasets_summary"]
+            input_variables=["recommeded_steps", "target_variable", "all_datasets_summary", "function_name"]
         )
         feature_engineering_agent = feature_engineering_prompt | llm | PythonOutputParser()
@@ -272,16 +699,17 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         response = feature_engineering_agent.invoke({
             "recommended_steps": state.get("recommended_steps"),
             "target_variable": state.get("target_variable"),
-            "all_datasets_summary": state.get("all_datasets_summary"),
+            "all_datasets_summary": all_datasets_summary_str,
+            "function_name": function_name
         })
         response = relocate_imports_inside_function(response)
         response = add_comments_to_top(response, agent_name=AGENT_NAME)
         # For logging: store the code generated
-        file_path, file_name = log_ai_function(
+        file_path, file_name_2 = log_ai_function(
             response=response,
-            file_name="feature_engineer.py",
+            file_name=file_name,
             log=log,
             log_path=log_path,
             overwrite=overwrite
@@ -290,11 +718,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         return {
             "feature_engineer_function": response,
             "feature_engineer_function_path": file_path,
-            "feature_engineer_function_name": file_name
+            "feature_engineer_file_name": file_name_2,
+            "feature_engineer_function_name": function_name,
+            "all_datasets_summary": all_datasets_summary_str
         }
     def execute_feature_engineering_code(state):
         return node_func_execute_agent_code_on_data(
             state=state,
@@ -302,7 +730,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             result_key="data_engineered",
             error_key="feature_engineer_error",
             code_snippet_key="feature_engineer_function",
-            agent_function_name="feature_engineer",
+            agent_function_name=state.get("feature_engineer_function_name"),
             pre_processing=lambda data: pd.DataFrame.from_dict(data),
             post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during feature engineering: "
@@ -310,11 +738,13 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
     def fix_feature_engineering_code(state: GraphState):
         feature_engineer_prompt = """
-        You are a Feature Engineering Agent. Your job is to fix the feature_engineer() function that currently contains errors.
+        You are a Feature Engineering Agent. Your job is to fix the {function_name}() function that currently contains errors.
+        Provide only the corrected function definition for {function_name}().
-        Provide only the corrected function definition.
+        Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
-        Broken code:
+        This is the broken code (please fix):
         {code_snippet}
         Last Known Error:
@@ -330,6 +760,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
             agent_name=AGENT_NAME,
             log=log,
             file_path=state.get("feature_engineer_function_path"),
+            function_name=state.get("feature_engineer_function_name"),
         )
     def explain_feature_engineering_code(state: GraphState):
@@ -366,9 +797,11 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         fix_code_node_name="fix_feature_engineering_code",
         explain_code_node_name="explain_feature_engineering_code",
         error_key="feature_engineer_error",
+        max_retries_key = "max_retries",
+        retry_count_key = "retry_count",
         human_in_the_loop=human_in_the_loop,
         human_review_node_name="human_review",
-        checkpointer=MemorySaver() if human_in_the_loop else None,
+        checkpointer=MemorySaver(),
         bypass_recommended_steps=bypass_recommended_steps,
         bypass_explain_code=bypass_explain_code,
     )

ai-data-science-team 0.0.0.9006__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

ai-data-science-team 0.0.0.9006__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl