PyPI - ai-data-science-team - Versions diffs - 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl - Mend

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

ai_data_science_team/agents/data_wrangling_agent.py CHANGED Viewed

@@ -4,11 +4,11 @@
 # * Agents: Data Wrangling Agent
 # Libraries
-from typing import TypedDict, Annotated, Sequence, Literal, Union
+from typing import TypedDict, Annotated, Sequence, Literal, Union, Optional
 import operator
 import os
-import io
 import pandas as pd
+from IPython.display import Markdown
 from langchain.prompts import PromptTemplate
 from langchain_core.messages import BaseMessage
@@ -20,10 +20,11 @@ from ai_data_science_team.templates import(
     node_func_human_review,
     node_func_fix_agent_code,
     node_func_explain_agent_code,
-    create_coding_agent_graph
+    create_coding_agent_graph,
+    BaseAgent,
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
+from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name, format_recommended_steps
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function
@@ -31,13 +32,414 @@ from ai_data_science_team.tools.logging import log_ai_function
 AGENT_NAME = "data_wrangling_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")
+# Class
+class DataWranglingAgent(BaseAgent):
+    """
+    Creates a data wrangling agent that can work with one or more datasets, performing operations such as
+    joining/merging multiple datasets, reshaping, aggregating, encoding, creating computed features,
+    and ensuring consistent data types. The agent generates a Python function to wrangle the data,
+    executes the function, and logs the process (if enabled).
+    The agent can handle:
+    - A single dataset (provided as a pandas DataFrame or a dictionary of {column: list_of_values})
+    - Multiple datasets (provided as a list of DataFrames and/or such dictionaries)
+    Key wrangling steps can include:
+    - Merging or joining datasets
+    - Pivoting/melting data for reshaping
+    - GroupBy aggregations (sums, means, counts, etc.)
+    - Encoding categorical variables
+    - Computing new columns from existing ones
+    - Dropping or rearranging columns
+    - Any additional user instructions
+    Parameters
+    ----------
+    model : langchain.llms.base.LLM
+        The language model used to generate the data wrangling function.
+    n_samples : int, optional
+        Number of samples to show in the data summary for wrangling. Defaults to 30.
+    log : bool, optional
+        Whether to log the generated code and errors. Defaults to False.
+    log_path : str, optional
+        Directory path for storing log files. Defaults to None.
+    file_name : str, optional
+        Name of the file for saving the generated response. Defaults to "data_wrangler.py".
+    function_name : str, optional
+        Name of the function to be generated. Defaults to "data_wrangler".
+    overwrite : bool, optional
+        Whether to overwrite the log file if it exists. If False, a unique file name is created. Defaults to True.
+    human_in_the_loop : bool, optional
+        Enables user review of data wrangling instructions. Defaults to False.
+    bypass_recommended_steps : bool, optional
+        If True, skips the step that generates recommended data wrangling steps. Defaults to False.
+    bypass_explain_code : bool, optional
+        If True, skips the step that provides code explanations. Defaults to False.
+    Methods
+    -------
+    update_params(**kwargs)
+        Updates the agent's parameters and rebuilds the compiled state graph.
+    ainvoke_agent(data_raw: Union[pd.DataFrame, dict, list], user_instructions: str=None, max_retries=3, retry_count=0)
+        Asynchronously wrangles the provided dataset(s) based on user instructions (must be awaited).
+    invoke_agent(data_raw: Union[pd.DataFrame, dict, list], user_instructions: str=None, max_retries=3, retry_count=0)
+        Synchronously wrangles the provided dataset(s) based on user instructions.
+    explain_wrangling_steps()
+        Returns an explanation of the wrangling steps performed by the agent.
+    get_log_summary()
+        Retrieves a summary of logged operations if logging is enabled.
+    get_data_wrangled()
+        Retrieves the final wrangled dataset (as a pandas DataFrame).
+    get_data_raw()
+        Retrieves the raw dataset(s).
+    get_data_wrangler_function()
+        Retrieves the generated Python function used for data wrangling.
+    get_recommended_wrangling_steps()
+        Retrieves the agent's recommended wrangling steps.
+    get_response()
+        Returns the full response dictionary from the agent.
+    show()
+        Displays the agent's mermaid diagram for visual inspection of the compiled graph.
+    Examples
+    --------
+    ```python
+    import pandas as pd
+    from langchain_openai import ChatOpenAI
+    from ai_data_science_team.agents import DataWranglingAgent
+    # Single dataset example
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    data_wrangling_agent = DataWranglingAgent(
+        model=llm,
+        n_samples=30,
+        log=True,
+        log_path="logs",
+        human_in_the_loop=True
+    )
+    df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
+    data_wrangling_agent.invoke_agent(
+        user_instructions="Group by 'gender' and compute mean of 'tenure'.",
+        data_raw=df,  # data_raw can be df.to_dict() or just a DataFrame
+        max_retries=3,
+        retry_count=0
+    )
+    data_wrangled = data_wrangling_agent.get_data_wrangled()
+    response = data_wrangling_agent.get_response()
+    # Multiple dataset example (list of DataFrames or dicts)
+    df1 = pd.DataFrame({'id': [1,2,3], 'val1': [10,20,30]})
+    df2 = pd.DataFrame({'id': [1,2,3], 'val2': [40,50,60]})
+    data_wrangling_agent.invoke_agent(
+        user_instructions="Merge these two datasets on 'id' and compute a new column 'val_sum' = val1+val2",
+        data_raw=[df1, df2],   # multiple datasets
+        max_retries=3,
+        retry_count=0
+    )
+    data_wrangled = data_wrangling_agent.get_data_wrangled()
+    ```
+    Returns
+    -------
+    DataWranglingAgent : langchain.graphs.CompiledStateGraph
+        A data wrangling agent implemented as a compiled state graph.
+    """
+    def __init__(
+        self,
+        model,
+        n_samples=30,
+        log=False,
+        log_path=None,
+        file_name="data_wrangler.py",
+        function_name="data_wrangler",
+        overwrite=True,
+        human_in_the_loop=False,
+        bypass_recommended_steps=False,
+        bypass_explain_code=False
+    ):
+        # The parameters are kept in a dict so update_params() can patch them
+        # and rebuild the graph with the same keyword names.
+        self._params = {
+            "model": model,
+            "n_samples": n_samples,
+            "log": log,
+            "log_path": log_path,
+            "file_name": file_name,
+            "function_name": function_name,
+            "overwrite": overwrite,
+            "human_in_the_loop": human_in_the_loop,
+            "bypass_recommended_steps": bypass_recommended_steps,
+            "bypass_explain_code": bypass_explain_code
+        }
+        self._compiled_graph = self._make_compiled_graph()
+        self.response = None
+    def _make_compiled_graph(self):
+        """
+        Create the compiled graph for the data wrangling agent.
+        Running this method will reset the response to None.
+        """
+        self.response = None
+        return make_data_wrangling_agent(**self._params)
+    def update_params(self, **kwargs):
+        """
+        Updates the agent's parameters and rebuilds the compiled graph.
+        Parameters
+        ----------
+        **kwargs
+            Parameter names and new values; they are merged into the stored
+            parameters and forwarded to make_data_wrangling_agent().
+        """
+        self._params.update(kwargs)
+        self._compiled_graph = self._make_compiled_graph()
+    async def ainvoke_agent(
+        self,
+        data_raw: Union[pd.DataFrame, dict, list],
+        user_instructions: Optional[str]=None,
+        max_retries:int=3,
+        retry_count:int=0,
+        **kwargs
+    ):
+        """
+        Asynchronously wrangles the provided dataset(s) based on user instructions.
+        The response is stored in the 'response' attribute.
+        This is a coroutine and must be awaited: `await agent.ainvoke_agent(...)`.
+        Parameters
+        ----------
+        data_raw : Union[pd.DataFrame, dict, list]
+            The raw dataset(s) to be wrangled.
+            Can be a single DataFrame, a single dict ({col: list_of_values}),
+              or a list of DataFrames/dicts if multiple datasets are provided.
+        user_instructions : str, optional
+            Instructions for data wrangling.
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to ainvoke().
+        Returns
+        -------
+        None
+        """
+        data_input = self._convert_data_input(data_raw)
+        # ainvoke() returns a coroutine; it has to be awaited so that
+        # self.response holds the result dict rather than the coroutine object
+        # (the getters below index into self.response).
+        response = await self._compiled_graph.ainvoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_input,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def invoke_agent(
+        self,
+        data_raw: Union[pd.DataFrame, dict, list],
+        user_instructions: Optional[str]=None,
+        max_retries:int=3,
+        retry_count:int=0,
+        **kwargs
+    ):
+        """
+        Synchronously wrangles the provided dataset(s) based on user instructions.
+        The response is stored in the 'response' attribute.
+        Parameters
+        ----------
+        data_raw : Union[pd.DataFrame, dict, list]
+            The raw dataset(s) to be wrangled.
+            Can be a single DataFrame, a single dict, or a list of DataFrames/dicts.
+        user_instructions : str, optional
+            Instructions for data wrangling agent.
+        max_retries : int
+            Maximum retry attempts.
+        retry_count : int
+            Current retry attempt count.
+        **kwargs
+            Additional keyword arguments to pass to invoke().
+        Returns
+        -------
+        None
+        """
+        data_input = self._convert_data_input(data_raw)
+        response = self._compiled_graph.invoke({
+            "user_instructions": user_instructions,
+            "data_raw": data_input,
+            "max_retries": max_retries,
+            "retry_count": retry_count
+        }, **kwargs)
+        self.response = response
+        return None
+    def explain_wrangling_steps(self):
+        """
+        Provides an explanation of the wrangling steps performed by the agent.
+        Returns
+        -------
+        str or list
+            The "messages" entry of the last response, or an empty list if
+            the agent has not produced a response yet.
+        """
+        if self.response:
+            return self.response.get("messages", [])
+        return []
+    def get_log_summary(self, markdown=False):
+        """
+        Returns a summary of the agent's logged operations, if logging is enabled.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the summary wrapped in an IPython Markdown object.
+        Returns
+        -------
+        str, Markdown or None
+            The log details, or None if not available.
+        """
+        if self.response and self.response.get("data_wrangler_function_path"):
+            log_details = f"Log Path: {self.response.get('data_wrangler_function_path')}"
+            if markdown:
+                return Markdown(log_details)
+            else:
+                return log_details
+        return None
+    def get_data_wrangled(self) -> Optional[pd.DataFrame]:
+        """
+        Retrieves the wrangled data after running invoke_agent() or ainvoke_agent().
+        Returns
+        -------
+        pd.DataFrame or None
+            The wrangled dataset as a pandas DataFrame (if available).
+        """
+        if self.response and "data_wrangled" in self.response:
+            return pd.DataFrame(self.response["data_wrangled"])
+        return None
+    def get_data_raw(self) -> Union[dict, list, None]:
+        """
+        Retrieves the original raw data from the last invocation.
+        Returns
+        -------
+        Union[dict, list, None]
+            The original dataset(s) as a single dict or a list of dicts, or None if not available.
+        """
+        if self.response and "data_raw" in self.response:
+            return self.response["data_raw"]
+        return None
+    def get_data_wrangler_function(self, markdown=False) -> Union[str, Markdown, None]:
+        """
+        Retrieves the generated data wrangling function code.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the function in Markdown code block format.
+        Returns
+        -------
+        str, Markdown or None
+            The Python function code, or None if not available.
+        """
+        if self.response and "data_wrangler_function" in self.response:
+            code = self.response["data_wrangler_function"]
+            if markdown:
+                return Markdown(f"```python\n{code}\n```")
+            return code
+        return None
+    def get_recommended_wrangling_steps(self, markdown=False) -> Union[str, Markdown, None]:
+        """
+        Retrieves the agent's recommended data wrangling steps.
+        Parameters
+        ----------
+        markdown : bool, optional
+            If True, returns the steps in Markdown format.
+        Returns
+        -------
+        str, Markdown or None
+            The recommended steps, or None if not available.
+        """
+        if self.response and "recommended_steps" in self.response:
+            steps = self.response["recommended_steps"]
+            if markdown:
+                return Markdown(steps)
+            return steps
+        return None
+    @staticmethod
+    def _convert_data_input(data_raw: Union[pd.DataFrame, dict, list]) -> Union[dict, list]:
+        """
+        Internal utility to convert data_raw (which could be a DataFrame, dict, or list of
+        DataFrames/dicts) into the format expected by the underlying agent (dict or list of dicts).
+        Parameters
+        ----------
+        data_raw : Union[pd.DataFrame, dict, list]
+            The raw input data to be converted.
+        Returns
+        -------
+        Union[dict, list]
+            The data in a dictionary or list-of-dictionaries format.
+        Raises
+        ------
+        ValueError
+            If data_raw is not a DataFrame, dict, or list, or if a list item
+            is neither a DataFrame nor a dict.
+        """
+        # If a single DataFrame, convert to dict
+        if isinstance(data_raw, pd.DataFrame):
+            return data_raw.to_dict()
+        # If it's already a dict (single dataset)
+        if isinstance(data_raw, dict):
+            return data_raw
+        # If it's a list, convert DataFrame items and pass dict items through
+        if isinstance(data_raw, list):
+            converted_list = []
+            for item in data_raw:
+                if isinstance(item, pd.DataFrame):
+                    converted_list.append(item.to_dict())
+                elif isinstance(item, dict):
+                    converted_list.append(item)
+                else:
+                    raise ValueError("List must contain only DataFrames or dictionaries.")
+            return converted_list
+        raise ValueError("data_raw must be a DataFrame, a dict, or a list of dicts/DataFrames.")
+# Function
 def make_data_wrangling_agent(
     model,
     n_samples=30,
     log=False,
     log_path=None,
     file_name="data_wrangler.py",
-    overwrite = True,
+    function_name="data_wrangler",
+    overwrite=True,
     human_in_the_loop=False,
     bypass_recommended_steps=False,
     bypass_explain_code=False
@@ -73,6 +475,8 @@ def make_data_wrangling_agent(
         The path to the directory where the log files should be stored. Defaults to "logs/".
     file_name : str, optional
         The name of the file to save the response to. Defaults to "data_wrangler.py".
+    function_name : str, optional
+        The name of the function to be generated. Defaults to "data_wrangler".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
@@ -115,6 +519,11 @@ def make_data_wrangling_agent(
     """
     llm = model
+    # Human in the loop requires recommended steps
+    if bypass_recommended_steps and human_in_the_loop:
+        bypass_recommended_steps = False
+        print("Bypass recommended steps set to False to enable human in the loop.")
     # Setup Log Directory
     if log:
         if log_path is None:
@@ -205,7 +614,7 @@ def make_data_wrangling_agent(
         })
         return {
-            "recommended_steps": "\n\n# Recommended Wrangling Steps:\n" + recommended_steps.content.strip(),
+            "recommended_steps": format_recommended_steps(recommended_steps.content.strip(), heading="# Recommended Data Wrangling Steps:"),
             "all_datasets_summary": all_datasets_summary_str,
         }
@@ -244,7 +653,7 @@ def make_data_wrangling_agent(
         data_wrangling_prompt = PromptTemplate(
             template="""
-            You are a Data Wrangling Coding Agent. Your job is to create a data_wrangler() function that can be run on the provided data.
+            You are a Data Wrangling Coding Agent. Your job is to create a {function_name}() function that can be run on the provided data.
             Follow these recommended steps:
             {recommended_steps}
@@ -254,10 +663,10 @@ def make_data_wrangling_agent(
             Below are summaries of all datasets provided. If more than one dataset is provided, you may need to merge or join them.:
             {all_datasets_summary}
-            Return Python code in ```python``` format with a single function definition, data_wrangler(), that includes all imports inside the function. And returns a single pandas data frame.
+            Return Python code in ```python``` format with a single function definition, {function_name}(), that includes all imports inside the function. And returns a single pandas data frame.
             ```python
-            def data_wrangler(data_list):
+            def {function_name}(data_list):
                 '''
                 Wrangle the data provided in data.
@@ -279,14 +688,15 @@ def make_data_wrangling_agent(
             """,
-            input_variables=["recommended_steps", "all_datasets_summary"]
+            input_variables=["recommended_steps", "all_datasets_summary", "function_name"]
         )
         data_wrangling_agent = data_wrangling_prompt | llm | PythonOutputParser()
         response = data_wrangling_agent.invoke({
             "recommended_steps": state.get("recommended_steps"),
-            "all_datasets_summary": all_datasets_summary_str
+            "all_datasets_summary": all_datasets_summary_str,
+            "function_name": function_name
         })
         response = relocate_imports_inside_function(response)
@@ -304,7 +714,8 @@ def make_data_wrangling_agent(
         return {
             "data_wrangler_function" : response,
             "data_wrangler_function_path": file_path,
-            "data_wrangler_function_name": file_name_2,
+            "data_wrangler_file_name": file_name_2,
+            "data_wrangler_function_name": function_name,
             "all_datasets_summary": all_datasets_summary_str
         }
@@ -318,6 +729,33 @@ def make_data_wrangling_agent(
             user_instructions_key="user_instructions",
             recommended_steps_key="recommended_steps"
         )
+    # Human Review
+    prompt_text_human_review = "Are the following data wrangling instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
+    if not bypass_explain_code:
+        def human_review(state: GraphState) -> Command[Literal["recommend_wrangling_steps", "explain_data_wrangler_code"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= 'explain_data_wrangler_code',
+                no_goto="recommend_wrangling_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="data_wrangler_function",
+            )
+    else:
+        def human_review(state: GraphState) -> Command[Literal["recommend_wrangling_steps", "__end__"]]:
+            return node_func_human_review(
+                state=state,
+                prompt_text=prompt_text_human_review,
+                yes_goto= '__end__',
+                no_goto="recommend_wrangling_steps",
+                user_instructions_key="user_instructions",
+                recommended_steps_key="recommended_steps",
+                code_snippet_key="data_wrangler_function",
+            )
     def execute_data_wrangler_code(state: GraphState):
         return node_func_execute_agent_code_on_data(
@@ -326,7 +764,7 @@ def make_data_wrangling_agent(
             result_key="data_wrangled",
             error_key="data_wrangler_error",
             code_snippet_key="data_wrangler_function",
-            agent_function_name="data_wrangler",
+            agent_function_name=state.get("data_wrangler_function_name"),
             # pre_processing=pre_processing,
             post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
             error_message_prefix="An error occurred during data wrangling: "
@@ -334,11 +772,11 @@ def make_data_wrangling_agent(
     def fix_data_wrangler_code(state: GraphState):
         data_wrangler_prompt = """
-        You are a Data Wrangling Agent. Your job is to create a data_wrangler() function that can be run on the data provided. The function is currently broken and needs to be fixed.
+        You are a Data Wrangling Agent. Your job is to create a {function_name}() function that can be run on the data provided. The function is currently broken and needs to be fixed.
-        Make sure to only return the function definition for data_wrangler().
+        Make sure to only return the function definition for {function_name}().
-        Return Python code in ```python``` format with a single function definition, data_wrangler(data_raw), that includes all imports inside the function.
+        Return Python code in ```python``` format with a single function definition, {function_name}(data_raw), that includes all imports inside the function.
         This is the broken code (please fix):
         {code_snippet}
@@ -356,6 +794,7 @@ def make_data_wrangling_agent(
             agent_name=AGENT_NAME,
             log=log,
             file_path=state.get("data_wrangler_function_path"),
+            function_name=state.get("data_wrangler_function_name"),
         )
     def explain_data_wrangler_code(state: GraphState):

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl

ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9008__py3-none-any.whl