PyPI - ai-data-science-team - Versions diffs - 0.0.0.9008__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl - Mend

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

ai_data_science_team/multiagents/sql_data_analyst.py CHANGED Viewed

@@ -7,18 +7,19 @@ from langgraph.graph import START, END, StateGraph
 from langgraph.graph.state import CompiledStateGraph
 from langgraph.types import Command
-from typing import TypedDict, Annotated, Sequence
+from typing import TypedDict, Annotated, Sequence, Literal
 import operator
-from typing_extensions import TypedDict, Literal
+from typing_extensions import TypedDict
 import pandas as pd
+import json
 from IPython.display import Markdown
 from ai_data_science_team.templates import BaseAgent
 from ai_data_science_team.agents import SQLDatabaseAgent, DataVisualizationAgent
 from ai_data_science_team.utils.plotly import plotly_from_dict
+from ai_data_science_team.tools.regex import remove_consecutive_duplicates, get_generic_summary
 class SQLDataAnalyst(BaseAgent):
@@ -90,7 +91,7 @@ class SQLDataAnalyst(BaseAgent):
             self._params[k] = v
         self._compiled_graph = self._make_compiled_graph()
-    def ainvoke_agent(self, user_instructions, **kwargs):
+    def ainvoke_agent(self, user_instructions, max_retries:int=3, retry_count:int=0, **kwargs):
         """
         Asynchronosly nvokes the SQL Data Analyst Multi-Agent.
@@ -108,15 +109,53 @@ class SQLDataAnalyst(BaseAgent):
         Example:
         --------
         ``` python
-        # TODO
+        from langchain_openai import ChatOpenAI
+        import sqlalchemy as sql
+        from ai_data_science_team.multiagents import SQLDataAnalyst
+        from ai_data_science_team.agents import SQLDatabaseAgent, DataVisualizationAgent
+        llm = ChatOpenAI(model = "gpt-4o-mini")
+        sql_engine = sql.create_engine("sqlite:///data/northwind.db")
+        conn = sql_engine.connect()
+        sql_data_analyst = SQLDataAnalyst(
+            model = llm,
+            sql_database_agent = SQLDatabaseAgent(
+                model = llm,
+                connection = conn,
+                n_samples = 1,
+            ),
+            data_visualization_agent = DataVisualizationAgent(
+                model = llm,
+                n_samples = 10,
+            )
+        )
+        sql_data_analyst.ainvoke_agent(
+            user_instructions = "Make a plot of sales revenue by month by territory. Make a dropdown for the user to select the territory.",
+        )
+        sql_data_analyst.get_sql_query_code()
+        sql_data_analyst.get_data_sql()
+        sql_data_analyst.get_plotly_graph()
         ```
         """
         response = self._compiled_graph.ainvoke({
             "user_instructions": user_instructions,
+            "max_retries": max_retries,
+            "retry_count": retry_count,
         }, **kwargs)
+        if response.get("messages"):
+            response["messages"] = remove_consecutive_duplicates(response["messages"])
         self.response = response
-    def invoke_agent(self, user_instructions, **kwargs):
+    def invoke_agent(self, user_instructions, max_retries:int=3, retry_count:int=0, **kwargs):
         """
         Invokes the SQL Data Analyst Multi-Agent.
@@ -124,6 +163,10 @@ class SQLDataAnalyst(BaseAgent):
         ----------
         user_instructions: str
             The user's instructions for the combined SQL and (optionally) Data Visualization agents.
+        max_retries (int):
+                Maximum retry attempts for cleaning.
+        retry_count (int):
+            Current retry attempt.
         **kwargs:
             Additional keyword arguments to pass to the compiled graph's `invoke` method.
@@ -134,14 +177,53 @@ class SQLDataAnalyst(BaseAgent):
         Example:
         --------
         ``` python
-        # TODO
+        from langchain_openai import ChatOpenAI
+        import sqlalchemy as sql
+        from ai_data_science_team.multiagents import SQLDataAnalyst
+        from ai_data_science_team.agents import SQLDatabaseAgent, DataVisualizationAgent
+        llm = ChatOpenAI(model = "gpt-4o-mini")
+        sql_engine = sql.create_engine("sqlite:///data/northwind.db")
+        conn = sql_engine.connect()
+        sql_data_analyst = SQLDataAnalyst(
+            model = llm,
+            sql_database_agent = SQLDatabaseAgent(
+                model = llm,
+                connection = conn,
+                n_samples = 1,
+            ),
+            data_visualization_agent = DataVisualizationAgent(
+                model = llm,
+                n_samples = 10,
+            )
+        )
+        sql_data_analyst.invoke_agent(
+            user_instructions = "Make a plot of sales revenue by month by territory. Make a dropdown for the user to select the territory.",
+        )
+        sql_data_analyst.get_sql_query_code()
+        sql_data_analyst.get_data_sql()
+        sql_data_analyst.get_plotly_graph()
         ```
         """
         response = self._compiled_graph.invoke({
             "user_instructions": user_instructions,
+            "max_retries": max_retries,
+            "retry_count": retry_count,
         }, **kwargs)
+        if response.get("messages"):
+            response["messages"] = remove_consecutive_duplicates(response["messages"])
         self.response = response
     def get_data_sql(self):
         """
         Returns the SQL data as a Pandas DataFrame.
@@ -205,6 +287,34 @@ class SQLDataAnalyst(BaseAgent):
                 if markdown:
                     return Markdown(f"```python\n{self.response.get('data_visualization_function')}\n```")
                 return self.response.get("data_visualization_function")
+    def get_workflow_summary(self, markdown=False):
+        """
+        Returns a summary of the SQL Data Analyst workflow.
+        Parameters:
+        ----------
+        markdown: bool
+            If True, returns the summary as a Markdown-formatted string.
+        """
+        if self.response and self.get_response()['messages']:
+            agents = [self.get_response()['messages'][i].role for i in range(len(self.get_response()['messages']))]
+            agent_labels = []
+            for i in range(len(agents)):
+                agent_labels.append(f"- **Agent {i+1}:** {agents[i]}")
+            # Construct header
+            header = f"# SQL Data Analyst Workflow Summary Report\n\nThis agentic workflow contains {len(agents)} agents:\n\n" + "\n".join(agent_labels)
+            reports = []
+            for msg in self.get_response()['messages']:
+                reports.append(get_generic_summary(json.loads(msg.content)))
+            if markdown:
+                return Markdown(header + "\n\n".join(reports))
+            return "\n\n".join(reports)
@@ -250,6 +360,8 @@ def make_sql_data_analyst(
         plot_required: bool
         data_visualization_function: str
         plotly_graph: dict
+        max_retries: int
+        retry_count: int
     def route_to_visualization(state) -> Command[Literal["data_visualization_agent", "__end__"]]:

ai_data_science_team/templates/__init__.py CHANGED Viewed

@@ -3,6 +3,7 @@ from ai_data_science_team.templates.agent_templates import(
     node_func_human_review,
     node_func_fix_agent_code,
     node_func_explain_agent_code,
+    node_func_report_agent_outputs,
     node_func_execute_agent_from_sql_connection,
     create_coding_agent_graph,
     BaseAgent,

ai_data_science_team/templates/agent_templates.py CHANGED Viewed

@@ -8,11 +8,16 @@ from langgraph.pregel.types import StreamMode
 import pandas as pd
 import sqlalchemy as sql
+import json
-from typing import Any, Callable, Dict, Type, Optional, Union
+from typing import Any, Callable, Dict, Type, Optional, Union, List
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.regex import (
+    relocate_imports_inside_function,
+    add_comments_to_top,
+    remove_consecutive_duplicates
+)
 from IPython.display import Image, display
 import pandas as pd
@@ -82,6 +87,10 @@ class BaseAgent(CompiledStateGraph):
             Any: The agent's response.
         """
         self.response = self._compiled_graph.invoke(input=input, config=config,**kwargs)
+        if self.response.get("messages"):
+            self.response["messages"] = remove_consecutive_duplicates(self.response["messages"])
         return self.response
     def ainvoke(
@@ -102,6 +111,10 @@ class BaseAgent(CompiledStateGraph):
             Any: The agent's response.
         """
         self.response = self._compiled_graph.ainvoke(input=input, config=config,**kwargs)
+        if self.response.get("messages"):
+            self.response["messages"] = remove_consecutive_duplicates(self.response["messages"])
         return self.response
     def stream(
@@ -129,6 +142,10 @@ class BaseAgent(CompiledStateGraph):
             Any: The agent's response.
         """
         self.response = self._compiled_graph.stream(input=input, config=config, stream_mode=stream_mode, **kwargs)
+        if self.response.get("messages"):
+            self.response["messages"] = remove_consecutive_duplicates(self.response["messages"])
         return self.response
     def astream(
@@ -156,6 +173,10 @@ class BaseAgent(CompiledStateGraph):
             Any: The agent's response.
         """
         self.response = self._compiled_graph.astream(input=input, config=config, stream_mode=stream_mode, **kwargs)
+        if self.response.get("messages"):
+            self.response["messages"] = remove_consecutive_duplicates(self.response["messages"])
         return self.response
     def get_state_keys(self):
@@ -183,6 +204,9 @@ class BaseAgent(CompiledStateGraph):
         Returns:
             Any: The agent's response.
         """
+        if self.response.get("messages"):
+            self.response["messages"] = remove_consecutive_duplicates(self.response["messages"])
         return self.response
     def show(self, xray: int = 0):
@@ -729,3 +753,50 @@ def node_func_explain_agent_code(
         # Return an error message if there was a problem with the code
         message = AIMessage(content=error_message)
         return {result_key: [message]}
+def node_func_report_agent_outputs(
+    state: Dict[str, Any],
+    keys_to_include: List[str],
+    result_key: str,
+    role: str,
+    custom_title: str = "Agent Output Summary"
+) -> Dict[str, Any]:
+    """
+    Gathers relevant data directly from the state (filtered by `keys_to_include`)
+    and returns them as a structured message in `state[result_key]`.
+    No LLM is used.
+    Parameters
+    ----------
+    state : Dict[str, Any]
+        The current state dictionary holding all agent variables.
+    keys_to_include : List[str]
+        The list of keys in `state` to include in the output.
+    result_key : str
+        The key in `state` under which we'll store the final structured message.
+    role : str
+        The role that will be used in the final AIMessage (e.g., "DataCleaningAgent").
+    custom_title : str, optional
+        A title or heading for your report. Defaults to "Agent Output Summary".
+    """
+    print("    * REPORT AGENT OUTPUTS")
+    final_report = {"report_title": custom_title}
+    for key in keys_to_include:
+        final_report[key] = state.get(key, f"<{key}_not_found_in_state>")
+    # Wrap it in a list of messages (like the current "messages" pattern).
+    # You can serialize this dictionary as JSON or just cast it to string.
+    return {
+        result_key: [
+            AIMessage(
+                content=json.dumps(final_report, indent=2),
+                role=role
+            )
+        ]
+    }

ai_data_science_team/tools/metadata.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import io
 import pandas as pd
 import sqlalchemy as sql
+from sqlalchemy import inspect
 from typing import Union, List, Dict
 def get_dataframe_summary(
@@ -139,8 +140,7 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_
-def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
-                          n_samples: int = 10) -> str:
+def get_database_metadata(connection, n_samples=10) -> dict:
     """
     Collects metadata and sample data from a database, with safe identifier quoting and
     basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
@@ -154,77 +154,109 @@ def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engi
     Returns
     -------
-    str
-        A formatted string with database metadata, including some sample data from each column.
+    dict
+        A dictionary with database metadata, including some sample data from each column.
     """
-    # If a connection is passed, use it; if an engine is passed, connect to it
     is_engine = isinstance(connection, sql.engine.base.Engine)
     conn = connection.connect() if is_engine else connection
-    output = []
+    metadata = {
+        "dialect": None,
+        "driver": None,
+        "connection_url": None,
+        "schemas": [],
+    }
     try:
-        # Grab the engine off the connection
         sql_engine = conn.engine
         dialect_name = sql_engine.dialect.name.lower()
-        output.append(f"Database Dialect: {sql_engine.dialect.name}")
-        output.append(f"Driver: {sql_engine.driver}")
-        output.append(f"Connection URL: {sql_engine.url}")
-        # Inspect the database
-        inspector = sql.inspect(sql_engine)
-        tables = inspector.get_table_names()
-        output.append(f"Tables: {tables}")
-        output.append(f"Schemas: {inspector.get_schema_names()}")
-        # Helper to build a dialect-specific limit clause
-        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
-            """
-            Returns a SQL query string to select N rows from the given column/table
-            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
-            """
-            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
-                # Common dialects supporting LIMIT
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-            elif "mssql" in dialect_name:
-                # Microsoft SQL Server syntax
-                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
-            elif "oracle" in dialect_name:
-                # Oracle syntax
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
-            else:
-                # Fallback
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-        # Prepare for quoting
-        preparer = inspector.bind.dialect.identifier_preparer
-        # For each table, get columns and sample data
-        for table_name in tables:
-            output.append(f"\nTable: {table_name}")
-            # Properly quote the table name
-            table_name_quoted = preparer.quote_identifier(table_name)
-            for column in inspector.get_columns(table_name):
-                col_name = column["name"]
-                col_type = column["type"]
-                output.append(f"  Column: {col_name} Type: {col_type}")
+        metadata["dialect"] = sql_engine.dialect.name
+        metadata["driver"] = sql_engine.driver
+        metadata["connection_url"] = str(sql_engine.url)
-                # Properly quote the column name
-                col_name_quoted = preparer.quote_identifier(col_name)
-                # Build a dialect-aware query with safe quoting
-                query = build_query(col_name_quoted, table_name_quoted, n_samples)
-                # Read a few sample values
-                df = pd.read_sql(sql.text(query), conn)
-                first_values = df[col_name].tolist()
-                output.append(f"    First {n_samples} Values: {first_values}")
+        inspector = inspect(sql_engine)
+        preparer = inspector.bind.dialect.identifier_preparer
+        # For each schema
+        for schema_name in inspector.get_schema_names():
+            schema_obj = {
+                "schema_name": schema_name,
+                "tables": []
+            }
+            tables = inspector.get_table_names(schema=schema_name)
+            for table_name in tables:
+                table_info = {
+                    "table_name": table_name,
+                    "columns": [],
+                    "primary_key": [],
+                    "foreign_keys": [],
+                    "indexes": []
+                }
+                # Get columns
+                columns = inspector.get_columns(table_name, schema=schema_name)
+                for col in columns:
+                    col_name = col["name"]
+                    col_type = str(col["type"])
+                    table_name_quoted = f"{preparer.quote_identifier(schema_name)}.{preparer.quote_identifier(table_name)}"
+                    col_name_quoted = preparer.quote_identifier(col_name)
+                    # Build query for sample data
+                    query = build_query(col_name_quoted, table_name_quoted, n_samples, dialect_name)
+                    # Retrieve sample data
+                    try:
+                        df = pd.read_sql(query, conn)
+                        samples = df[col_name].head(n_samples).tolist()
+                    except Exception as e:
+                        samples = [f"Error retrieving data: {str(e)}"]
+                    table_info["columns"].append({
+                        "name": col_name,
+                        "type": col_type,
+                        "sample_values": samples
+                    })
+                # Primary keys
+                pk_constraint = inspector.get_pk_constraint(table_name, schema=schema_name)
+                table_info["primary_key"] = pk_constraint.get("constrained_columns", [])
+                # Foreign keys
+                fks = inspector.get_foreign_keys(table_name, schema=schema_name)
+                table_info["foreign_keys"] = [
+                    {
+                        "local_cols": fk["constrained_columns"],
+                        "referred_table": fk["referred_table"],
+                        "referred_cols": fk["referred_columns"]
+                    }
+                    for fk in fks
+                ]
+                # Indexes
+                idxs = inspector.get_indexes(table_name, schema=schema_name)
+                table_info["indexes"] = idxs
+                schema_obj["tables"].append(table_info)
+            metadata["schemas"].append(schema_obj)
     finally:
-        # Close connection if created inside the function
         if is_engine:
             conn.close()
-    return "\n".join(output)
+    return metadata
+def build_query(col_name_quoted: str, table_name_quoted: str, n: int, dialect_name: str) -> str:
+    # Example: expand your build_query to handle random sampling if possible
+    if "postgres" in dialect_name:
+        return f"SELECT {col_name_quoted} FROM {table_name_quoted} ORDER BY RANDOM() LIMIT {n}"
+    if "mysql" in dialect_name:
+        return f"SELECT {col_name_quoted} FROM {table_name_quoted} ORDER BY RAND() LIMIT {n}"
+    if "sqlite" in dialect_name:
+        return f"SELECT {col_name_quoted} FROM {table_name_quoted} ORDER BY RANDOM() LIMIT {n}"
+    if "mssql" in dialect_name:
+        return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted} ORDER BY NEWID()"
+    # Oracle or fallback
+    return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"

ai_data_science_team/tools/regex.py CHANGED Viewed

@@ -103,4 +103,62 @@ def format_recommended_steps(raw_text: str, heading: str = "# Recommended Steps:
     if not seen_heading:
         new_lines.insert(0, heading)
-    return "\n".join(new_lines)
+    return "\n".join(new_lines)
+def get_generic_summary(report_dict: dict, code_lang = "python") -> str:
+    """
+    Takes a dictionary of unknown structure (e.g., from json.loads(...))
+    and returns a textual summary. It assumes:
+      1) 'report_title' (if present) should be displayed first.
+      2) If a key includes 'code' or 'function',
+         the value is treated as a code block.
+      3) Otherwise, key-value pairs are displayed as text.
+    Parameters
+    ----------
+    report_dict : dict
+        The dictionary holding the agent output or user report.
+    Returns
+    -------
+    str
+        A formatted summary string.
+    """
+    # 1) Grab the report title (or default)
+    title = report_dict.get("report_title", "Untitled Report")
+    lines = []
+    lines.append(f"# {title}")
+    # 2) Iterate over all other keys
+    for key, value in report_dict.items():
+        # Skip the title key, since we already displayed it
+        if key == "report_title":
+            continue
+        # 3) Check if it's code or function
+        # (You can tweak this logic if you have different rules)
+        key_lower = key.lower()
+        if "code" in key_lower or "function" in key_lower:
+            # Treat as code
+            lines.append(f"\n## {format_agent_name(key).upper()}")
+            lines.append(f"```{code_lang}\n" + str(value) + "\n```")
+        else:
+            # 4) Otherwise, just display the key-value as text
+            lines.append(f"\n## {format_agent_name(key).upper()}")
+            lines.append(str(value))
+    return "\n".join(lines)
+def remove_consecutive_duplicates(messages):
+    unique_messages = []
+    prev_message = None
+    for msg in messages:
+        if msg.content != prev_message:
+            unique_messages.append(msg)
+        prev_message = msg.content  # Update previous message to current
+    return unique_messages

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl

ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl