ai-data-science-team 0.0.0.9006__py3-none-any.whl → 0.0.0.9007__py3-none-any.whl
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +2 -1
- ai_data_science_team/agents/data_cleaning_agent.py +204 -19
- ai_data_science_team/agents/data_visualization_agent.py +331 -0
- ai_data_science_team/agents/data_wrangling_agent.py +56 -11
- ai_data_science_team/agents/feature_engineering_agent.py +40 -11
- ai_data_science_team/agents/sql_database_agent.py +30 -12
- ai_data_science_team/templates/__init__.py +8 -0
- ai_data_science_team/tools/metadata.py +110 -47
- ai_data_science_team/tools/regex.py +6 -0
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA +41 -23
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +21 -0
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/WHEEL +1 -1
- ai_data_science_team-0.0.0.9006.dist-info/RECORD +0 -20
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9006.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/top_level.txt +0 -0
ai_data_science_team/agents/data_wrangling_agent.py

@@ -15,7 +15,7 @@ from langchain_core.messages import BaseMessage
 from langgraph.types import Command
 from langgraph.checkpoint.memory import MemorySaver

-from ai_data_science_team.templates.agent_templates import(
+from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
@@ -23,7 +23,7 @@ from ai_data_science_team.templates.agent_templates import(
     create_coding_agent_graph
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function

@@ -31,7 +31,17 @@ from ai_data_science_team.tools.logging import log_ai_function
 AGENT_NAME = "data_wrangling_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")

-def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
+def make_data_wrangling_agent(
+    model,
+    n_samples=30,
+    log=False,
+    log_path=None,
+    file_name="data_wrangler.py",
+    overwrite = True,
+    human_in_the_loop=False,
+    bypass_recommended_steps=False,
+    bypass_explain_code=False
+):
     """
     Creates a data wrangling agent that can be run on one or more datasets. The agent can be
     instructed to perform common data wrangling steps such as:
@@ -52,11 +62,17 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
     ----------
     model : langchain.llms.base.LLM
         The language model to use to generate code.
+    n_samples : int, optional
+        The number of samples to show in the data summary. Defaults to 30.
+        If you get an error due to maximum tokens, try reducing this number.
+        > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
     log : bool, optional
         Whether or not to log the code generated and any errors that occur.
         Defaults to False.
     log_path : str, optional
         The path to the directory where the log files should be stored. Defaults to "logs/".
+    file_name : str, optional
+        The name of the file to save the response to. Defaults to "data_wrangler.py".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
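Reviewer note: a minimal sketch of the widened signature in use. The ChatOpenAI model is an assumption for illustration; the keyword arguments are the ones this diff adds or documents, and the import path follows the package layout in the file list above:

    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import make_data_wrangling_agent

    llm = ChatOpenAI(model="gpt-4o-mini")  # assumed model; any langchain chat model

    app = make_data_wrangling_agent(
        model=llm,
        n_samples=10,                  # shrink the data summary if you hit the context limit
        log=True,
        file_name="data_wrangler.py",  # where the generated function is logged
        overwrite=True,
    )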
@@ -94,7 +110,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,

     Returns
     -------
-    app : langchain.graphs.
+    app : langchain.graphs.CompiledStateGraph
         The data wrangling agent as a state graph.
     """
     llm = model
@@ -122,7 +138,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
         retry_count: int

     def recommend_wrangling_steps(state: GraphState):
-        print(
+        print(format_agent_name(AGENT_NAME))
         print(" * RECOMMEND WRANGLING STEPS")

         data_raw = state.get("data_raw")
@@ -143,7 +159,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,

         # Create a summary for all datasets
         # We'll include a short sample and info for each dataset
-        all_datasets_summary = get_dataframe_summary(dataframes)
+        all_datasets_summary = get_dataframe_summary(dataframes, n_sample=n_samples)

         # Join all datasets summaries into one big text block
         all_datasets_summary_str = "\n\n".join(all_datasets_summary)
@@ -176,6 +192,7 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,

     Avoid these:
     1. Do not include steps to save files.
+    2. Do not include unrelated user instructions that are not related to the data wrangling.
     """,
             input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
         )
@@ -195,7 +212,34 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,

     def create_data_wrangler_code(state: GraphState):
         if bypass_recommended_steps:
-            print(
+            print(format_agent_name(AGENT_NAME))
+
+            data_raw = state.get("data_raw")
+
+            if isinstance(data_raw, dict):
+                # Single dataset scenario
+                primary_dataset_name = "main"
+                datasets = {primary_dataset_name: data_raw}
+            elif isinstance(data_raw, list) and all(isinstance(item, dict) for item in data_raw):
+                # Multiple datasets scenario
+                datasets = {f"dataset_{i}": d for i, d in enumerate(data_raw, start=1)}
+                primary_dataset_name = "dataset_1"
+            else:
+                raise ValueError("data_raw must be a dict or a list of dicts.")
+
+            # Convert all datasets to DataFrames for inspection
+            dataframes = {name: pd.DataFrame.from_dict(d) for name, d in datasets.items()}
+
+            # Create a summary for all datasets
+            # We'll include a short sample and info for each dataset
+            all_datasets_summary = get_dataframe_summary(dataframes, n_sample=n_samples)
+
+            # Join all datasets summaries into one big text block
+            all_datasets_summary_str = "\n\n".join(all_datasets_summary)
+
+        else:
+            all_datasets_summary_str = state.get("all_datasets_summary")
+
         print(" * CREATE DATA WRANGLER CODE")

         data_wrangling_prompt = PromptTemplate(
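Reviewer note: the normalization in the bypass branch above is worth seeing in isolation. A standalone sketch that mirrors it (the helper name is ours, not the package's):

    import pandas as pd

    def normalize_datasets(data_raw):
        # Mirrors the branch above: a single dict becomes {"main": ...},
        # a list of dicts becomes {"dataset_1": ..., "dataset_2": ...}.
        if isinstance(data_raw, dict):
            datasets = {"main": data_raw}
        elif isinstance(data_raw, list) and all(isinstance(item, dict) for item in data_raw):
            datasets = {f"dataset_{i}": d for i, d in enumerate(data_raw, start=1)}
        else:
            raise ValueError("data_raw must be a dict or a list of dicts.")
        return {name: pd.DataFrame.from_dict(d) for name, d in datasets.items()}

    frames = normalize_datasets([{"a": [1, 2]}, {"b": [3, 4]}])
    print(list(frames))  # ['dataset_1', 'dataset_2']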
@@ -242,16 +286,16 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,

         response = data_wrangling_agent.invoke({
             "recommended_steps": state.get("recommended_steps"),
-            "all_datasets_summary":
+            "all_datasets_summary": all_datasets_summary_str
         })

         response = relocate_imports_inside_function(response)
         response = add_comments_to_top(response, agent_name=AGENT_NAME)

         # For logging: store the code generated
-        file_path,
+        file_path, file_name_2 = log_ai_function(
             response=response,
-            file_name=
+            file_name=file_name,
             log=log,
             log_path=log_path,
             overwrite=overwrite
@@ -260,7 +304,8 @@ def make_data_wrangling_agent(model, log=False, log_path=None, overwrite = True,
         return {
             "data_wrangler_function" : response,
             "data_wrangler_function_path": file_path,
-            "data_wrangler_function_name":
+            "data_wrangler_function_name": file_name_2,
+            "all_datasets_summary": all_datasets_summary_str
         }

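Reviewer note: with the return dict above, a caller can pull the generated function out of the final state. The invocation shape below is an assumption based on typical LangGraph usage; the state keys are the ones in this diff:

    import pandas as pd
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import make_data_wrangling_agent

    llm = ChatOpenAI(model="gpt-4o-mini")       # assumed model
    app = make_data_wrangling_agent(model=llm)

    df = pd.DataFrame({"price": [10, 20], "qty": [1, 3]})  # toy data

    # Assumed LangGraph-style invocation; the state keys below appear in this diff.
    result = app.invoke({
        "user_instructions": "Add a total = price * qty column.",
        "data_raw": df.to_dict(),
    })
    print(result["data_wrangler_function"])  # the generated wrangling code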
ai_data_science_team/agents/feature_engineering_agent.py

@@ -17,7 +17,7 @@ import os
 import io
 import pandas as pd

-from ai_data_science_team.templates.agent_templates import(
+from ai_data_science_team.templates import(
     node_func_execute_agent_code_on_data,
     node_func_human_review,
     node_func_fix_agent_code,
@@ -25,7 +25,7 @@ from ai_data_science_team.templates.agent_templates import(
     create_coding_agent_graph
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
 from ai_data_science_team.tools.metadata import get_dataframe_summary
 from ai_data_science_team.tools.logging import log_ai_function

@@ -35,7 +35,17 @@ LOG_PATH = os.path.join(os.getcwd(), "logs/")

 # * Feature Engineering Agent

-def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
+def make_feature_engineering_agent(
+    model,
+    n_samples=30,
+    log=False,
+    log_path=None,
+    file_name="feature_engineer.py",
+    overwrite = True,
+    human_in_the_loop=False,
+    bypass_recommended_steps=False,
+    bypass_explain_code=False,
+):
     """
     Creates a feature engineering agent that can be run on a dataset. The agent applies various feature engineering
     techniques, such as encoding categorical variables, scaling numeric variables, creating interaction terms,
@@ -61,11 +71,17 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
     ----------
     model : langchain.llms.base.LLM
         The language model to use to generate code.
+    n_samples : int, optional
+        The number of data samples to use for generating the feature engineering code. Defaults to 30.
+        If you get an error due to maximum tokens, try reducing this number.
+        > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
     log : bool, optional
         Whether or not to log the code generated and any errors that occur.
         Defaults to False.
     log_path : str, optional
         The path to the directory where the log files should be stored. Defaults to "logs/".
+    file_name : str, optional
+        The name of the file to save the log to. Defaults to "feature_engineer.py".
     overwrite : bool, optional
         Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
         Defaults to True.
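Reviewer note: same pattern for the feature engineering factory. A sketch under the same assumptions as the data wrangling example above (the model is illustrative; the parameters come from this diff):

    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import make_feature_engineering_agent

    llm = ChatOpenAI(model="gpt-4o-mini")  # assumed model

    app = make_feature_engineering_agent(
        model=llm,
        n_samples=30,
        log=True,
        file_name="feature_engineer.py",
        bypass_recommended_steps=True,  # summarize data_raw directly, skipping the recommend node
    )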
@@ -102,7 +118,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =

     Returns
     -------
-    app : langchain.graphs.
+    app : langchain.graphs.CompiledStateGraph
         The feature engineering agent as a state graph.
     """
     llm = model
@@ -135,7 +151,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         Recommend a series of feature engineering steps based on the input data.
         These recommended steps will be appended to the user_instructions.
         """
-        print(
+        print(format_agent_name(AGENT_NAME))
         print(" * RECOMMEND FEATURE ENGINEERING STEPS")

         # Prompt to get recommended steps from the LLM
@@ -182,6 +198,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =

     Avoid these:
     1. Do not include steps to save files.
+    2. Do not include unrelated user instructions that are not related to the feature engineering.
     """,
             input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
         )
@@ -189,7 +206,7 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         data_raw = state.get("data_raw")
         df = pd.DataFrame.from_dict(data_raw)

-        all_datasets_summary = get_dataframe_summary([df])
+        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)

         all_datasets_summary_str = "\n\n".join(all_datasets_summary)

@@ -217,7 +234,18 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =

     def create_feature_engineering_code(state: GraphState):
         if bypass_recommended_steps:
-            print(
+            print(format_agent_name(AGENT_NAME))
+
+            data_raw = state.get("data_raw")
+            df = pd.DataFrame.from_dict(data_raw)
+
+            all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
+
+            all_datasets_summary_str = "\n\n".join(all_datasets_summary)
+
+        else:
+            all_datasets_summary_str = state.get("all_datasets_summary")
+
         print(" * CREATE FEATURE ENGINEERING CODE")

         feature_engineering_prompt = PromptTemplate(
@@ -272,16 +300,16 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         response = feature_engineering_agent.invoke({
             "recommended_steps": state.get("recommended_steps"),
             "target_variable": state.get("target_variable"),
-            "all_datasets_summary":
+            "all_datasets_summary": all_datasets_summary_str,
         })

         response = relocate_imports_inside_function(response)
         response = add_comments_to_top(response, agent_name=AGENT_NAME)

         # For logging: store the code generated
-        file_path,
+        file_path, file_name_2 = log_ai_function(
             response=response,
-            file_name=
+            file_name=file_name,
             log=log,
             log_path=log_path,
             overwrite=overwrite
@@ -290,7 +318,8 @@ def make_feature_engineering_agent(model, log=False, log_path=None, overwrite =
         return {
             "feature_engineer_function": response,
             "feature_engineer_function_path": file_path,
-            "feature_engineer_function_name":
+            "feature_engineer_function_name": file_name_2,
+            "all_datasets_summary": all_datasets_summary_str
         }

ai_data_science_team/agents/sql_database_agent.py

@@ -14,7 +14,7 @@ import io
 import pandas as pd
 import sqlalchemy as sql

-from ai_data_science_team.templates.agent_templates import(
+from ai_data_science_team.templates import(
     node_func_execute_agent_from_sql_connection,
     node_func_human_review,
     node_func_fix_agent_code,
@@ -22,7 +22,7 @@ from ai_data_science_team.templates.agent_templates import(
     create_coding_agent_graph
 )
 from ai_data_science_team.tools.parsers import PythonOutputParser, SQLOutputParser
-from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
+from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
 from ai_data_science_team.tools.metadata import get_database_metadata
 from ai_data_science_team.tools.logging import log_ai_function

@@ -31,7 +31,16 @@ AGENT_NAME = "sql_database_agent"
 LOG_PATH = os.path.join(os.getcwd(), "logs/")


-def make_sql_database_agent(model, connection, log=False, log_path=None, overwri
+def make_sql_database_agent(
+    model, connection,
+    n_samples = 10,
+    log=False,
+    log_path=None,
+    file_name="sql_database.py",
+    overwrite = True,
+    human_in_the_loop=False, bypass_recommended_steps=False,
+    bypass_explain_code=False
+):
     """
     Creates a SQL Database Agent that can recommend SQL steps and generate SQL code to query a database.

@@ -41,10 +50,16 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri
         The language model to use for the agent.
     connection : sqlalchemy.engine.base.Engine
         The connection to the SQL database.
+    n_samples : int, optional
+        The number of samples to retrieve for each column, by default 10.
+        If you get an error due to maximum tokens, try reducing this number.
+        > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
     log : bool, optional
         Whether to log the generated code, by default False
     log_path : str, optional
         The path to the log directory, by default None
+    file_name : str, optional
+        The name of the file to save the generated code, by default "sql_database.py"
     overwrite : bool, optional
         Whether to overwrite the existing log file, by default True
     human_in_the_loop : bool, optional
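Reviewer note: a sketch of the updated SQL agent factory; the SQLite URL is a placeholder and the model is an assumption, while the keyword arguments are the ones this diff adds or documents:

    import sqlalchemy as sql
    from langchain_openai import ChatOpenAI
    from ai_data_science_team.agents import make_sql_database_agent

    llm = ChatOpenAI(model="gpt-4o-mini")                 # assumed model
    engine = sql.create_engine("sqlite:///northwind.db")  # placeholder database URL

    app = make_sql_database_agent(
        model=llm,
        connection=engine,   # an Engine or an active Connection, per the docstring
        n_samples=10,        # sample values per column in the metadata summary
        file_name="sql_database.py",
    )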
@@ -56,7 +71,7 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri

     Returns
     -------
-    app : langchain.graphs.
+    app : langchain.graphs.CompiledStateGraph
         The data cleaning agent as a state graph.

     Examples
@@ -116,8 +131,8 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri

     def recommend_sql_steps(state: GraphState):

-        print(
-        print(" * RECOMMEND
+        print(format_agent_name(AGENT_NAME))
+        print(" * RECOMMEND STEPS")


         # Prompt to get recommended steps from the LLM
@@ -156,6 +171,8 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri
     2. Do not include steps to modify existing tables, create new tables or modify the database schema.
     3. Do not include steps that alter the existing data in the database.
     4. Make sure not to include unsafe code that could cause data loss or corruption or SQL injections.
+    5. Make sure to not include irrelevant steps that do not help in the SQL agent's data collection and processing. Examples include steps to create new tables, modify the schema, save files, create charts, etc.
+

     """,
             input_variables=["user_instructions", "recommended_steps", "all_sql_database_summary"]
@@ -166,7 +183,7 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri
         conn = connection.connect() if is_engine else connection

         # Get the database metadata
-        all_sql_database_summary = get_database_metadata(conn,
+        all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)

         steps_agent = recommend_steps_prompt | llm

@@ -183,7 +200,7 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri

     def create_sql_query_code(state: GraphState):
         if bypass_recommended_steps:
-            print(
+            print(format_agent_name(AGENT_NAME))
         print(" * CREATE SQL QUERY CODE")

         # Prompt to get the SQL code from the LLM
@@ -228,7 +245,7 @@ def make_sql_database_agent(model, connection, log=False, log_path=None, overwri
         conn = connection.connect() if is_engine else connection

         # Get the database metadata
-        all_sql_database_summary = get_database_metadata(conn,
+        all_sql_database_summary = get_database_metadata(conn, n_samples=n_samples)

         sql_query_code_agent = sql_query_code_prompt | llm | SQLOutputParser()

@@ -259,9 +276,9 @@ def sql_database_pipeline(connection):
         response = add_comments_to_top(response, AGENT_NAME)

         # For logging: store the code generated
-        file_path,
+        file_path, file_name_2 = log_ai_function(
             response=response,
-            file_name=
+            file_name=file_name,
             log=log,
             log_path=log_path,
             overwrite=overwrite
@@ -271,7 +288,8 @@ def sql_database_pipeline(connection):
             "sql_query_code": sql_query_code,
             "sql_database_function": response,
             "sql_database_function_path": file_path,
-            "sql_database_function_name":
+            "sql_database_function_name": file_name_2,
+            "all_sql_database_summary": all_sql_database_summary
         }

     def human_review(state: GraphState) -> Command[Literal["recommend_sql_steps", "create_sql_query_code"]]:
ai_data_science_team/tools/metadata.py

@@ -4,7 +4,9 @@ import sqlalchemy as sql
 from typing import Union, List, Dict

 def get_dataframe_summary(
-    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
+    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+    n_sample: int = 30,
+    skip_stats: bool = False,
 ) -> List[str]:
     """
     Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
@@ -16,6 +18,10 @@ def get_dataframe_summary(
     - Single DataFrame: produce a single summary (returned within a one-element list).
     - List of DataFrames: produce a summary for each DataFrame, using index-based names.
     - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+    n_sample : int, default 30
+        Number of rows to display in the "Data (first 30 rows)" section.
+    skip_stats : bool, default False
+        If True, skip the descriptive statistics and DataFrame info sections.

     Example:
     --------
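Reviewer note: the new parameters are easiest to see directly. A minimal, runnable example of the updated signature:

    import pandas as pd
    from ai_data_science_team.tools.metadata import get_dataframe_summary

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # One summary string per dataframe; skip_stats=True keeps only the shape,
    # dtypes, and the first n_sample rows.
    summaries = get_dataframe_summary({"demo": df}, n_sample=2, skip_stats=True)
    print(summaries[0])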
@@ -49,17 +55,17 @@ def get_dataframe_summary(
     # --- Dictionary Case ---
     if isinstance(dataframes, dict):
         for dataset_name, df in dataframes.items():
-            summaries.append(_summarize_dataframe(df, dataset_name))
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))

     # --- Single DataFrame Case ---
     elif isinstance(dataframes, pd.DataFrame):
-        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset"))
+        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))

     # --- List of DataFrames Case ---
     elif isinstance(dataframes, list):
         for idx, df in enumerate(dataframes):
             dataset_name = f"Dataset_{idx}"
-            summaries.append(_summarize_dataframe(df, dataset_name))
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))

     else:
         raise TypeError(
@@ -69,7 +75,7 @@ def get_dataframe_summary(
     return summaries


-def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
+def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
     """Generate a summary string for a single DataFrame."""
     # 1. Convert dictionary-type cells to strings
     # This prevents unhashable dict errors during df.nunique().
@@ -91,77 +97,134 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
     unique_counts = df.nunique()  # Will no longer fail on unhashable dict
     unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])

-
-
-
-
+    # 6. Generate the summary text
+    if not skip_stats:
+        summary_text = f"""
+        Dataset Name: {dataset_name}
+        ----------------------------
+        Shape: {df.shape[0]} rows x {df.shape[1]} columns

-
-
+        Column Data Types:
+        {column_types}

-
-
+        Missing Value Percentage:
+        {missing_summary}

-
-
+        Unique Value Counts:
+        {unique_counts_summary}

-
-
+        Data (first {n_sample} rows):
+        {df.head(n_sample).to_string()}

-
-
+        Data Description:
+        {df.describe().to_string()}

-
-
-
+        Data Info:
+        {info_text}
+        """
+    else:
+        summary_text = f"""
+        Dataset Name: {dataset_name}
+        ----------------------------
+        Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+        Column Data Types:
+        {column_types}
+
+        Data (first {n_sample} rows):
+        {df.head(n_sample).to_string()}
+        """
+
     return summary_text.strip()


-def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine], n_values: int=10):
-    """
-    Collects metadata and sample data from a database.

-
-
-
+def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
+                          n_samples: int = 10) -> str:
+    """
+    Collects metadata and sample data from a database, with safe identifier quoting and
+    basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
+
+    Parameters
+    ----------
+    connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
         An active SQLAlchemy connection or engine.
-
+    n_samples : int
         Number of sample values to retrieve for each column.

-    Returns
-
-    str
+    Returns
+    -------
+    str
+        A formatted string with database metadata, including some sample data from each column.
     """
+
     # If a connection is passed, use it; if an engine is passed, connect to it
     is_engine = isinstance(connection, sql.engine.base.Engine)
     conn = connection.connect() if is_engine else connection
-    output = []

+    output = []
     try:
-        #
+        # Grab the engine off the connection
         sql_engine = conn.engine
+        dialect_name = sql_engine.dialect.name.lower()
+
         output.append(f"Database Dialect: {sql_engine.dialect.name}")
         output.append(f"Driver: {sql_engine.driver}")
         output.append(f"Connection URL: {sql_engine.url}")
-
+
         # Inspect the database
         inspector = sql.inspect(sql_engine)
-
+        tables = inspector.get_table_names()
+        output.append(f"Tables: {tables}")
         output.append(f"Schemas: {inspector.get_schema_names()}")
-
-        #
-
+
+        # Helper to build a dialect-specific limit clause
+        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
+            """
+            Returns a SQL query string to select N rows from the given column/table
+            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
+            """
+            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
+                # Common dialects supporting LIMIT
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+            elif "mssql" in dialect_name:
+                # Microsoft SQL Server syntax
+                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
+            elif "oracle" in dialect_name:
+                # Oracle syntax
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
+            else:
+                # Fallback
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+
+        # Prepare for quoting
+        preparer = inspector.bind.dialect.identifier_preparer
+
+        # For each table, get columns and sample data
+        for table_name in tables:
             output.append(f"\nTable: {table_name}")
+            # Properly quote the table name
+            table_name_quoted = preparer.quote_identifier(table_name)
+
             for column in inspector.get_columns(table_name):
-
-
-
-
-
+                col_name = column["name"]
+                col_type = column["type"]
+                output.append(f"  Column: {col_name} Type: {col_type}")
+
+                # Properly quote the column name
+                col_name_quoted = preparer.quote_identifier(col_name)
+
+                # Build a dialect-aware query with safe quoting
+                query = build_query(col_name_quoted, table_name_quoted, n_samples)
+
+                # Read a few sample values
+                df = pd.read_sql(sql.text(query), conn)
+                first_values = df[col_name].tolist()
+                output.append(f"  First {n_samples} Values: {first_values}")
+
     finally:
-        # Close connection if
+        # Close connection if created inside the function
         if is_engine:
             conn.close()
-
-    # Join all collected information into a single string
+
     return "\n".join(output)
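Reviewer note: a quick way to exercise the reworked get_database_metadata against a throwaway SQLite file (the demo_metadata.db path is a placeholder):

    import sqlalchemy as sql
    from ai_data_science_team.tools.metadata import get_database_metadata

    # Throwaway file-backed SQLite DB (an in-memory one would not survive the
    # fresh connection the function opens when given an Engine).
    engine = sql.create_engine("sqlite:///demo_metadata.db")
    with engine.connect() as conn:
        conn.execute(sql.text("CREATE TABLE items (id INTEGER, name TEXT)"))
        conn.execute(sql.text("INSERT INTO items VALUES (1, 'a'), (2, 'b')"))
        conn.commit()

    print(get_database_metadata(engine, n_samples=2))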
ai_data_science_team/tools/regex.py

@@ -71,3 +71,9 @@ def add_comments_to_top(code_text, agent_name="data_wrangler"):
     # Join the header with newlines, then prepend to the existing code_text
     header_block = "\n".join(header_comments)
     return header_block + code_text
+
+def format_agent_name(agent_name: str) -> str:
+
+    formatted_name = agent_name.strip().replace("_", " ").upper()
+
+    return f"---{formatted_name}----"
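Reviewer note: the new helper is trivial to sanity-check; note it emits three leading and four trailing dashes, as written in the source:

    from ai_data_science_team.tools.regex import format_agent_name

    print(format_agent_name("data_wrangling_agent"))
    # prints: ---DATA WRANGLING AGENT----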