PyPI - openchatbi - Versions diffs - 0.0.1__py3-none-any.whl - Mend

openchatbi 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

openchatbi/__init__.py +35 -0
openchatbi/agent_graph.py +373 -0
openchatbi/catalog/__init__.py +14 -0
openchatbi/catalog/catalog_loader.py +208 -0
openchatbi/catalog/catalog_store.py +202 -0
openchatbi/catalog/entry.py +5 -0
openchatbi/catalog/factory.py +81 -0
openchatbi/catalog/helper.py +49 -0
openchatbi/catalog/retrival_helper.py +74 -0
openchatbi/catalog/schema_retrival.py +144 -0
openchatbi/catalog/store/__init__.py +3 -0
openchatbi/catalog/store/file_system.py +789 -0
openchatbi/catalog/token_service.py +48 -0
openchatbi/code/docker_executor.py +179 -0
openchatbi/code/executor_base.py +21 -0
openchatbi/code/local_executor.py +21 -0
openchatbi/code/restricted_local_executor.py +47 -0
openchatbi/config.yaml.template +74 -0
openchatbi/config_loader.py +225 -0
openchatbi/constants.py +17 -0
openchatbi/graph_state.py +59 -0
openchatbi/llm/llm.py +94 -0
openchatbi/prompts/agent_prompt.md +48 -0
openchatbi/prompts/extraction_prompt.md +175 -0
openchatbi/prompts/schema_linking_prompt.md +56 -0
openchatbi/prompts/sql_dialect/presto.md +57 -0
openchatbi/prompts/system_prompt.py +92 -0
openchatbi/prompts/text2sql_prompt.md +35 -0
openchatbi/prompts/visualization_prompt.md +34 -0
openchatbi/text2sql/__init__.py +1 -0
openchatbi/text2sql/data.py +12 -0
openchatbi/text2sql/extraction.py +122 -0
openchatbi/text2sql/generate_sql.py +400 -0
openchatbi/text2sql/schema_linking.py +239 -0
openchatbi/text2sql/sql_graph.py +150 -0
openchatbi/text2sql/text2sql_utils.py +57 -0
openchatbi/text2sql/visualization.py +315 -0
openchatbi/tool/ask_human.py +15 -0
openchatbi/tool/mcp_tools.py +257 -0
openchatbi/tool/memory.py +181 -0
openchatbi/tool/run_python_code.py +70 -0
openchatbi/tool/save_report.py +65 -0
openchatbi/tool/search_knowledge.py +107 -0
openchatbi/utils.py +183 -0
openchatbi-0.0.1.dist-info/METADATA +674 -0
openchatbi-0.0.1.dist-info/RECORD +48 -0
openchatbi-0.0.1.dist-info/WHEEL +4 -0
openchatbi-0.0.1.dist-info/licenses/LICENSE +21 -0

openchatbi/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""OpenChatBI core module initialization."""
+import os
+from langgraph.graph.state import CompiledStateGraph
+from openchatbi.config_loader import ConfigLoader
+# Global configuration instance
+config = ConfigLoader()
+# Skip config loading during documentation build
+if not os.environ.get("SPHINX_BUILD"):
+    config.load()
+else:
+    config.set({})
+def get_default_graph():
+    """
+    Build the synchronous mode of the agent graph using default catalog in config.
+    Returns:
+        CompiledStateGraph: Compiled agent graph ready for execution.
+    """
+    if os.environ.get("SPHINX_BUILD"):
+        return None
+    from langgraph.checkpoint.memory import MemorySaver
+    from openchatbi.agent_graph import build_agent_graph_sync
+    from openchatbi.tool.memory import get_sync_memory_store
+    checkpointer = MemorySaver()
+    return build_agent_graph_sync(
+        config.get().catalog_store, checkpointer=checkpointer, memory_store=get_sync_memory_store()
+    )

openchatbi/agent_graph.py ADDED Viewed

@@ -0,0 +1,373 @@
+"""Main agent graph construction and execution logic."""
+import datetime
+import logging
+import traceback
+from collections.abc import Callable
+from typing import Any, Optional
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AIMessage, SystemMessage
+from langchain_core.tools import StructuredTool
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langgraph.constants import START
+from langgraph.errors import GraphInterrupt
+from langgraph.graph import END, StateGraph
+from langgraph.graph.state import CompiledStateGraph
+from langgraph.prebuilt import ToolNode
+from langgraph.store.base import BaseStore
+from langgraph.types import Checkpointer, interrupt, Send
+from pydantic import BaseModel, Field
+from openchatbi import config
+from openchatbi.catalog import CatalogStore
+from openchatbi.constants import datetime_format
+from openchatbi.graph_state import AgentState, InputState, OutputState
+from openchatbi.llm.llm import call_llm_chat_model_with_retry, default_llm
+from openchatbi.prompts.system_prompt import AGENT_PROMPT_TEMPLATE
+from openchatbi.text2sql.sql_graph import build_sql_graph
+from openchatbi.tool.ask_human import AskHuman
+from openchatbi.tool.mcp_tools import create_mcp_tools_sync, get_mcp_tools_async
+from openchatbi.tool.memory import get_memory_tools
+from openchatbi.tool.run_python_code import run_python_code
+from openchatbi.tool.save_report import save_report
+from openchatbi.tool.search_knowledge import search_knowledge, show_schema
+from openchatbi.utils import log
+logger = logging.getLogger(__name__)
+def ask_human(state: AgentState) -> dict[str, Any]:
+    """Node function to ask human for additional information or clarification.
+    Args:
+        state (AgentState): The current graph state containing messages and context.
+    Returns:
+        dict: Updated state with human feedback as a tool message and user input.
+    """
+    tool_call = state["messages"][-1].tool_calls[0]
+    tool_call_id = tool_call["id"]
+    args = tool_call["args"]
+    user_feedback = interrupt({"text": args["question"], "buttons": args.get("options", None)})
+    tool_message = [{"tool_call_id": tool_call_id, "type": "tool", "content": user_feedback}]
+    return {"messages": tool_message, "user_input": user_feedback}
+class CallSQLGraphInput(BaseModel):
+    # Argument schema for the "text2sql" tool (passed as args_schema to StructuredTool).
+    # The Field descriptions are runtime strings, so they are documented here
+    # with comments rather than edited.
+    # Why the tool is being called; the tool functions only log this value.
+    reasoning: str = Field(
+        description="Explanation of why Text2SQL tool is needed",
+    )
+    # Text forwarded to the SQL graph as its "messages" input.
+    context: str = Field(
+        description="""The full context pass to Text2SQL tool, make sure do not miss any potential information that related to user's question.
+        Following the format: History Conversation: (user and assistant history dialog)
+        Information: (the knowledge you retrival that is relevant, like metrics and dimensions)
+        User's latest question:""",
+    )
+# Description string attached to the "text2sql" StructuredTool, used by both the
+# sync and the async variant built in get_sql_tools
+TEXT2SQL_TOOL_DESCRIPTION = """Text2SQL tool to generate and execute SQL query and build visualization DSL for UI
+based on user's question and context.
+Returns:
+    str: A formatted response containing SQL, data, and visualization status.
+Important notes:
+- If user want to change the visualization chart type or style, add the requirement in the question
+- Make sure to provide question in English
+"""
+def _format_sql_response(sql_graph_response: dict) -> str:
+    """Format SQL graph response into a standardized string format.
+    Args:
+        sql_graph_response: The response dictionary from the SQL graph
+    Returns:
+        str: Formatted response string
+    """
+    sql = sql_graph_response.get("sql", "")
+    data = sql_graph_response.get("data", "")
+    visualization_dsl = sql_graph_response.get("visualization_dsl", {})
+    response_parts = []
+    if sql:
+        response_parts.append(f"SQL Query:\n```sql\n{sql}\n```")
+    if data:
+        response_parts.append(f"\nQuery Results (CSV format):\n```csv\n{data}\n```")
+    # Include visualization status
+    if visualization_dsl and "error" not in visualization_dsl:
+        chart_type = visualization_dsl.get("chart_type", "unknown")
+        response_parts.append(
+            f"\nVisualization Created: {chart_type} chart has been automatically generated and will be displayed in the UI."
+        )
+    elif visualization_dsl and "error" in visualization_dsl:
+        response_parts.append(f"\nVisualization Error: {visualization_dsl['error']}")
+    return "\n\n".join(response_parts) if response_parts else "No results returned."
+def get_sql_tools(sql_graph: CompiledStateGraph, sync_mode: bool = False) -> Callable:
+    """Create SQL generation tool from compiled SQL graph.
+    Args:
+        sql_graph (CompiledStateGraph): The compiled SQL generation subgraph.
+        sync_mode (bool): Whether to create synchronous or asynchronous tools
+    Returns:
+        function: Tool function for SQL generation.
+    """
+    def call_sql_graph_sync(reasoning: str, context: str) -> str:
+        """Sync node function for Text2SQL tool"""
+        log(f"Call SQL graph (sync) with reasoning: {reasoning}, context: {context}")
+        try:
+            sql_graph_response = sql_graph.invoke({"messages": context})
+            return _format_sql_response(sql_graph_response)
+        except GraphInterrupt as e:
+            log(f"Sql graph interrupted:\n{repr(e)}")
+            raise e
+        except Exception as e:
+            log(f"Run sql graph error:\n{repr(e)}")
+            traceback.print_exc()
+        return "Error occurred when calling Text2SQL tool."
+    async def call_sql_graph_async(reasoning: str, context: str) -> str:
+        """Async node function for Text2SQL tool"""
+        log(f"Call SQL graph (async) with reasoning: {reasoning}, context: {context}")
+        try:
+            sql_graph_response = await sql_graph.ainvoke({"messages": context})
+            return _format_sql_response(sql_graph_response)
+        except GraphInterrupt as e:
+            log(f"Sql graph interrupted:\n{repr(e)}")
+            raise e
+        except Exception as e:
+            log(f"Run sql graph error:\n{repr(e)}")
+            traceback.print_exc()
+        return "Error occurred when calling Text2SQL tool."
+    if sync_mode:
+        return StructuredTool.from_function(
+            func=call_sql_graph_sync,
+            name="text2sql",
+            description=TEXT2SQL_TOOL_DESCRIPTION,
+            args_schema=CallSQLGraphInput,
+            return_direct=False,
+        )
+    else:
+        return StructuredTool.from_function(
+            coroutine=call_sql_graph_async,
+            name="text2sql",
+            description=TEXT2SQL_TOOL_DESCRIPTION,
+            args_schema=CallSQLGraphInput,
+            return_direct=False,
+        )
+def agent_router(llm: BaseChatModel, tools: list) -> Callable:
+    """Create router function to determine next node based on LLM tool calls.
+    Args:
+        llm (BaseChatModel): The LLM for decision-making.
+        tools: List of tools.
+    Returns:
+        function: Router function that processes state and determines next node.
+    """
+    # OpenAI models support strict tool calling
+    if isinstance(llm, BaseChatOpenAI):
+        llm_with_tools = llm.bind_tools(tools, strict=True)
+    else:
+        llm_with_tools = llm.bind_tools(tools)
+    def _call_model(state: AgentState):
+        messages = state["messages"]
+        system_prompt = AGENT_PROMPT_TEMPLATE.replace(
+            "[time_field_placeholder]", datetime.datetime.now().strftime(datetime_format)
+        )
+        response = call_llm_chat_model_with_retry(
+            llm_with_tools, ([SystemMessage(system_prompt)] + messages), bound_tools=tools, parallel_tool_call=True
+        )
+        agent_next_node = ""
+        if isinstance(response, AIMessage):
+            tool_calls = response.tool_calls
+            print("Tool Call:", ", ".join(tool["name"] for tool in tool_calls))
+            if tool_calls:
+                # Group tool calls by type for parallel routing
+                ask_human_calls = [call for call in tool_calls if call["name"] == "AskHuman"]
+                normal_tool_calls = [call for call in tool_calls if call["name"] != "AskHuman"]
+                # Create Send objects for parallel routing
+                sends = []
+                if ask_human_calls:
+                    # Create message with only AskHuman calls
+                    ask_human_msg = AIMessage(content=response.content, tool_calls=ask_human_calls)
+                    sends.append(Send("ask_human", {"messages": [ask_human_msg]}))
+                if normal_tool_calls:
+                    # Create message with only normal tool calls
+                    tool_msg = AIMessage(content=response.content, tool_calls=normal_tool_calls)
+                    sends.append(Send("use_tool", {"messages": [tool_msg]}))
+                return {"messages": [response], "sends": sends}
+            else:
+                return {"messages": [response], "final_answer": response.content, "agent_next_node": END}
+        elif response is None:
+            return {"messages": [AIMessage("Sorry, the LLM service is currently unavailable.")], "agent_next_node": END}
+        else:
+            return {"messages": [response], "agent_next_node": END}
+    return _call_model
+def _build_graph_core(
+    catalog: CatalogStore,
+    sync_mode: bool,
+    checkpointer: Checkpointer,
+    memory_store: BaseStore,
+    memory_tools: Optional[tuple[Callable, Callable]],
+    mcp_tools: list,
+) -> CompiledStateGraph:
+    """Core graph building logic shared by both sync and async versions.
+    Builds the SQL subgraph, assembles the tool list, and wires three nodes:
+    "router" (LLM call), "ask_human" and "use_tool". Both "ask_human" and
+    "use_tool" lead back to "router"; "router" leaves through route_tools.
+    Args:
+        catalog: Catalog store containing schema information
+        sync_mode: Whether to use synchronous mode for tools and operations
+        checkpointer: The Checkpointer for state persistence
+        memory_store: The BaseStore to use for long-term memory
+        memory_tools: Tuple of (manage_memory_tool, search_memory_tool); when falsy,
+            the pair is created here with get_memory_tools
+        mcp_tools: Pre-initialized MCP tools, appended to the built-in tool list
+    Returns:
+        CompiledStateGraph: Compiled agent graph ready for execution
+    """
+    # The same checkpointer and store are shared with the SQL subgraph
+    sql_graph = build_sql_graph(catalog, checkpointer, memory_store)
+    call_sql_graph_tool = get_sql_tools(sql_graph=sql_graph, sync_mode=sync_mode)
+    # Use provided memory tools or create them
+    if memory_tools:
+        manage_memory_tool, search_memory_tool = memory_tools
+    else:
+        manage_memory_tool, search_memory_tool = get_memory_tools(default_llm, sync_mode=sync_mode, store=memory_store)
+    log(str(mcp_tools))
+    # Tools executed by the ToolNode; AskHuman is deliberately not in this list
+    normal_tools = [
+        search_knowledge,
+        show_schema,
+        call_sql_graph_tool,
+        run_python_code,
+        manage_memory_tool,
+        search_memory_tool,
+        save_report,
+    ] + mcp_tools
+    tool_node = ToolNode(normal_tools)
+    # Define the agent graph
+    graph = StateGraph(AgentState, input_schema=InputState, output_schema=OutputState)
+    # Add nodes to the graph; the LLM is bound to the normal tools plus AskHuman,
+    # whose calls are served by the dedicated "ask_human" node
+    graph.add_node("router", agent_router(default_llm, normal_tools + [AskHuman]))
+    graph.add_node("ask_human", ask_human)
+    graph.add_node("use_tool", tool_node)
+    # Add edges between nodes: both worker nodes always hand control back to the router
+    graph.add_edge(START, "router")
+    graph.add_edge("ask_human", "router")
+    graph.add_edge("use_tool", "router")
+    # Add conditional routing from router node
+    def route_tools(state: AgentState):
+        # Only use sends if the last message came from the router (has tool_calls)
+        last_message = state["messages"][-1] if state["messages"] else None
+        if (
+            last_message
+            and isinstance(last_message, AIMessage)
+            and last_message.tool_calls
+            and "sends" in state
+            and state["sends"]
+        ):
+            return state["sends"]  # Return Send objects for parallel execution
+        elif "agent_next_node" in state:
+            return state["agent_next_node"]  # Return single node name
+        else:
+            # Neither sends nor an explicit next node: finish the run
+            return END
+    graph.add_conditional_edges(
+        "router",
+        route_tools,
+        # mapping of paths to node names (for single routing)
+        {
+            "ask_human": "ask_human",
+            "use_tool": "use_tool",
+            END: END,
+        },
+    )
+    # The name `graph` is rebound from the builder to the compiled graph
+    graph = graph.compile(name="agent_graph", checkpointer=checkpointer, store=memory_store)
+    return graph
+def build_agent_graph_sync(
+    catalog: CatalogStore,
+    checkpointer: Checkpointer = None,
+    memory_store: BaseStore = None,
+) -> CompiledStateGraph:
+    """Build the main agent graph with all nodes and edges (sync version).
+    Args:
+        catalog: Catalog store containing schema information.
+        checkpointer: The Checkpointer for state persistence (short memory). If None, no short memory.
+        memory_store: The BaseStore to use for long-term memory. If None, will auto assign according to sync_mode.
+    Returns:
+        CompiledStateGraph: Compiled agent graph ready for execution.
+    """
+    # Get MCP tools for sync context
+    mcp_tools = create_mcp_tools_sync(config.get().mcp_servers)
+    return _build_graph_core(
+        catalog=catalog,
+        sync_mode=True,
+        checkpointer=checkpointer,
+        memory_store=memory_store,
+        memory_tools=None,  # Always None for sync version - creates its own
+        mcp_tools=mcp_tools,
+    )
+async def build_agent_graph_async(
+    catalog: CatalogStore,
+    checkpointer: Checkpointer = None,
+    memory_store: BaseStore = None,
+    memory_tools: tuple[Callable, Callable] = None,
+) -> CompiledStateGraph:
+    """Build the main agent graph with all nodes and edges (async version).
+    This function is identical to build_agent_graph_sync but properly handles
+    async MCP tool initialization when called from async contexts.
+    Args:
+        catalog: Catalog store containing schema information.
+        checkpointer: The Checkpointer for state persistence (short memory). If None, no short memory.
+        memory_store: The BaseStore to use for long-term memory. If None, will auto assign according to sync_mode.
+        memory_tools: Tuple of (manage_memory_tool, search_memory_tool). If None, creates async tools.
+    Returns:
+        CompiledStateGraph: Compiled agent graph ready for execution.
+    """
+    # Get MCP tools for async context
+    mcp_tools = await get_mcp_tools_async(config.get().mcp_servers)
+    return _build_graph_core(
+        catalog=catalog,
+        sync_mode=False,
+        checkpointer=checkpointer,
+        memory_store=memory_store,
+        memory_tools=memory_tools,
+        mcp_tools=mcp_tools,
+    )

openchatbi/catalog/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Data catalog management module for OpenChatBI."""
+from openchatbi.catalog.catalog_loader import (
+    DataCatalogLoader,
+    load_catalog_from_data_warehouse,
+)
+from openchatbi.catalog.catalog_store import CatalogStore
+from openchatbi.catalog.factory import create_catalog_store
+__all__ = [
+    "CatalogStore",
+    "DataCatalogLoader",
+    "load_catalog_from_data_warehouse",
+]

openchatbi/catalog/catalog_loader.py ADDED Viewed

@@ -0,0 +1,208 @@
+import logging
+from typing import Any
+from sqlalchemy import MetaData, inspect
+from sqlalchemy.engine import Engine
+from .catalog_store import CatalogStore
+logger = logging.getLogger(__name__)
+class DataCatalogLoader:
+    """
+    The loader to load data catalog from data warehouse metadata and save to catalog store.
+    """
+    def __init__(self, engine: Engine, include_tables: list[str] | None = None):
+        """
+        Initialize catalog loader.
+        Args:
+            engine (Engine): SQLAlchemy engine instance
+            include_tables (Optional[List[str]]): List of table names to include, None for all
+        """
+        self.engine = engine
+        self.include_tables = include_tables
+        self.metadata = MetaData()
+        self.inspector = inspect(engine)
+    def get_tables_and_columns(self) -> dict[str, list[dict[str, Any]]]:
+        """
+        Extract table and column metadata including comments using SQLAlchemy inspector.
+        Returns:
+            Dict[str, List[Dict[str, Any]]]: Dictionary mapping table names to list of column information
+        """
+        try:
+            tables_columns = {}
+            # Get all table names
+            table_names = self.inspector.get_table_names()
+            # Filter to specific tables if configured
+            if self.include_tables:
+                table_names = [name for name in table_names if name in self.include_tables]
+            logger.info(f"Found {len(table_names)} tables to process")
+            for table_name in table_names:
+                try:
+                    # Get column information for the table
+                    columns = self.inspector.get_columns(table_name)
+                    column_list = []
+                    for column in columns:
+                        is_common_column = column not in ("id", "name", "type", "status")
+                        column_info = {
+                            "column_name": column["name"],
+                            "display_name": "",
+                            "alias": "",
+                            "type": str(column["type"]),
+                            "category": "",
+                            "tag": "",
+                            "description": column.get("comment", "") or "",
+                            "dimension_table": "",
+                            "default": str(column.get("default", "")) if column.get("default") is not None else "",
+                            "is_common": is_common_column,
+                        }
+                        column_list.append(column_info)
+                    tables_columns[table_name] = column_list
+                    logger.debug(f"Processed table {table_name} with {len(column_list)} columns")
+                except Exception as e:
+                    logger.error(f"Failed to process table {table_name}: {e}")
+                    continue
+            logger.info(f"Successfully processed {len(tables_columns)} tables")
+            return tables_columns
+        except Exception as e:
+            logger.error(f"Failed to get tables and columns from data warehouse: {e}")
+            return {}
+    def get_table_indexes(self, table_name: str) -> list[dict[str, Any]]:
+        """
+        Get index information for a specific table.
+        Args:
+            table_name (str): Name of the table
+        Returns:
+            List[Dict[str, Any]]: List of index information
+        """
+        try:
+            indexes = self.inspector.get_indexes(table_name)
+            return indexes
+        except Exception as e:
+            logger.warning(f"Failed to get indexes for table {table_name}: {e}")
+            return []
+    def get_foreign_keys(self, table_name: str) -> list[dict[str, Any]]:
+        """
+        Get foreign key information for a specific table.
+        Args:
+            table_name (str): Name of the table
+        Returns:
+            List[Dict[str, Any]]: List of foreign key information
+        """
+        try:
+            foreign_keys = self.inspector.get_foreign_keys(table_name)
+            return foreign_keys
+        except Exception as e:
+            logger.warning(f"Failed to get foreign keys for table {table_name}: {e}")
+            return []
+    def save_to_catalog_store(
+        self, catalog_store: CatalogStore, database_name: str | None = None, update: bool = False
+    ) -> bool:
+        """
+        Extract warehouse metadata and save to catalog store.
+        Args:
+            catalog_store (CatalogStore): Target catalog store to load data to
+            database_name (Optional[str]): Database name in catalog, defaults to 'default'
+            update (bool): Update existing catalog store to sync with data warehouse
+        Returns:
+            bool: True if load was successful, False otherwise
+        """
+        try:
+            if database_name is None:
+                database_name = "default"
+            # Get tables and columns from data warehouse
+            tables_columns = self.get_tables_and_columns()
+            if not tables_columns:
+                logger.warning("No tables found in data warehouse")
+                return True
+            # Import each table
+            success_count = 0
+            total_count = len(tables_columns)
+            for table_name, columns in tables_columns.items():
+                try:
+                    # Get table comment if available
+                    table_comment = ""
+                    try:
+                        table_info = self.inspector.get_table_comment(table_name)
+                        table_comment = table_info.get("text", "") if table_info else ""
+                    except Exception:
+                        # Some databases don't support table comments
+                        pass
+                    table_info = {"description": table_comment, "selection_rule": "", "sql_rule": ""}
+                    if catalog_store.save_table_information(table_name, table_info, columns, database_name):
+                        success_count += 1
+                        logger.info(f"Successfully loaded table: {database_name}.{table_name}")
+                    else:
+                        logger.error(f"Failed to load table: {database_name}.{table_name}")
+                    # init null SQL examples
+                    catalog_store.save_table_sql_examples(
+                        table_name, [{"question": "null", "answer": "null"}], database_name
+                    )
+                except Exception as e:
+                    logger.error(f"Error loading table {table_name}: {e}")
+            # init empty table selection examples
+            catalog_store.save_table_selection_examples([("", [])])
+            logger.info(f"Load completed: {success_count}/{total_count} tables loaded successfully")
+            return success_count == total_count
+        except Exception as e:
+            logger.error(f"Failed to load data warehouse to catalog store: {e}")
+            return False
+def load_catalog_from_data_warehouse(catalog_store: CatalogStore) -> bool:
+    """
+    Load catalog data from data warehouse using SQLAlchemy based on data warehouse config (URI)
+    Main entry point for catalog loading.
+    Args:
+        catalog_store (CatalogStore): Target catalog store
+    Returns:
+        bool: True if load was successful, False otherwise
+    """
+    try:
+        data_warehouse_config = catalog_store.get_data_warehouse_config()
+        database_uri = data_warehouse_config.get("uri")
+        include_tables = data_warehouse_config.get("include_tables")
+        database_name = data_warehouse_config.get("database_name", "default")
+        engine = catalog_store.get_sql_engine()
+        loader = DataCatalogLoader(engine, include_tables)
+        return loader.save_to_catalog_store(catalog_store, database_name)
+    except Exception as e:
+        logger.error(f"Failed to import catalog from data warehouse URI {database_uri}: {e}")
+        return False