nl2sql-agents 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nl2sql_agents/__init__.py +9 -0
- nl2sql_agents/agents/__init__.py +0 -0
- nl2sql_agents/agents/base_agent.py +74 -0
- nl2sql_agents/agents/discovery/__init__.py +0 -0
- nl2sql_agents/agents/discovery/discovery_agent.py +117 -0
- nl2sql_agents/agents/discovery/fk_graph_agent.py +75 -0
- nl2sql_agents/agents/discovery/keyword_agent.py +61 -0
- nl2sql_agents/agents/discovery/semantic_agent.py +61 -0
- nl2sql_agents/agents/explainer/__init__.py +0 -0
- nl2sql_agents/agents/explainer/explainer_agent.py +45 -0
- nl2sql_agents/agents/explainer/explanation_agent.py +32 -0
- nl2sql_agents/agents/explainer/optimization_agent.py +31 -0
- nl2sql_agents/agents/explainer/safety_report_agent.py +42 -0
- nl2sql_agents/agents/query_generator.py +133 -0
- nl2sql_agents/agents/schema_formatter.py +69 -0
- nl2sql_agents/agents/validator/__init__.py +0 -0
- nl2sql_agents/agents/validator/logic_validator.py +59 -0
- nl2sql_agents/agents/validator/performance_validator.py +74 -0
- nl2sql_agents/agents/validator/security_validator.py +51 -0
- nl2sql_agents/agents/validator/syntax_validator.py +74 -0
- nl2sql_agents/agents/validator/validator_agent.py +104 -0
- nl2sql_agents/cli.py +291 -0
- nl2sql_agents/config/__init__.py +0 -0
- nl2sql_agents/config/settings.py +66 -0
- nl2sql_agents/db/__init__.py +0 -0
- nl2sql_agents/db/connector.py +107 -0
- nl2sql_agents/filters/__init__.py +0 -0
- nl2sql_agents/filters/gate.py +62 -0
- nl2sql_agents/filters/security_filter.py +36 -0
- nl2sql_agents/models/__init__.py +0 -0
- nl2sql_agents/models/schemas.py +120 -0
- nl2sql_agents/orchestrator/__init__.py +0 -0
- nl2sql_agents/orchestrator/nodes.py +142 -0
- nl2sql_agents/orchestrator/pipeline.py +70 -0
- nl2sql_agents/py.typed +0 -0
- nl2sql_agents-0.1.0.dist-info/METADATA +540 -0
- nl2sql_agents-0.1.0.dist-info/RECORD +40 -0
- nl2sql_agents-0.1.0.dist-info/WHEEL +4 -0
- nl2sql_agents-0.1.0.dist-info/entry_points.txt +2 -0
- nl2sql_agents-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""
|
|
2
|
+
nl2sql-agents — Multi-Agent Natural Language to SQL System.
|
|
3
|
+
|
|
4
|
+
A sophisticated multi-agent orchestration system that converts natural
|
|
5
|
+
language queries into safe, optimized SQL using LangGraph and OpenRouter LLMs.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
__all__ = ["__version__"]
|
|
File without changes
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BASE AGNET - Abstract Interface for all LLM agents.
|
|
3
|
+
|
|
4
|
+
hanldes:
|
|
5
|
+
- Async LLM calls via ChatOpenAI (langhcain-openai)
|
|
6
|
+
- provider-aware: each agent recieves an LLM provider
|
|
7
|
+
- token usage logging
|
|
8
|
+
|
|
9
|
+
Each agent implements:
|
|
10
|
+
- build_prompt()
|
|
11
|
+
- parse_response()
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from typing import Any
|
|
17
|
+
from langchain_openai import ChatOpenAI
|
|
18
|
+
from nl2sql_agents.config.settings import LLMProvider, PRIMARY_PROVIDER
|
|
19
|
+
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
def _to_langchain_messages(messages: list[dict[str, str]]) -> list[BaseMessage]:
    """Convert role/content dicts into LangChain message objects.

    A "system" role maps to SystemMessage; every other role (including
    "user" and anything unrecognized) falls back to HumanMessage.
    """
    converted: list[BaseMessage] = []
    for msg in messages:
        if msg["role"] == "system":
            msg_cls = SystemMessage
        else:
            msg_cls = HumanMessage
        converted.append(msg_cls(content=msg["content"]))
    return converted
|
|
26
|
+
|
|
27
|
+
class BaseAgent(ABC):
    """Abstract base for all LLM-backed agents.

    Holds the provider and default model and implements the shared async
    LLM-call path; subclasses supply build_prompt() and parse_response().
    """

    def __init__(self, provider: LLMProvider = PRIMARY_PROVIDER) -> None:
        self.provider = provider
        # Model name comes from the provider's configured default.
        self.model_name = provider.default_model

    @abstractmethod
    def build_prompt(self, *args, **kwargs) -> list[dict[str, str]]:
        """Return the chat messages ([{role, content}, ...]) for this agent's task."""
        raise NotImplementedError("Abstract Method build_prompt not implemented")

    @abstractmethod
    def parse_response(self, raw: str) -> Any:
        """Convert the raw LLM completion text into this agent's output type."""
        raise NotImplementedError("Abstract Method ParseResponse not implemented")

    def _get_llm(self, temperature: float=0.3, max_tokens: int = 2048) -> ChatOpenAI:
        """Build a provider-configured ChatOpenAI client for a single call."""
        return self.provider.chat_model(temperature=temperature, max_tokens=max_tokens)

    async def call_llm(
        self,
        messages: list[dict[str, str]],
        temperature: float = 0.3,
        max_tokens: int = 2048
    ) -> str:
        """Send `messages` to the LLM and return the stripped completion text.

        Logs the outgoing request at DEBUG and, when the response carries
        usage metadata, the prompt/completion token counts.
        """
        logger.debug(
            "%s -> LLM (model=%s, temp=%.1f, msgs=%d)", self.__class__.__name__, self.model_name, temperature, len(messages)
        )

        llm = self._get_llm(temperature=temperature, max_tokens=max_tokens)
        lc_messages = _to_langchain_messages(messages)

        response = await llm.ainvoke(lc_messages)

        # usage_metadata may be absent depending on provider/model.
        if response.usage_metadata:
            logger.debug("%s ← LLM (prompt=%d, completion=%d tokens)",
                self.__class__.__name__,
                response.usage_metadata.get("input_tokens", 0),
                response.usage_metadata.get("output_tokens", 0),
            )

        return response.content.strip()

    async def execute(self, *args, **kwargs) -> Any:
        """Convenience pipeline: build_prompt -> call_llm -> parse_response.

        NOTE(review): temperature/max_tokens are read from kwargs, which
        are also forwarded to build_prompt — implementations are expected
        to absorb extras via **kwargs.
        """
        messages = self.build_prompt(*args, **kwargs)
        raw = await self.call_llm(
            messages,
            temperature=kwargs.get("temperature", 0.3),
            max_tokens=kwargs.get("max_tokens", 2048)
        )
        return self.parse_response(raw)
|
|
File without changes
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DISCOVERY AGENT
|
|
3
|
+
|
|
4
|
+
1. Keyword pre-filter (no llm)
|
|
5
|
+
- runs KeywordAgent on all tables
|
|
6
|
+
- Takes top KEYWORD_PRE_FILTER_TOP_N by keyword score
|
|
7
|
+
2. PARALLEL - Semantic + FK (on filtered tables only.)
|
|
8
|
+
- runs SemanticAgent + FKGraphAgent in parallel on filtered tables
|
|
9
|
+
- Merges all 3 scores with configurable weights
|
|
10
|
+
- returns full ranked list
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import asyncio
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
|
|
17
|
+
from .keyword_agent import KeywordAgent
|
|
18
|
+
from .fk_graph_agent import FKGraphAgent
|
|
19
|
+
from .semantic_agent import SemanticAgent
|
|
20
|
+
from nl2sql_agents.config.settings import KEYWORD_PRE_FILTER_TOP_N
|
|
21
|
+
from nl2sql_agents.models.schemas import TableMetaData, DiscoveryResult, ScoredTable
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Relative weight of each discovery signal in the merged score.
WEIGHTS = {
    "keywords" : 0.35,
    "semantic" : 0.45,
    "fk_graph" : 0.20
}

class DiscoveryAgent:
    """Ranks candidate tables for a user query using three signals.

    Phase 1: cheap keyword pre-filter (no LLM) keeps the top-N tables.
    Phase 2: semantic-similarity and FK-graph scoring run concurrently on
    the pre-filtered tables; the three scores are merged using WEIGHTS.
    """

    def __init__(self) -> None:
        self.keyword_agent = KeywordAgent()
        self.semantic_agent = SemanticAgent()
        self.fk_graph_agent = FKGraphAgent()

    async def run(
        self,
        tables: list[TableMetaData],
        user_query: str,
        pre_filter_n: int = KEYWORD_PRE_FILTER_TOP_N
    ) -> DiscoveryResult:
        """Score `tables` against `user_query` and return the ranked result.

        Returns a DiscoveryResult with the full ranked list plus the
        top-5 tables extracted for convenience.
        """
        # 1. keyword pre-filter: keep only the best pre_filter_n tables
        logger.info('DiscoveryAgent: Keyword prefilter on %d tables', len(tables))

        kw_scores = await self.keyword_agent.score(tables, user_query)
        sorted_by_kw = sorted(kw_scores.items(), key=lambda x: x[1], reverse=True)
        top_n_names = {name for name, _ in sorted_by_kw[:pre_filter_n]}
        pre_filtered = [t for t in tables if t.table_name in top_n_names]

        logger.info('DiscoveryAgent phase 1: %d -> %d tables', len(tables), len(pre_filtered))

        # 2. semantic + FK scoring run concurrently on the reduced set
        logger.info('DiscoveryAgent phase2: Semantic + FK in parallel on %d tables', len(pre_filtered))

        sem_scores, fk_scores = await asyncio.gather(
            self.semantic_agent.score(pre_filtered, user_query),
            self.fk_graph_agent.score(pre_filtered, user_query)
        )

        merged = self._merge_and_rank(pre_filtered, kw_scores, sem_scores, fk_scores)

        logger.info('DiscoveryAgent: ranked %d tables, top-5 = %s', len(merged), [s.table.table_name for s in merged[:5]])

        return DiscoveryResult(
            top_tables=[s.table for s in merged[:5]],
            scored_tables=merged
        )

    def _merge_and_rank(
        self,
        tables: list[TableMetaData],
        kw: dict[str, float],
        sem: dict[str, float],
        fk: dict[str, float]
    ) -> list[ScoredTable]:
        """Weight-combine the three score maps and sort descending by score.

        `kw` was computed before the pre-filter, so it may mention tables
        outside `tables`; those entries are skipped. `sem`/`fk` were
        computed on the filtered list already.
        """
        # PERF FIX: the original rebuilt {t.table_name for t in tables}
        # inside the kw loop — O(n) per entry, O(n^2) overall. Build the
        # name->table map once and reuse it for filtering and ranking.
        table_map = {t.table_name: t for t in tables}

        agg: dict[str, dict] = defaultdict(
            lambda: {"score": 0.0, "found_by": []}
        )

        for name, score in kw.items():
            if name not in table_map:
                continue
            agg[name]["score"] += score * WEIGHTS["keywords"]
            if score > 0:
                agg[name]["found_by"].append("keyword")

        for name, score in sem.items():
            agg[name]["score"] += score * WEIGHTS["semantic"]
            if score > 0:
                agg[name]["found_by"].append("semantic")

        for name, score in fk.items():
            agg[name]["score"] += score * WEIGHTS["fk_graph"]
            if score > 0:
                agg[name]["found_by"].append("fk_graph")

        ranked = sorted(
            [
                ScoredTable(
                    table=table_map[name],
                    score=round(data['score'], 4),
                    found_by=list(set(data['found_by'])),
                )
                for name, data in agg.items() if name in table_map
            ],
            key=lambda a: a.score,
            reverse=True
        )

        return ranked
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sub-Agent 1c - Foreign Key Graph Agent
|
|
3
|
+
|
|
4
|
+
Builds Bi-directional foriegn key graph from table metadata.
|
|
5
|
+
BFS-walk from seed tables, and scores by graph distance.
|
|
6
|
+
|
|
7
|
+
| Distance from Seed | Score | Interpretation |
|
|
8
|
+
| -------------------------- | ----- | --------------------------- |
|
|
9
|
+
| 0 (seed itself) | 1.0 | Directly mentioned in query |
|
|
10
|
+
| 1 (neighbors) | 0.5 | Directly linked via FK |
|
|
11
|
+
| 2 (neighbors of neighbors) | 0.25 | Two hops away |
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from collections import deque
|
|
17
|
+
from nl2sql_agents.models.schemas import TableMetaData
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Maximum BFS distance from a seed table that still earns a score.
MAX_DEPTH = 2

class FKGraphAgent:
    """Scores tables by foreign-key proximity to tables named in the query.

    Builds a bidirectional FK graph, seeds a BFS with tables whose name
    parts appear in the query, and scores reached tables as 1 / 2**depth.
    """

    async def score(self, tables: list[TableMetaData], user_query: str) -> dict[str, float]:
        """Return {table_name: score} for tables within MAX_DEPTH of a seed."""
        graph = self._build_fk_graph(tables)
        seeds = self._find_seeds(tables, user_query)

        logger.debug('FKGraphAgent: seeds=%s', seeds)
        return self._bfs_score(seeds, graph, MAX_DEPTH)

    def _build_fk_graph(self, tables: list[TableMetaData]) -> dict[str, set[str]]:
        """Build an undirected adjacency map from FK columns.

        The reverse edge is only added when the referenced table is itself
        part of `tables`.
        """
        graph: dict[str, set[str]] = {t.table_name: set() for t in tables}

        for table in tables:
            for col in table.columns:
                # BUG FIX: the original guarded on col.reference_column but
                # then used col.reference_table, so an FK column with no
                # reference_table inserted None as a graph node. Guard on
                # the attribute that is actually consumed.
                if col.is_foreign_key and col.reference_table:
                    graph[table.table_name].add(col.reference_table)
                    if col.reference_table in graph:
                        graph[col.reference_table].add(table.table_name)
        return graph

    def _find_seeds(self, tables: list[TableMetaData], user_query: str) -> list[str]:
        """seed = table whose name-parts appear in the user query.

        Name parts shorter than 3 characters are ignored to avoid noise.
        """
        query_lower = user_query.lower()
        seeds = []
        for table in tables:
            parts = table.table_name.lower().replace('_', ' ').split()
            if any(part in query_lower for part in parts if len(part) > 2):
                seeds.append(table.table_name)
        return seeds

    def _bfs_score(self, seeds: list[str], graph: dict[str, set[str]], max_depth: int) -> dict[str, float]:
        """Breadth-first walk from `seeds`, scoring each node 1 / 2**depth.

        Each node keeps the best (closest-distance) score it was reached with.
        """
        scores: dict[str, float] = {}
        visited: set[str] = set()
        queue: deque[tuple[str, int]] = deque()

        for seed in seeds:
            queue.append((seed, 0))
            visited.add(seed)

        while queue:
            node, depth = queue.popleft()
            if depth > max_depth:
                continue
            score = 1.0 / (2 ** depth)
            scores[node] = max(scores.get(node, 0.0), score)

            for neighbor in graph.get(node, []):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, depth + 1))

        return scores
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sub-Agent 1a - Keyword Agent (No LLM)
|
|
3
|
+
|
|
4
|
+
Extract Meaningful tokens from user query and fuzzy-matches aginst table names and column names. Returns {tablename: score}
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
import logging
|
|
10
|
+
from difflib import SequenceMatcher
|
|
11
|
+
from nl2sql_agents.models.schemas import TableMetaData
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Query tokens that carry no table/column signal and are never matched.
STOP_WORDS = {
    "show", "me", "get", "find", "list", "give", "the", "a", "an",
    "of", "for", "in", "on", "by", "with", "from", "where", "top",
    "all", "my", "this", "that", "and", "or", "is", "are", "was",
    "how", "many", "much", "what", "which", "who",
}

class KeywordAgent:
    """Scores tables by fuzzy keyword overlap with the user query (no LLM)."""

    async def score(
        self,
        tables: list[TableMetaData],
        user_query: str
    ) -> dict[str, float]:
        """Return {table_name: score} for every table in `tables`."""
        terms = self._extract_keywords(user_query)
        logger.debug('KeywordAgent: Keywords=%s', terms)

        scores: dict[str, float] = {}
        for table in tables:
            scores[table.table_name] = self._score_table(table, terms)
        return scores

    def _extract_keywords(self, query: str) -> list[str]:
        """Tokenize the query; drop stop words and tokens of length <= 2."""
        meaningful = []
        for token in re.findall(r"[a-zA-Z]+", query.lower()):
            if token in STOP_WORDS or len(token) <= 2:
                continue
            meaningful.append(token)
        return meaningful

    def _score_table(self, table: TableMetaData, keywords: list[str]) -> float:
        """Average, per keyword, of the best fuzzy match against the table
        name or any of its column names. 0.0 when there are no keywords."""
        if not keywords:
            return 0.0

        targets = [table.table_name.lower()]
        targets.extend(c.column_name.lower() for c in table.columns)

        per_keyword = [
            max(self._fuzzy_score(kw, target) for target in targets)
            for kw in keywords
        ]
        return round(sum(per_keyword) / len(per_keyword), 4)

    def _fuzzy_score(self, keyword: str, target: str) -> float:
        """1.0 on a substring hit, else difflib similarity ratio."""
        return 1.0 if keyword in target else SequenceMatcher(None, keyword, target).ratio()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sub-Agent 1b - Semantic Agent
|
|
3
|
+
|
|
4
|
+
Embeds the user query and table descriptions using OpenAIEmbeddings.
|
|
5
|
+
Return Cosine similarity scores: {table_name: similarity}
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import numpy as np
|
|
10
|
+
from nl2sql_agents.models.schemas import TableMetaData
|
|
11
|
+
from nl2sql_agents.config.settings import EMBEDDING_PROVIDER
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
16
|
+
va, vb = np.array(a), np.array(b)
|
|
17
|
+
# Calculate product of magnitudes (L2 norms)
|
|
18
|
+
# ||a|| * ||b||
|
|
19
|
+
norm = np.linalg.norm(va) * np.linalg.norm(vb)
|
|
20
|
+
|
|
21
|
+
"""
|
|
22
|
+
# Two similar vectors (both about "sales")
|
|
23
|
+
a = [0.5, 0.8, 0.2, 0.1] # embedding for "revenue"
|
|
24
|
+
b = [0.6, 0.7, 0.3, 0.2] # embedding for "sales"
|
|
25
|
+
|
|
26
|
+
# Dot product: 0.5*0.6 + 0.8*0.7 + 0.2*0.3 + 0.1*0.2 = 0.88
|
|
27
|
+
# ||a|| = sqrt(0.25 + 0.64 + 0.04 + 0.01) = 0.949
|
|
28
|
+
# ||b|| = sqrt(0.36 + 0.49 + 0.09 + 0.04) = 0.990
|
|
29
|
+
# Result: 0.88 / (0.949 * 0.990) ≈ 0.94 ← very similar!
|
|
30
|
+
|
|
31
|
+
# Two unrelated vectors
|
|
32
|
+
c = [0.9, 0.1, 0.0, 0.0] # embedding for "apple" (fruit)
|
|
33
|
+
d = [0.1, 0.9, 0.8, 0.5] # embedding for "car" (vehicle)
|
|
34
|
+
# Result: ≈ 0.15 ← not similar
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
return float(np.dot(va, vb)/norm) if norm > 0 else 0.0
|
|
38
|
+
|
|
39
|
+
class SemanticAgent:
    """Ranks tables by embedding similarity between the query and table text."""

    def __init__(self) -> None:
        # Embedding client supplied by the configured provider.
        self.embeddings = EMBEDDING_PROVIDER.embeddings_model()

    async def score(
        self, tables: list[TableMetaData], user_query: str
    ) -> dict[str, float]:
        """Return {table_name: cosine similarity to the query}, rounded to 4 dp."""
        logger.debug("SemanticAgent: embedding query + %d tables", len(tables))

        # Single batched embedding call: the query first, then one text per table.
        texts = [user_query]
        texts.extend(self._table_to_text(t) for t in tables)

        vectors = await self.embeddings.aembed_documents(texts)
        query_vec, table_vecs = vectors[0], vectors[1:]

        result: dict[str, float] = {}
        for table, vec in zip(tables, table_vecs):
            result[table.table_name] = round(_cosine_similarity(query_vec, vec), 4)
        return result

    def _table_to_text(self, table: TableMetaData) -> str:
        """Serialize a table (name plus up to 20 column names) for embedding."""
        col_names = ','.join(c.column_name for c in table.columns[:20])
        return f"Table {table.table_name}: columns {col_names}"
|
|
File without changes
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AGENT 5 - EXPLAINER AGENT
|
|
3
|
+
|
|
4
|
+
runs PARALLEL - 3 output tasks concurrently:
|
|
5
|
+
- Explanation (Plain English)
|
|
6
|
+
- Safety Report (audit from validation report)
|
|
7
|
+
- Optimization Hints (LLM)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
from nl2sql_agents.agents.explainer.explanation_agent import ExplanationAgent
|
|
14
|
+
from nl2sql_agents.agents.explainer.optimization_agent import OptimizationAgent
|
|
15
|
+
from nl2sql_agents.agents.explainer.safety_report_agent import SafetyReportAgent
|
|
16
|
+
from nl2sql_agents.models.schemas import SQLCandidate, CandidateValidationResult, ExplainerOutput
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
class ExplainerAgent:
    """Fan-out agent producing the three user-facing outputs concurrently.

    Runs the explanation (LLM), safety report (no LLM), and optimization
    hints (LLM) in parallel and bundles them into an ExplainerOutput.
    """

    def __init__(self) -> None:
        self.explanation = ExplanationAgent()
        self.safety_report = SafetyReportAgent()
        self.optimization = OptimizationAgent()

    async def explain(
        self,
        candidate: SQLCandidate,
        validation_results: list[CandidateValidationResult],
        user_query: str = "",
    ) -> ExplainerOutput:
        """Build explanation, safety report, and hints for the chosen candidate."""
        # FIX: log message previously read "3 output taaks".
        logger.info("ExplainerAgent: 3 output tasks in PARALLEL")

        explanation, safety, hints = await asyncio.gather(
            self.explanation.run(candidate.sql, user_query),
            self.safety_report.run(candidate, validation_results),
            self.optimization.run(candidate.sql)
        )

        return ExplainerOutput(
            explanation=explanation,
            safety_report=safety,
            optimization_hints=hints
        )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Explanation Agent - Translates SQL into plain English
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from nl2sql_agents.agents.base_agent import BaseAgent
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
SYSTEM_PROMPT = """You are a helpful data analyst explaining SQL to a business user.
Given a SQL query and the original question, explain in 2-4 plain English sentences what the query does. Do not include SQL syntax in your explanation."""


class ExplanationAgent(BaseAgent):
    """LLM agent that turns a SQL query into a short plain-English summary."""

    def build_prompt(self, sql: str = "", user_query: str = "", **_) -> list[dict[str, str]]:
        """Compose the system/user message pair for the explanation call."""
        user_prompt = (
            f"Original question: {user_query}\n\n"
            f"SQL: \n{sql}\n\n"
            "Explain in plain English."
        )

        return [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]

    def parse_response(self, raw: str) -> str:
        """The explanation is used verbatim, minus surrounding whitespace."""
        return raw.strip()

    async def run(self, sql: str, user_query: str) -> str:
        """Build the prompt and ask the LLM for a short explanation."""
        return await self.call_llm(
            self.build_prompt(sql=sql, user_query=user_query),
            temperature=0.3,
            max_tokens=300,
        )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OPTIMIZATION AGENT - LLM SUGGESTS INDEX/PERFORMANCE IMPROVMENTS
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from nl2sql_agents.agents.base_agent import BaseAgent
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
SYSTEM_PROMPT = """You are a database performance tuning expert.
Given a SQL query, suggest 1-3 concrete optimization hints.
Be specific and actionable. If well-optimized, say so.
Keep each hint to one sentence."""

class OptimizationAgent(BaseAgent):
    """LLM agent that suggests index/performance improvements for a query."""

    async def run(self, sql: str) -> str:
        """Return 1-3 one-sentence optimization hints for `sql`."""
        messages = self.build_prompt(sql=sql)
        # Low temperature: hints should be concrete and repeatable.
        return await self.call_llm(
            messages=messages,
            temperature=0.2,
            max_tokens=200
        )

    def build_prompt(self, sql: str = "", **_) -> list[dict[str, str]]:
        """Compose the system/user message pair for the tuning call."""
        return [
            {"role": "system", "content": SYSTEM_PROMPT},
            # FIX: user message previously read "Optimize this SQl:".
            {"role": "user", "content": f"Optimize this SQL: \n\n{sql}"}
        ]

    def parse_response(self, raw: str) -> str:
        """Hints are returned as plain text, stripped."""
        return raw.strip()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SAFETY REPORT AGENT - Generates structured audit report from validation results.
|
|
3
|
+
|
|
4
|
+
No LLM calls - purely derived from check date.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from nl2sql_agents.models.schemas import SQLCandidate, CandidateValidationResult
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Status glyphs for report lines.
# NOTE(review): values appear blank/whitespace — possibly emoji lost in
# transit; confirm the intended icons.
ICONS={"passed": "", "warned": " ", "failed":""}
# Display order of validator checks in the report.
CHECK_ORDER = ["security", "syntax", "logic", "performance"]

class SafetyReportAgent:
    """Builds a plain-text audit report from validation results (no LLM)."""

    async def run(self, candidate: SQLCandidate, all_results: list[CandidateValidationResult]) -> str:
        """Render the report for the winning candidate, matched by SQL text.

        Falls back to a fixed message when the candidate has no matching
        validation result.
        """
        winning = next(
            (r for r in all_results if r.candidate.sql == candidate.sql), None
        )

        if not winning:
            logger.warning("SafetyReportAgent: Safety Report Not Available")
            return "Safety Report Not Available"

        lines = ["Security & Quality Report", " "*36]

        for check in sorted(winning.checks, key=self._check_sort_key):
            # passed + full score => pass; passed + reduced score => warning.
            if check.passed and check.score==1.0:
                icon = ICONS['passed']
            elif check.passed and check.score<1.0:
                icon = ICONS['warned']
            else:
                icon = ICONS["failed"]

            lines.append(
                f"{icon} {check.check_name.upper():12} {check.details or ''}"
            )

        lines.append(" "*36)

        lines.append(f"Total score: {winning.total_score:.1f} / 4.0")
        return "\n".join(lines)

    @staticmethod
    def _check_sort_key(check) -> int:
        """Position of a check in CHECK_ORDER; unknown names sort last.

        BUG FIX: the original called CHECK_ORDER.index() directly, which
        raises ValueError for any check name not in the list.
        """
        name = check.check_name.lower()
        return CHECK_ORDER.index(name) if name in CHECK_ORDER else len(CHECK_ORDER)
|