PyPI - groundedql - Versions diffs - 0.3.0__py3-none-any.whl - Mend

groundedql 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

groundedql/__init__.py +72 -0
groundedql/agent.py +233 -0
groundedql/api/__init__.py +4 -0
groundedql/api/api.py +183 -0
groundedql/api/spec_api.py +52 -0
groundedql/cli.py +508 -0
groundedql/compiler.py +1458 -0
groundedql/decompose.py +83 -0
groundedql/evidence_planner.py +5743 -0
groundedql/exceptions.py +94 -0
groundedql/executor.py +49 -0
groundedql/intent.py +234 -0
groundedql/intent_memory.py +340 -0
groundedql/intent_normalize.py +1639 -0
groundedql/intent_planner.py +1417 -0
groundedql/join_planner.py +255 -0
groundedql/llm_adapters.py +428 -0
groundedql/llm_client.py +42 -0
groundedql/llm_integration.py +44 -0
groundedql/plan_autofix.py +586 -0
groundedql/plan_canonical.py +181 -0
groundedql/planner.py +244 -0
groundedql/queryplan_models.py +147 -0
groundedql/read_sql_surface.py +96 -0
groundedql/schema_validator.py +228 -0
groundedql/semantic_lint.py +623 -0
groundedql/spec_builder.py +597 -0
groundedql/validation.py +223 -0
groundedql/value_index.py +290 -0
groundedql-0.3.0.dist-info/METADATA +178 -0
groundedql-0.3.0.dist-info/RECORD +36 -0
groundedql-0.3.0.dist-info/WHEEL +5 -0
groundedql-0.3.0.dist-info/entry_points.txt +2 -0
groundedql-0.3.0.dist-info/licenses/LICENSE +202 -0
groundedql-0.3.0.dist-info/licenses/NOTICE +4 -0
groundedql-0.3.0.dist-info/top_level.txt +1 -0

groundedql/__init__.py ADDED Viewed

@@ -0,0 +1,72 @@
+from importlib.metadata import version as _version, PackageNotFoundError
+# Resolve the package version from the installed distribution metadata.
+# When the "groundedql" distribution cannot be found (PackageNotFoundError),
+# fall back to a placeholder development version string.
+try:
+    __version__ = _version("groundedql")
+except PackageNotFoundError:
+    __version__ = "0.0.0-dev"
+from .api.api import execute_query_plan, validate_query_plan, load_and_validate_schema
+from .planner import QueryPlanPlanner
+from .validation import validate_query_plan_dict, ValidationErrorItem
+from .queryplan_models import CteDef, QueryPlan, queryplan_json_schema
+from .agent import QueryAgent
+from .semantic_lint import semantic_lint
+from .join_planner import auto_inject_joins, build_link_graph, shortest_join_path
+from .plan_canonical import canonicalize_query_plan, plan_fingerprint
+from .spec_builder import build_spec, write_spec
+from .llm_adapters import MistralChatJSONAdapter, OllamaChatJSONAdapter
+from .read_sql_surface import (
+    READ_SQL_SURFACE,
+    READ_SQL_SURFACE_VERSION,
+    read_sql_surface_capabilities,
+    read_sql_surface_summary_for_spec,
+)
+from .cli import introspect_database
+from .decompose import is_compound, split_compound, SubQuestion
+from .plan_autofix import autofix_plan
+from .exceptions import (
+    DSLCompilerError,
+    SchemaError,
+    QueryPlanError,
+    AmbiguousColumnError,
+    DatabaseExecutionError,
+    QueryCostError,
+)
+__all__ = [
+    "execute_query_plan",
+    "validate_query_plan",
+    "load_and_validate_schema",
+    "QueryPlanPlanner",
+    "validate_query_plan_dict",
+    "ValidationErrorItem",
+    "CteDef",
+    "QueryPlan",
+    "queryplan_json_schema",
+    "QueryAgent",
+    "semantic_lint",
+    "auto_inject_joins",
+    "build_link_graph",
+    "shortest_join_path",
+    "canonicalize_query_plan",
+    "plan_fingerprint",
+    "is_compound",
+    "split_compound",
+    "SubQuestion",
+    "autofix_plan",
+    "DSLCompilerError",
+    "SchemaError",
+    "QueryPlanError",
+    "AmbiguousColumnError",
+    "DatabaseExecutionError",
+    "QueryCostError",
+    "build_spec",
+    "write_spec",
+    "MistralChatJSONAdapter",
+    "OllamaChatJSONAdapter",
+    "READ_SQL_SURFACE",
+    "READ_SQL_SURFACE_VERSION",
+    "read_sql_surface_capabilities",
+    "read_sql_surface_summary_for_spec",
+    "introspect_database",
+]

groundedql/agent.py ADDED Viewed

@@ -0,0 +1,233 @@
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+import yaml
+from sqlalchemy.engine import Engine
+from .planner import QueryPlanPlanner
+from .intent_planner import IntentPlanner
+from .validation import validate_query_plan_dict
+from .api.api import execute_query_plan
+from .llm_adapters import make_llm_client
+from .semantic_lint import semantic_lint
+from .decompose import is_compound, split_compound, SubQuestion
+from .spec_builder import build_spec, write_spec
+from .value_index import build_value_index
+from .intent_memory import IntentMemory
+def _ensure_spec(schema_path: str, spec_path: Optional[str]) -> str:
+    """
+    Return the path of a usable spec file, generating it from the schema when needed.
+    Args:
+        schema_path: Path to schema.yaml, the source the spec is built from.
+        spec_path: Explicit spec location. When falsy, the spec lives next to the
+            schema as ``queryplan_spec_generated.yaml``.
+    Returns:
+        The spec path as a string (the given ``spec_path`` or the derived default).
+    """
+    if not spec_path:
+        # No explicit location: derive the default one beside the schema file.
+        target = Path(schema_path).parent / "queryplan_spec_generated.yaml"
+        spec_path = str(target)
+    else:
+        target = Path(spec_path)
+        # An explicitly supplied spec that exists is used as-is (no freshness check).
+        if target.exists():
+            return spec_path
+        print(
+            f"[GroundedQL] Spec file not found at {spec_path}, generating from schema...",
+            file=sys.stderr,
+        )
+    if target.exists():
+        # Reuse a previously generated spec only if it is at least as new as the schema.
+        schema_mtime = Path(schema_path).stat().st_mtime
+        is_fresh = target.stat().st_mtime >= schema_mtime
+        if is_fresh:
+            return spec_path
+        print(
+            "[GroundedQL] Spec is older than schema.yaml, regenerating...",
+            file=sys.stderr,
+        )
+    write_spec(build_spec(schema_path), spec_path)
+    return spec_path
+class QueryAgent:
+    """
+    Question-answering facade: turns a natural-language question into a
+    QueryPlan (via the intent pipeline or the legacy full-plan planner),
+    validates it, and executes it against the configured engine.
+    """
+    def __init__(
+        self,
+        *,
+        engine: Engine,
+        schema_path: str,
+        spec_path: Optional[str] = None,
+        llm: Any,
+        max_plan_retries: int = 2,
+        enforce_semantic_lint: bool = True,
+        use_intent_pipeline: bool = True,
+    ):
+        """
+        Args:
+            engine: SQLAlchemy engine used to build the value index and run queries.
+            schema_path: Path to your schema.yaml — the only required config file.
+            spec_path: Path to the LLM spec file. If omitted or missing, it is
+                auto-generated from schema.yaml at startup.
+            llm: LLM handle passed to :func:`make_llm_client`.
+            max_plan_retries: Extra LLM attempts after the first plan when structural
+                or semantic checks fail.
+            enforce_semantic_lint: If True (default), do not execute when
+                :func:`semantic_lint` still reports errors after retries.
+            use_intent_pipeline: If True (default), use the two-stage intent
+                extraction + deterministic plan builder instead of direct LLM
+                QueryPlan generation. Falls back to legacy pipeline on error.
+        """
+        self.engine = engine
+        self.schema_path = schema_path
+        self.spec_path = _ensure_spec(schema_path, spec_path)
+        self.max_plan_retries = max_plan_retries
+        self.enforce_semantic_lint = enforce_semantic_lint
+        self.use_intent_pipeline = use_intent_pipeline
+        llm_client = make_llm_client(llm)
+        self.planner = QueryPlanPlanner(
+            llm=llm_client,
+            schema_path=schema_path,
+            spec_path=self.spec_path,
+        )
+        # The value index is best-effort: any failure is reported on stderr and
+        # the agent continues with an empty index.
+        self.value_index = {}
+        try:
+            self.value_index = build_value_index(engine, schema_path)
+            print(
+                f"[GroundedQL] Value index built: "
+                f"{sum(len(cols) for cols in self.value_index.values())} columns indexed",
+                file=sys.stderr,
+            )
+        except Exception as exc:
+            print(f"[GroundedQL] Value index build failed ({exc}), continuing without it.", file=sys.stderr)
+        # Intent memory is persisted in a hidden directory next to the schema file.
+        memory_dir = str(Path(schema_path).parent / ".intent_memory")
+        self.intent_memory = IntentMemory(persist_directory=memory_dir)
+        self.intent_planner = IntentPlanner(
+            llm=llm_client,
+            schema_path=schema_path,
+            # An empty index is passed as None rather than as an empty dict.
+            value_index=self.value_index or None,
+            memory=self.intent_memory,
+        )
+    def ask(self, question: str) -> Dict[str, Any]:
+        """Answer a single question using the pipeline selected by ``use_intent_pipeline``."""
+        if self.use_intent_pipeline:
+            return self._ask_intent(question)
+        return self._ask_legacy(question)
+    def _ask_intent(self, question: str) -> Dict[str, Any]:
+        """
+        Two-stage pipeline: intent extraction → deterministic plan builder.
+        Falls back to :meth:`_ask_legacy` when planning raises or when the
+        resulting plan fails validation.
+        """
+        try:
+            plan_dict = self.intent_planner.plan(question)
+        except Exception as exc:
+            print(
+                f"[GroundedQL] Intent pipeline failed ({exc}), falling back to legacy.",
+                file=sys.stderr,
+            )
+            return self._ask_legacy(question)
+        # Only the error list is needed here; the parsed model is discarded.
+        _, errors = validate_query_plan_dict(plan_dict, self.schema_path)
+        if errors:
+            print(
+                "[GroundedQL] Intent plan failed validation, falling back to legacy.",
+                file=sys.stderr,
+            )
+            return self._ask_legacy(question)
+        # NOTE(review): this path executes without running semantic_lint, even when
+        # enforce_semantic_lint is True — confirm that this is intended.
+        return execute_query_plan(
+            engine=self.engine,
+            schema_path=self.schema_path,
+            query_plan=plan_dict,
+        )
+    def _ask_legacy(self, question: str) -> Dict[str, Any]:
+        """
+        Original full-plan LLM generation with retries + autofix.
+        Returns an ``{"error": ...}`` dict (without executing) when the plan
+        fails validation, or fails semantic lint while ``enforce_semantic_lint``
+        is set; otherwise returns the result of :func:`execute_query_plan`.
+        """
+        plan_dict = self.planner.plan_with_retry(question, max_retries=self.max_plan_retries)
+        # Only the error list is needed here; the parsed model is discarded.
+        _, errors = validate_query_plan_dict(plan_dict, self.schema_path)
+        if errors:
+            return {
+                "error": {
+                    "message": "QueryPlan failed validation after retries.",
+                    "validation_errors": [{"path": e.path, "message": e.message} for e in errors],
+                    "plan": plan_dict,
+                }
+            }
+        schema_data = yaml.safe_load(
+            Path(self.schema_path).read_text(encoding="utf-8")
+        ) or {}
+        # The planner's "meta" entry is excluded from the body handed to the linter.
+        plan_body = {k: v for k, v in plan_dict.items() if k != "meta"}
+        if self.enforce_semantic_lint:
+            lint_errs = semantic_lint(question, plan_body, schema_data)
+            if lint_errs:
+                return {
+                    "error": {
+                        "message": "QueryPlan failed semantic lint after retries — plan does not match the question.",
+                        "lint_errors": lint_errs,
+                        "plan": plan_dict,
+                    }
+                }
+        return execute_query_plan(
+            engine=self.engine,
+            schema_path=self.schema_path,
+            query_plan=plan_dict,
+        )
+    def ask_compound(self, question: str) -> Dict[str, Any]:
+        """
+        Smart entry point that handles compound questions automatically.
+        If the question asks for multiple deliverables (e.g. a count AND a
+        ranked list with detail columns), it splits the question into focused
+        sub-questions, runs each through :meth:`ask`, and merges the results.
+        For simple questions it delegates to :meth:`ask` directly.
+        Returns a dict with:
+            - ``"compound": False, ...`` for simple questions (same as ``ask()``)
+            - ``"compound": True, "parts": [...]`` for compound questions, where
+              each part is ``{"role": str, "question": str, "result": dict}``
+            - when every sub-question fails, the first failing part's result
+              dict (``{"error": ...}``) is returned as-is, without a
+              ``"compound"`` key
+        """
+        if not is_compound(question):
+            result = self.ask(question)
+            result["compound"] = False
+            return result
+        subs = split_compound(question)
+        print(
+            f"[GroundedQL] Compound question detected — splitting into {len(subs)} sub-questions.",
+            file=sys.stderr,
+        )
+        parts: List[Dict[str, Any]] = []
+        has_success = False
+        for sq in subs:
+            print(f"[GroundedQL]   {sq.role}: {sq.text!r}", file=sys.stderr)
+            try:
+                r = self.ask(sq.text)
+                is_error = isinstance(r, dict) and bool(r.get("error"))
+                if not is_error:
+                    has_success = True
+                parts.append({
+                    "role": sq.role,
+                    "question": sq.text,
+                    "result": r,
+                })
+            except Exception as exc:
+                # A failing sub-question must not abort the others; record it as an error part.
+                print(f"[GroundedQL]   {sq.role} failed: {exc}", file=sys.stderr)
+                parts.append({
+                    "role": sq.role,
+                    "question": sq.text,
+                    "result": {"error": {"message": str(exc)}},
+                })
+        if not has_success:
+            first_err = next(
+                (p["result"] for p in parts if p["result"].get("error")),
+                {"error": {"message": "All sub-questions failed."}},
+            )
+            return first_err
+        return {"compound": True, "parts": parts}

groundedql/api/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .api import execute_query_plan
+from .spec_api import get_queryplan_instructions, get_queryplan_spec
+__all__ = ["execute_query_plan", "get_queryplan_instructions", "get_queryplan_spec"]

groundedql/api/api.py ADDED Viewed

@@ -0,0 +1,183 @@
+from __future__ import annotations
+import datetime
+from typing import Any, Dict, List, Optional
+import yaml
+from sqlalchemy.engine import Engine
+from ..compiler import Compiler
+from ..executor import Executor
+from ..exceptions import QueryPlanError, DatabaseExecutionError, SchemaError
+from ..join_planner import auto_inject_joins
+from ..plan_canonical import canonicalize_query_plan
+from ..schema_validator import validate_schema
+from ..validation import validate_query_plan_dict
+def _resolve_relative_dates(plan: Any) -> Any:
+    """
+    Recursively walk the plan and replace relative date sentinels with
+    concrete ISO-8601 strings computed from the current UTC time.
+    A sentinel is a dict containing the key "$relative_date" whose value is a
+    mapping with an "op" entry. Supported ops:
+      {"$relative_date": {"op": "now_minus_days", "days": 7}}
+      -> UTC timestamp N days before now, e.g. "2024-01-15T10:30:00+00:00"
+      {"$relative_date": {"op": "now_minus_hours", "hours": 6}}
+      -> UTC timestamp N hours before now
+      {"$relative_date": {"op": "today"}}
+      -> the current UTC calendar date, e.g. "2024-01-15"
+      {"$relative_date": {"op": "calendar_year_start", "year_offset": -1}}
+      -> UTC midnight at the start of the calendar year relative to "now":
+         year_offset 0 = Jan 1 of the current UTC year;
+         year_offset -1 = Jan 1 of the previous UTC year; etc.
+      Use two filters for "last calendar year":
+        field >= calendar_year_start(-1) AND field < calendar_year_start(0)
+    Missing "days" / "hours" / "year_offset" default to 0. A sentinel with an
+    unknown op, or whose payload is not a mapping, is returned unchanged.
+    This allows the LLM to express date-relative intent without generating
+    SQL expressions as string values (which fail bindparam type checking).
+    Args:
+        plan: Any plan fragment (dict, list, or scalar).
+    Returns:
+        A new structure of the same shape with sentinels replaced; scalars
+        are returned as-is.
+    """
+    if isinstance(plan, dict):
+        # Resolve relative date sentinel at this node
+        if "$relative_date" in plan:
+            spec = plan["$relative_date"]
+            # A malformed payload (string, null, list, ...) is left untouched
+            # instead of raising AttributeError on .get() here.
+            if not isinstance(spec, dict):
+                return plan
+            op = spec.get("op")
+            now = datetime.datetime.now(datetime.timezone.utc)
+            if op == "now_minus_days":
+                days = int(spec.get("days", 0))
+                return (now - datetime.timedelta(days=days)).isoformat()
+            if op == "now_minus_hours":
+                hours = int(spec.get("hours", 0))
+                return (now - datetime.timedelta(hours=hours)).isoformat()
+            if op == "today":
+                # Use the UTC date so "today" agrees with the other ops, which
+                # are all UTC-based, regardless of the host's local timezone.
+                return now.date().isoformat()
+            if op == "calendar_year_start":
+                target_year = now.year + int(spec.get("year_offset", 0))
+                start = datetime.datetime(
+                    target_year, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
+                )
+                return start.isoformat()
+            # Unknown op — leave as-is so validation catches it
+            return plan
+        return {k: _resolve_relative_dates(v) for k, v in plan.items()}
+    if isinstance(plan, list):
+        return [_resolve_relative_dates(item) for item in plan]
+    return plan
+def load_and_validate_schema(schema_path: str) -> Dict[str, Any]:
+    """
+    Load schema.yaml and run load-time validation.
+    Prints any non-fatal warnings. Raises SchemaError on fatal issues.
+    Args:
+        schema_path: Path to the schema YAML file.
+    Returns:
+        The parsed schema; an empty YAML document yields an empty dict.
+    Raises:
+        SchemaError: If the file cannot be opened or parsed.
+    """
+    try:
+        # Read as UTF-8 explicitly so parsing does not depend on the platform's
+        # locale encoding.
+        with open(schema_path, "r", encoding="utf-8") as f:
+            schema = yaml.safe_load(f) or {}
+    except Exception as e:
+        raise SchemaError(f"Failed to load schema from '{schema_path}': {e}") from e
+    # validate_schema returns the non-fatal warnings; each is echoed to stdout.
+    warnings = validate_schema(schema)
+    for w in warnings:
+        print(f"[GroundedQL schema] {w}")
+    return schema
+def validate_query_plan(
+    query_plan: Dict[str, Any],
+    schema_path: str,
+) -> List[str]:
+    """
+    Validate a QueryPlan without executing it.
+    Returns a list of error strings. Empty list = valid.
+    Does NOT require a database connection.
+    Args:
+        query_plan: The QueryPlan dict to validate.
+        schema_path: Path to schema.yaml.
+    Returns:
+        One "<path>: <message>" string per validation error.
+    Raises:
+        SchemaError: If the schema file cannot be loaded.
+    Example:
+        errors = validate_query_plan(plan, "config/schema.yaml")
+        if errors:
+            print("Plan is invalid:", errors)
+    """
+    # Called only for its load-time checks (warnings / SchemaError); the plan
+    # validator below takes the schema path rather than the loaded schema.
+    load_and_validate_schema(schema_path)
+    # Strip meta before validation
+    clean_plan = {k: v for k, v in query_plan.items() if k != "meta"}
+    clean_plan = _resolve_relative_dates(clean_plan)
+    _, errors = validate_query_plan_dict(clean_plan, schema_path)
+    return [f"{e.path}: {e.message}" for e in errors]
+def execute_query_plan(
+    *,
+    engine: Engine,
+    schema_path: str,
+    query_plan: Dict[str, Any],
+    raise_on_error: bool = False,
+    statement_timeout_ms: int = 30_000,
+) -> Dict[str, Any]:
+    """
+    Turn a QueryPlan into SQL and run it.
+    Pipeline: load the schema, drop the plan's "meta" entry, resolve relative
+    dates, inject joins, canonicalize, compile to SQL, execute.
+    Args:
+        engine: SQLAlchemy engine to execute against.
+        schema_path: Location of schema.yaml.
+        query_plan: QueryPlan dict, LLM-produced or hand-written.
+        raise_on_error: When True, failures are raised as typed exceptions
+                        rather than returned as {"error": ...}. Defaults to
+                        False for backward compatibility.
+        statement_timeout_ms: Statement timeout per query, in milliseconds
+                              (default 30000, i.e. 30 seconds).
+    Returns:
+        The executor's result dict (rows, row_count, columns) extended with
+        "sql", "params" and, when the plan carried one, "meta".
+        With raise_on_error=False a failure yields {"error": {"message": ...}}.
+    """
+    try:
+        schema = load_and_validate_schema(schema_path)
+        # Set the planner's meta aside first; it is re-attached to the result.
+        planner_meta = query_plan.get("meta")
+        plan = {key: value for key, value in query_plan.items() if key != "meta"}
+        plan = _resolve_relative_dates(plan)
+        plan = auto_inject_joins(plan, schema)
+        plan = canonicalize_query_plan(plan)
+        sql, params = Compiler(schema).compile(plan)
+        runner = Executor(engine, statement_timeout_ms=statement_timeout_ms)
+        outcome = runner.execute(sql, params)
+        if "error" in outcome:
+            # Promote an executor-reported failure to a typed exception.
+            raise DatabaseExecutionError(outcome["error"]["message"], sql=sql)
+        outcome["sql"] = sql
+        outcome["params"] = params
+        if planner_meta is not None:
+            outcome["meta"] = planner_meta
+        return outcome
+    except (QueryPlanError, DatabaseExecutionError, SchemaError):
+        if raise_on_error:
+            raise
+        # Known library errors are reported with a short formatted traceback.
+        import traceback
+        return {"error": {"message": traceback.format_exc(limit=3)}}
+    except Exception as exc:
+        if not raise_on_error:
+            return {"error": {"message": str(exc)}}
+        # Any other failure is surfaced as a DatabaseExecutionError.
+        raise DatabaseExecutionError(str(exc)) from exc

groundedql/api/spec_api.py ADDED Viewed

@@ -0,0 +1,52 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Any, Dict, Optional
+import yaml
+def load_yaml(path: str | Path) -> Dict[str, Any]:
+    """
+    Read a YAML file and return its top-level mapping.
+    Args:
+        path: Location of the YAML file.
+    Returns:
+        The parsed mapping; an empty document yields an empty dict.
+    Raises:
+        ValueError: If the document's top level is not a mapping.
+    """
+    p = Path(path)
+    # Decode as UTF-8 explicitly so the result does not depend on the
+    # platform's locale encoding.
+    data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
+    if not isinstance(data, dict):
+        raise ValueError(f"YAML at {p} must be a mapping/object.")
+    return data
+def get_queryplan_spec(*, spec_path: str | Path = "config/queryplan_spec.yaml") -> Dict[str, Any]:
+    """
+    Load the QueryPlan spec file and hand back its raw dict, for
+    programmatic consumers.
+    """
+    spec = load_yaml(spec_path)
+    return spec
+def get_queryplan_instructions(
+    *,
+    schema_path: str | Path,
+    spec_path: str | Path = "config/queryplan_spec.yaml",
+    include_schema_yaml: bool = True,
+) -> str:
+    """
+    Returns a ready-to-use prompt string for an LLM:
+    - high-level instructions from queryplan_spec.yaml
+    - optionally appends the DB schema YAML so the model knows allowed tables/columns
+    Args:
+        schema_path: Path to the DB schema YAML; only read when
+            include_schema_yaml is True.
+        spec_path: Path to the QueryPlan spec YAML.
+        include_schema_yaml: Whether to append the raw schema file text.
+    Returns:
+        The prompt sections joined with newlines and stripped of surrounding
+        whitespace.
+    """
+    spec = load_yaml(spec_path)
+    # "system_instructions" may be absent or explicitly null in the spec;
+    # treat both as empty text instead of failing on None.strip().
+    instructions = spec.get("system_instructions") or ""
+    parts: list[str] = [
+        # Core instructions
+        str(instructions).strip(),
+        "\n---\n",
+        "QUERYPLAN SPEC (authoring rules):\n",
+        yaml.safe_dump(spec, sort_keys=False),
+    ]
+    if include_schema_yaml:
+        # Decode as UTF-8 explicitly rather than with the locale encoding.
+        schema = Path(schema_path).read_text(encoding="utf-8")
+        parts.append("\n---\n")
+        parts.append("DB SCHEMA (logical names to use):\n")
+        parts.append(schema)
+    return "\n".join(parts).strip()