groundedql 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
groundedql/__init__.py ADDED
@@ -0,0 +1,72 @@
1
+ from importlib.metadata import version as _version, PackageNotFoundError
2
+
3
+ try:
4
+ __version__ = _version("groundedql")
5
+ except PackageNotFoundError:
6
+ __version__ = "0.0.0-dev"
7
+
8
+ from .api.api import execute_query_plan, validate_query_plan, load_and_validate_schema
9
+ from .planner import QueryPlanPlanner
10
+ from .validation import validate_query_plan_dict, ValidationErrorItem
11
+ from .queryplan_models import CteDef, QueryPlan, queryplan_json_schema
12
+ from .agent import QueryAgent
13
+ from .semantic_lint import semantic_lint
14
+ from .join_planner import auto_inject_joins, build_link_graph, shortest_join_path
15
+ from .plan_canonical import canonicalize_query_plan, plan_fingerprint
16
+ from .spec_builder import build_spec, write_spec
17
+ from .llm_adapters import MistralChatJSONAdapter, OllamaChatJSONAdapter
18
+ from .read_sql_surface import (
19
+ READ_SQL_SURFACE,
20
+ READ_SQL_SURFACE_VERSION,
21
+ read_sql_surface_capabilities,
22
+ read_sql_surface_summary_for_spec,
23
+ )
24
+ from .cli import introspect_database
25
+ from .decompose import is_compound, split_compound, SubQuestion
26
+ from .plan_autofix import autofix_plan
27
+ from .exceptions import (
28
+ DSLCompilerError,
29
+ SchemaError,
30
+ QueryPlanError,
31
+ AmbiguousColumnError,
32
+ DatabaseExecutionError,
33
+ QueryCostError,
34
+ )
35
+
36
+ __all__ = [
37
+ "execute_query_plan",
38
+ "validate_query_plan",
39
+ "load_and_validate_schema",
40
+ "QueryPlanPlanner",
41
+ "validate_query_plan_dict",
42
+ "ValidationErrorItem",
43
+ "CteDef",
44
+ "QueryPlan",
45
+ "queryplan_json_schema",
46
+ "QueryAgent",
47
+ "semantic_lint",
48
+ "auto_inject_joins",
49
+ "build_link_graph",
50
+ "shortest_join_path",
51
+ "canonicalize_query_plan",
52
+ "plan_fingerprint",
53
+ "is_compound",
54
+ "split_compound",
55
+ "SubQuestion",
56
+ "autofix_plan",
57
+ "DSLCompilerError",
58
+ "SchemaError",
59
+ "QueryPlanError",
60
+ "AmbiguousColumnError",
61
+ "DatabaseExecutionError",
62
+ "QueryCostError",
63
+ "build_spec",
64
+ "write_spec",
65
+ "MistralChatJSONAdapter",
66
+ "OllamaChatJSONAdapter",
67
+ "READ_SQL_SURFACE",
68
+ "READ_SQL_SURFACE_VERSION",
69
+ "read_sql_surface_capabilities",
70
+ "read_sql_surface_summary_for_spec",
71
+ "introspect_database",
72
+ ]
groundedql/agent.py ADDED
@@ -0,0 +1,233 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import yaml
8
+ from sqlalchemy.engine import Engine
9
+
10
+ from .planner import QueryPlanPlanner
11
+ from .intent_planner import IntentPlanner
12
+ from .validation import validate_query_plan_dict
13
+ from .api.api import execute_query_plan
14
+ from .llm_adapters import make_llm_client
15
+ from .semantic_lint import semantic_lint
16
+ from .decompose import is_compound, split_compound, SubQuestion
17
+ from .spec_builder import build_spec, write_spec
18
+ from .value_index import build_value_index
19
+ from .intent_memory import IntentMemory
20
+
21
+
22
+ def _ensure_spec(schema_path: str, spec_path: Optional[str]) -> str:
23
+ """Auto-generate the spec file from schema.yaml if not provided or missing."""
24
+ if spec_path:
25
+ p = Path(spec_path)
26
+ if p.exists():
27
+ return spec_path
28
+ print(
29
+ f"[GroundedQL] Spec file not found at {spec_path}, generating from schema...",
30
+ file=sys.stderr,
31
+ )
32
+ else:
33
+ p = Path(schema_path).parent / "queryplan_spec_generated.yaml"
34
+ spec_path = str(p)
35
+
36
+ if p.exists():
37
+ schema_mtime = Path(schema_path).stat().st_mtime
38
+ spec_mtime = p.stat().st_mtime
39
+ if spec_mtime >= schema_mtime:
40
+ return spec_path
41
+ print(
42
+ "[GroundedQL] Spec is older than schema.yaml, regenerating...",
43
+ file=sys.stderr,
44
+ )
45
+
46
+ spec = build_spec(schema_path)
47
+ write_spec(spec, spec_path)
48
+ return spec_path
49
+
50
+
51
class QueryAgent:
    """Plan a QueryPlan for a natural-language question and execute it.

    Two planning pipelines are wired up at construction time: the intent
    pipeline (``IntentPlanner``) and the legacy full-plan pipeline
    (``QueryPlanPlanner``). All diagnostics are printed to stderr.
    """

    def __init__(
        self,
        *,
        engine: Engine,
        schema_path: str,
        spec_path: Optional[str] = None,
        llm: Any,
        max_plan_retries: int = 2,
        enforce_semantic_lint: bool = True,
        use_intent_pipeline: bool = True,
    ):
        """
        Args:
            engine: SQLAlchemy engine, used to build the value index and to
                execute plans.
            schema_path: Path to your schema.yaml — the only required config file.
            spec_path: Path to the LLM spec file. If omitted or missing, it is
                auto-generated from schema.yaml at startup.
            llm: LLM handle; it is passed through ``make_llm_client`` and the
                resulting client is shared by both planners.
            max_plan_retries: ``max_retries`` handed to the legacy planner's
                ``plan_with_retry``.
            enforce_semantic_lint: If True (default), the legacy pipeline does
                not execute a plan for which :func:`semantic_lint` reports
                errors. The intent pipeline does not run this check.
            use_intent_pipeline: If True (default), use the two-stage intent
                extraction + deterministic plan builder instead of direct LLM
                QueryPlan generation. Falls back to legacy pipeline on error.
        """
        self.engine = engine
        self.schema_path = schema_path
        self.spec_path = _ensure_spec(schema_path, spec_path)
        self.max_plan_retries = max_plan_retries
        self.enforce_semantic_lint = enforce_semantic_lint
        self.use_intent_pipeline = use_intent_pipeline

        llm_client = make_llm_client(llm)
        self.planner = QueryPlanPlanner(
            llm=llm_client,
            schema_path=schema_path,
            spec_path=self.spec_path,
        )

        # The value index is best-effort: any failure is reported and the
        # agent continues without it.
        self.value_index = {}
        try:
            self.value_index = build_value_index(engine, schema_path)
            indexed = sum(len(cols) for cols in self.value_index.values())
            print(
                f"[GroundedQL] Value index built: {indexed} columns indexed",
                file=sys.stderr,
            )
        except Exception as exc:
            print(
                f"[GroundedQL] Value index build failed ({exc}), continuing without it.",
                file=sys.stderr,
            )

        # Intent memory is persisted in a directory next to the schema file.
        memory_dir = str(Path(schema_path).parent / ".intent_memory")
        self.intent_memory = IntentMemory(persist_directory=memory_dir)

        self.intent_planner = IntentPlanner(
            llm=llm_client,
            schema_path=schema_path,
            # An empty index is passed as None rather than as an empty dict.
            value_index=self.value_index or None,
            memory=self.intent_memory,
        )

    def ask(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` with the pipeline selected at construction."""
        if self.use_intent_pipeline:
            return self._ask_intent(question)
        return self._ask_legacy(question)

    def _ask_intent(self, question: str) -> Dict[str, Any]:
        """Two-stage pipeline: intent extraction → deterministic plan builder.

        Falls back to :meth:`_ask_legacy` when planning raises or when the
        resulting plan fails structural validation.
        """
        try:
            plan_dict = self.intent_planner.plan(question)
        except Exception as exc:
            print(
                f"[GroundedQL] Intent pipeline failed ({exc}), falling back to legacy.",
                file=sys.stderr,
            )
            return self._ask_legacy(question)

        _, errors = validate_query_plan_dict(plan_dict, self.schema_path)
        if errors:
            print(
                "[GroundedQL] Intent plan failed validation, falling back to legacy.",
                file=sys.stderr,
            )
            return self._ask_legacy(question)

        return execute_query_plan(
            engine=self.engine,
            schema_path=self.schema_path,
            query_plan=plan_dict,
        )

    def _ask_legacy(self, question: str) -> Dict[str, Any]:
        """Original full-plan LLM generation with retries.

        Returns the execution result, or an ``{"error": {...}}`` dict carrying
        the offending plan when validation or semantic lint fails.
        """
        plan_dict = self.planner.plan_with_retry(question, max_retries=self.max_plan_retries)

        _, errors = validate_query_plan_dict(plan_dict, self.schema_path)
        if errors:
            return {
                "error": {
                    "message": "QueryPlan failed validation after retries.",
                    "validation_errors": [{"path": e.path, "message": e.message} for e in errors],
                    "plan": plan_dict,
                }
            }

        if self.enforce_semantic_lint:
            # The schema is only needed by the linter, so it is loaded only
            # when linting is enabled.
            schema_data = yaml.safe_load(
                Path(self.schema_path).read_text(encoding="utf-8")
            ) or {}
            # "meta" is excluded from the plan handed to the linter.
            plan_body = {k: v for k, v in plan_dict.items() if k != "meta"}
            lint_errs = semantic_lint(question, plan_body, schema_data)
            if lint_errs:
                return {
                    "error": {
                        "message": "QueryPlan failed semantic lint after retries — plan does not match the question.",
                        "lint_errors": lint_errs,
                        "plan": plan_dict,
                    }
                }

        return execute_query_plan(
            engine=self.engine,
            schema_path=self.schema_path,
            query_plan=plan_dict,
        )

    def ask_compound(self, question: str) -> Dict[str, Any]:
        """
        Smart entry point that handles compound questions automatically.

        If ``is_compound`` flags the question, it is split with
        ``split_compound`` into sub-questions, each is run through
        :meth:`ask`, and the results are collected.

        For simple questions it delegates to :meth:`ask` directly.

        Returns a dict with:
          - ``"compound": False, ...`` for simple questions (same as ``ask()``)
          - ``"compound": True, "parts": [...]`` for compound questions, where
            each part is ``{"role": str, "question": str, "result": dict}``
          - when every sub-question fails, the first failure's ``"error"``
            at the top level in addition to ``"compound": True`` and ``"parts"``
        """
        if not is_compound(question):
            result = self.ask(question)
            result["compound"] = False
            return result

        subs = split_compound(question)
        print(
            f"[GroundedQL] Compound question detected — splitting into {len(subs)} sub-questions.",
            file=sys.stderr,
        )

        parts: List[Dict[str, Any]] = []
        has_success = False

        for sq in subs:
            print(f"[GroundedQL] {sq.role}: {sq.text!r}", file=sys.stderr)
            try:
                r = self.ask(sq.text)
            except Exception as exc:
                # One failing sub-question must not abort the others.
                print(f"[GroundedQL] {sq.role} failed: {exc}", file=sys.stderr)
                r = {"error": {"message": str(exc)}}
            else:
                if not (isinstance(r, dict) and r.get("error")):
                    has_success = True
            parts.append({
                "role": sq.role,
                "question": sq.text,
                "result": r,
            })

        if not has_success:
            first_err = next(
                (p["result"] for p in parts if p["result"].get("error")),
                {"error": {"message": "All sub-questions failed."}},
            )
            # Keep the top-level "error" callers already look for, but also
            # honour the documented compound shape so "compound"/"parts" are
            # always present for a compound question.
            return {**first_err, "compound": True, "parts": parts}

        return {"compound": True, "parts": parts}
@@ -0,0 +1,4 @@
1
+ from .api import execute_query_plan
2
+ from .spec_api import get_queryplan_instructions, get_queryplan_spec
3
+
4
+ __all__ = ["execute_query_plan", "get_queryplan_instructions", "get_queryplan_spec"]
groundedql/api/api.py ADDED
@@ -0,0 +1,183 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import yaml
7
+ from sqlalchemy.engine import Engine
8
+
9
+ from ..compiler import Compiler
10
+ from ..executor import Executor
11
+ from ..exceptions import QueryPlanError, DatabaseExecutionError, SchemaError
12
+ from ..join_planner import auto_inject_joins
13
+ from ..plan_canonical import canonicalize_query_plan
14
+ from ..schema_validator import validate_schema
15
+ from ..validation import validate_query_plan_dict
16
+
17
+
18
+ def _resolve_relative_dates(plan: Any) -> Any:
19
+ """
20
+ Recursively walk the plan and replace relative date sentinels with
21
+ concrete ISO-8601 UTC timestamps.
22
+
23
+ Supported value shapes in filter/cmp nodes:
24
+ {"$relative_date": {"op": "now_minus_days", "days": 7}}
25
+ -> replaced with "2024-01-15T10:30:00+00:00" (UTC ISO string)
26
+
27
+ {"$relative_date": {"op": "calendar_year_start", "year_offset": -1}}
28
+ -> UTC midnight at the start of the calendar year relative to "now":
29
+ year_offset 0 = Jan 1 of the current UTC year;
30
+ year_offset -1 = Jan 1 of the previous UTC year; etc.
31
+ Use two filters for "last calendar year":
32
+ field >= calendar_year_start(-1) AND field < calendar_year_start(0)
33
+
34
+ This allows the LLM to express date-relative intent without generating
35
+ SQL expressions as string values (which fail bindparam type checking).
36
+ """
37
+ if isinstance(plan, dict):
38
+ # Resolve relative date sentinel at this node
39
+ if "$relative_date" in plan:
40
+ spec = plan["$relative_date"]
41
+ op = spec.get("op")
42
+ if op == "now_minus_days":
43
+ days = int(spec.get("days", 0))
44
+ dt = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
45
+ return dt.isoformat()
46
+ if op == "now_minus_hours":
47
+ hours = int(spec.get("hours", 0))
48
+ dt = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=hours)
49
+ return dt.isoformat()
50
+ if op == "today":
51
+ return datetime.date.today().isoformat()
52
+ if op == "calendar_year_start":
53
+ now = datetime.datetime.now(datetime.timezone.utc)
54
+ current_year = now.year
55
+ off = int(spec.get("year_offset", 0))
56
+ target_year = current_year + off
57
+ start = datetime.datetime(
58
+ target_year, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
59
+ )
60
+ return start.isoformat()
61
+ # Unknown op — leave as-is so validation catches it
62
+ return plan
63
+
64
+ return {k: _resolve_relative_dates(v) for k, v in plan.items()}
65
+
66
+ if isinstance(plan, list):
67
+ return [_resolve_relative_dates(item) for item in plan]
68
+
69
+ return plan
70
+
71
+
72
def load_and_validate_schema(schema_path: str) -> Dict[str, Any]:
    """
    Load schema.yaml and run load-time validation.

    Warnings returned by ``validate_schema`` are printed, one per line,
    prefixed with ``[GroundedQL schema]``.

    Args:
        schema_path: Path to the schema YAML file.

    Returns:
        The parsed schema mapping (an empty file yields ``{}``).

    Raises:
        SchemaError: If the file cannot be read or parsed, or if its top-level
            YAML value is not a mapping. ``validate_schema`` is expected to
            raise it for fatal schema issues as well (its body is not visible
            here).
    """
    try:
        # Explicit UTF-8 so parsing does not depend on the platform locale.
        with open(schema_path, "r", encoding="utf-8") as f:
            schema = yaml.safe_load(f) or {}
    except Exception as e:
        raise SchemaError(f"Failed to load schema from '{schema_path}': {e}") from e

    if not isinstance(schema, dict):
        # A list or scalar document cannot satisfy the Dict return contract.
        raise SchemaError(
            f"Schema at '{schema_path}' must be a YAML mapping, "
            f"got {type(schema).__name__}."
        )

    for w in validate_schema(schema):
        print(f"[GroundedQL schema] {w}")

    return schema
88
+
89
+
90
def validate_query_plan(
    query_plan: Dict[str, Any],
    schema_path: str,
) -> List[str]:
    """
    Validate a QueryPlan without executing it.

    Returns a list of error strings. Empty list = valid.
    Does NOT require a database connection.

    Args:
        query_plan: The QueryPlan dict to validate. A top-level ``"meta"``
            key is ignored.
        schema_path: Path to schema.yaml.

    Returns:
        One ``"<path>: <message>"`` string per validation error.

    Raises:
        SchemaError: Propagated from ``load_and_validate_schema`` when the
            schema itself cannot be loaded or is invalid.

    Example:
        errors = validate_query_plan(plan, "config/schema.yaml")
        if errors:
            print("Plan is invalid:", errors)
    """
    # Called only for its checks (and warnings); the plan validator below
    # takes the schema path itself, so the parsed schema is not needed here.
    load_and_validate_schema(schema_path)

    # Strip meta and resolve relative-date sentinels before validation.
    plan_body = {k: v for k, v in query_plan.items() if k != "meta"}
    plan_body = _resolve_relative_dates(plan_body)

    _, errors = validate_query_plan_dict(plan_body, schema_path)
    return [f"{e.path}: {e.message}" for e in errors]
117
+
118
+
119
def execute_query_plan(
    *,
    engine: Engine,
    schema_path: str,
    query_plan: Dict[str, Any],
    raise_on_error: bool = False,
    statement_timeout_ms: int = 30_000,
) -> Dict[str, Any]:
    """
    Compile a QueryPlan to SQL and run it.

    Pipeline: load + validate the schema, drop ``meta``, resolve relative
    dates, auto-inject joins, canonicalize, compile, execute.

    Args:
        engine: SQLAlchemy engine.
        schema_path: Path to schema.yaml.
        query_plan: The QueryPlan dict (from LLM or hand-written).
        raise_on_error: If True, raises typed exceptions instead of returning
            {"error": ...}. Default False for backward compatibility.
        statement_timeout_ms: Per-query statement timeout in milliseconds,
            forwarded to the executor. Default 30000 (30 seconds).

    Returns:
        The executor's result dict with ``sql`` and ``params`` added, plus
        ``meta`` when the incoming plan carried one.
        On failure (raise_on_error=False): {"error": {"message": ...}}

    Raises:
        QueryPlanError, DatabaseExecutionError, SchemaError: Re-raised
            unchanged when ``raise_on_error`` is True.
        DatabaseExecutionError: Wraps any other exception when
            ``raise_on_error`` is True.
    """
    try:
        schema = load_and_validate_schema(schema_path)

        # Set the planner's meta aside first; it is re-attached to the result.
        planner_meta = query_plan.get("meta")
        plan = {key: value for key, value in query_plan.items() if key != "meta"}

        plan = _resolve_relative_dates(plan)
        plan = auto_inject_joins(plan, schema)
        plan = canonicalize_query_plan(plan)

        sql, params = Compiler(schema).compile(plan)

        outcome = Executor(
            engine, statement_timeout_ms=statement_timeout_ms
        ).execute(sql, params)

        if "error" in outcome:
            # Promote an executor-reported error to a typed exception so it
            # goes through the same handling as every other failure.
            raise DatabaseExecutionError(outcome["error"]["message"], sql=sql)

        outcome["sql"] = sql
        outcome["params"] = params
        if planner_meta is not None:
            outcome["meta"] = planner_meta
        return outcome

    except (QueryPlanError, DatabaseExecutionError, SchemaError):
        if not raise_on_error:
            import traceback
            return {"error": {"message": traceback.format_exc(limit=3)}}
        raise
    except Exception as e:
        if not raise_on_error:
            return {"error": {"message": str(e)}}
        raise DatabaseExecutionError(str(e)) from e
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Optional
5
+ import yaml
6
+
7
+
8
def load_yaml(path: str | Path) -> Dict[str, Any]:
    """Load a YAML file whose top-level value must be a mapping.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed mapping; an empty document yields ``{}``.

    Raises:
        ValueError: If the document's top-level value is not a mapping.
    """
    p = Path(path)
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"YAML at {p} must be a mapping/object.")
    return data
14
+
15
+
16
def get_queryplan_spec(
    *,
    spec_path: str | Path = "config/queryplan_spec.yaml",
) -> Dict[str, Any]:
    """
    Load the QueryPlan spec file and return it as a plain dict, for callers
    that want to inspect the spec programmatically.
    """
    spec = load_yaml(spec_path)
    return spec
24
+
25
+
26
def get_queryplan_instructions(
    *,
    schema_path: str | Path,
    spec_path: str | Path = "config/queryplan_spec.yaml",
    include_schema_yaml: bool = True,
) -> str:
    """
    Returns a ready-to-use prompt string for an LLM:
    - the spec's ``system_instructions`` text
    - the full spec dumped as YAML
    - optionally the DB schema YAML so the model knows allowed tables/columns

    Args:
        schema_path: Path to the schema YAML; only read when
            ``include_schema_yaml`` is True.
        spec_path: Path to the spec YAML.
        include_schema_yaml: Whether to append the raw schema file text.

    Returns:
        The sections joined with newlines, stripped of surrounding whitespace.
    """
    spec = load_yaml(spec_path)

    # A spec with ``system_instructions: null`` (or a non-string value) must
    # not crash the prompt builder; a missing or null value becomes "".
    raw_instructions = spec.get("system_instructions")
    instructions = "" if raw_instructions is None else str(raw_instructions).strip()

    parts: list[str] = [
        instructions,
        "\n---\n",
        "QUERYPLAN SPEC (authoring rules):\n",
        yaml.safe_dump(spec, sort_keys=False),
    ]

    if include_schema_yaml:
        # Read as UTF-8 explicitly so the result does not depend on the locale.
        schema = Path(schema_path).read_text(encoding="utf-8")
        parts.extend(["\n---\n", "DB SCHEMA (logical names to use):\n", schema])

    return "\n".join(parts).strip()