semql-engine 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ """Public surface of semql-engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from semql_engine.adapter import (
6
+ Adapter,
7
+ AdapterResult,
8
+ AsyncAdapter,
9
+ AsyncDuckDBAdapter,
10
+ DBAPIAdapter,
11
+ DuckDBAdapter,
12
+ to_async_adapter,
13
+ )
14
+ from semql_engine.engine import AsyncEngine, Engine, EngineError, ExecutionResult
15
+
16
+ __all__ = [
17
+ "Adapter",
18
+ "AdapterResult",
19
+ "AsyncAdapter",
20
+ "AsyncDuckDBAdapter",
21
+ "AsyncEngine",
22
+ "DBAPIAdapter",
23
+ "DuckDBAdapter",
24
+ "Engine",
25
+ "EngineError",
26
+ "ExecutionResult",
27
+ "to_async_adapter",
28
+ ]
@@ -0,0 +1,180 @@
1
+ """Adapter protocols for the in-process executor.
2
+
3
+ An ``Adapter`` is the glue between a backend's connection and the
4
+ semql executor. Given a ``(sql, params)`` pair, an adapter runs the
5
+ SQL on its backend and yields the result as an :class:`AdapterResult`
6
+ — a typed pair of ``columns: list[str]`` and ``rows: Iterable[Sequence
7
+ [Any]]`` (positional, matching ``columns`` order).
8
+
9
+ Two parallel protocols ship:
10
+
11
+ - :class:`Adapter` — sync ``execute``. Wired up to :class:`semql_engine.Engine`.
12
+ - :class:`AsyncAdapter` — same shape, ``async def execute``. Wired up
13
+ to :class:`semql_engine.AsyncEngine`. Production deployments running
14
+ on asyncio (FastAPI / Litestar / aiohttp) avoid the per-call
15
+ ``asyncio.to_thread`` boilerplate.
16
+
17
+ :func:`to_async_adapter` wraps any sync ``Adapter`` so it satisfies the
18
+ async protocol — useful when only a sync driver is available and the
19
+ caller is otherwise async-first.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ from collections.abc import Iterable, Mapping, Sequence
26
+ from dataclasses import dataclass
27
+ from typing import Any, Protocol
28
+
29
+
30
@dataclass
class AdapterResult:
    """Column names plus positional rows returned by an adapter.

    Every entry of ``rows`` is a sequence whose values line up with
    ``columns`` by position. ``rows`` may be any iterable, including a
    one-shot iterator; the annotation only promises ``Iterable``.
    """

    # Output column names, in the order the row values appear.
    columns: list[str]
    # Row values, each aligned to ``columns`` by position.
    rows: Iterable[Sequence[Any]]
42
+
43
+
44
class Adapter(Protocol):
    """Structural interface for a synchronous backend connection.

    An implementation runs one SQL string on its backend and hands back
    the column names and rows as an :class:`AdapterResult`. Binding
    ``params`` to the statement is left to the implementation; the
    placeholders in ``sql`` are expected to already be in the form the
    backend's dialect uses (for instance ``$1``-style or named
    placeholders on Postgres).
    """

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult: ...
58
+
59
+
60
class DBAPIAdapter:
    """Adapter over any PEP-249 (DB-API 2.0) connection.

    Each call opens its own cursor and closes it again before returning.
    ``params`` is forwarded as the second argument of ``cursor.execute``
    as a plain ``dict``; whether the driver understands that named form
    depends on the driver, so the SQL must already use the placeholder
    style the driver expects.

    The adapter never commits and never closes the connection — the
    caller keeps ownership of its lifecycle.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — any PEP-249 conn
        self._conn = connection

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` on a fresh cursor and return all rows eagerly."""
        cur = self._conn.cursor()
        try:
            # With nothing to bind, the one-argument form of execute is used.
            call_args: tuple[Any, ...] = (sql, dict(params)) if params else (sql,)
            cur.execute(*call_args)
            # ``description`` can be None (e.g. statements without a result set).
            names: list[str] = [str(entry[0]) for entry in list(cur.description or [])]
            fetched = list(cur.fetchall())
        finally:
            cur.close()
        return AdapterResult(columns=names, rows=fetched)
88
+
89
+
90
class DuckDBAdapter:
    """Adapter over an existing ``duckdb.DuckDBPyConnection``.

    Named parameters are passed through to the connection untouched (as
    a plain ``dict``), so the SQL is expected to already use DuckDB's
    ``$name`` placeholder form.

    Typical uses: cubes backed by local CSV / Parquet files that DuckDB
    reads natively, in-memory test fixtures, and setups that prefer one
    backend over juggling several connections.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — duckdb conn
        self._conn = connection

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` on the wrapped connection and fetch every row."""
        # An empty mapping is sent as ``None`` rather than ``{}``.
        bound = dict(params) if params else None
        relation = self._conn.execute(sql, bound)
        header: list[Any] = list(relation.description or [])
        return AdapterResult(
            columns=[str(entry[0]) for entry in header],
            rows=relation.fetchall(),
        )
112
+
113
+
114
class AsyncAdapter(Protocol):
    """Awaitable twin of :class:`Adapter`.

    ``execute`` is a coroutine resolving to the same
    :class:`AdapterResult` shape. Implementations should tolerate
    concurrent calls: :class:`semql_engine.AsyncEngine` launches the
    fragments of a federated plan together through ``asyncio.gather``.
    """

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult: ...
128
+
129
+
130
+ class _SyncAsAsyncAdapter:
131
+ """Internal wrapper produced by :func:`to_async_adapter`."""
132
+
133
+ def __init__(self, inner: Adapter) -> None:
134
+ self._inner = inner
135
+
136
+ async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
137
+ # ``asyncio.to_thread`` releases the event loop so other
138
+ # fragments registered on the AsyncEngine can run in parallel
139
+ # even when this adapter is pure-Python sync.
140
+ return await asyncio.to_thread(self._inner.execute, sql, params)
141
+
142
+
143
def to_async_adapter(adapter: Adapter) -> AsyncAdapter:
    """Return an :class:`AsyncAdapter` view of a sync :class:`Adapter`.

    Every ``execute`` on the returned object is forwarded to ``adapter``
    inside a worker thread (``asyncio.to_thread``), which leaves the
    event loop available while the sync driver blocks. This is the
    fallback for drivers that only ship a sync API; when a driver has a
    native async interface, an adapter built on that is preferable.
    """
    bridged: AsyncAdapter = _SyncAsAsyncAdapter(adapter)
    return bridged
154
+
155
+
156
class AsyncDuckDBAdapter:
    """Awaitable adapter over an existing ``duckdb.DuckDBPyConnection``.

    The work is delegated to a :class:`DuckDBAdapter` whose ``execute``
    is run in a worker thread via ``asyncio.to_thread``. Handy for
    single-fragment async plans, in-memory test fixtures, and keeping
    async-first code from having to use the sync ``Engine``.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — duckdb conn
        self._inner = DuckDBAdapter(connection)

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run the sync DuckDB adapter off the event loop."""
        pending = asyncio.to_thread(self._inner.execute, sql, params)
        return await pending
170
+
171
+
172
+ __all__ = [
173
+ "Adapter",
174
+ "AdapterResult",
175
+ "AsyncAdapter",
176
+ "AsyncDuckDBAdapter",
177
+ "DBAPIAdapter",
178
+ "DuckDBAdapter",
179
+ "to_async_adapter",
180
+ ]
semql_engine/engine.py ADDED
@@ -0,0 +1,655 @@
1
+ """In-process executor for :class:`semql.FederatedPlan`.
2
+
3
+ The :class:`Engine` runs each per-backend fragment via a registered
4
+ :class:`Adapter`, materialises the resulting rows into in-memory DuckDB
5
+ under the tables ``frag_0``, ``frag_1``, … expected by the plan's
6
+ ``merge.sql``, and finally executes the merge to produce the final
7
+ shape.
8
+
9
+ Single-fragment plans (returned by :func:`semql.compile_federated_query`
10
+ when the query touches one backend) are handled identically — the merge
11
+ SQL is a trivial ``SELECT * FROM frag_0`` in that case.
12
+
13
+ The engine keeps a private DuckDB connection. Adapters that are
14
+ themselves DuckDB-backed run against their own connections; results
15
+ still flow through the engine's connection via the materialisation
16
+ step, so isolation is preserved.
17
+
18
+ :meth:`AsyncEngine.iter_run` has a single-fragment fast path that
19
+ recognises the trivial merge shape that distributive federation emits
20
+ for one-backend plans (column rename + ORDER + LIMIT + identity
21
+ SUM-over-single-row groups + AVG decomposition) and executes the
22
+ merge in Python — skipping DuckDB's CREATE TABLE + INSERT roundtrip
23
+ plus the full second pass over the materialised rows.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import asyncio
29
+ import re
30
+ from collections.abc import AsyncIterator, Iterable, Iterator, Sequence
31
+ from dataclasses import dataclass
32
+ from dataclasses import field as dc_field
33
+ from typing import Any, Literal
34
+
35
+ import duckdb
36
+ import sqlglot
37
+ import sqlglot.errors
38
+ from semql.compile import ColumnMeta
39
+ from semql.federate import FederatedPlan
40
+ from semql.model import Backend
41
+ from sqlglot import expressions as exp
42
+
43
+ from semql_engine.adapter import Adapter, AdapterResult, AsyncAdapter
44
+
45
+
46
class EngineError(RuntimeError):
    """Runtime failure while executing a plan.

    Compile-time refusals are reported as ``FederationError``; this
    exception covers problems that only show up during execution — for
    example a plan referencing a backend with no registered adapter, or
    an adapter whose returned columns differ from the ones the fragment
    declares.
    """
53
+
54
+
55
@dataclass
class ExecutionResult:
    """Outcome of executing a :class:`FederatedPlan`.

    ``columns`` and ``column_meta`` are copied straight from the plan,
    so a consumer producing formatted output (units, percent, ...) does
    not need to look anything up in the catalog again.
    """

    # Output column names, as declared by the plan.
    columns: list[str]
    # Per-column metadata, as declared by the plan.
    column_meta: list[ColumnMeta]
    # Final merged rows.
    rows: list[tuple[Any, ...]]
67
+
68
+
69
class Engine:
    """Synchronous executor that merges plan fragments inside DuckDB.

    Register one adapter for every backend the plans will touch, then
    call :meth:`run`. Registrations are independent of any catalog, so
    one engine can execute many plans.
    """

    def __init__(self, duckdb_connection: Any | None = None) -> None:  # noqa: ANN401
        # A private in-memory database is opened when none is supplied.
        self._con: Any = duckdb_connection or duckdb.connect(":memory:")
        self._adapters: dict[Backend, Adapter] = {}

    def register(self, backend: Backend, adapter: Adapter) -> None:
        """Associate ``adapter`` with ``backend``.

        Registering a backend twice simply replaces the earlier adapter
        (handy for swapping adapters in tests).
        """
        self._adapters[backend] = adapter

    def run(self, plan: FederatedPlan) -> ExecutionResult:
        """Execute ``plan`` and return the merged rows with plan metadata.

        Each fragment's SQL is run through the adapter registered for
        its backend and the rows are loaded into ``frag_<i>``; afterwards
        the plan's merge SQL is executed on the engine's connection.

        Raises :class:`EngineError` when a fragment's backend has no
        adapter, or when the set of column names an adapter returns
        differs from the set the fragment declares.
        """
        self._reset_frag_tables(len(plan.fragments))
        for position, frag in enumerate(plan.fragments):
            adapter = self._adapters.get(frag.backend)
            if adapter is None:
                raise EngineError(
                    f"No adapter registered for backend "
                    f"{frag.backend.value!r}. Call Engine.register("
                    f"Backend.{frag.backend.name}, your_adapter) "
                    f"before running this plan."
                )
            outcome = adapter.execute(frag.sql, frag.params)
            # Only the *set* of names is compared; column order may differ.
            if set(outcome.columns) != set(frag.columns):
                raise EngineError(
                    f"Fragment {position} (backend {frag.backend.value!r}) "
                    f"adapter returned columns {outcome.columns!r} but the "
                    f"fragment declares {frag.columns!r}. Adapter "
                    f"must preserve the SELECT-list aliases."
                )
            as_tuples: list[tuple[Any, ...]] = [tuple(record) for record in outcome.rows]
            self._load_fragment(position, outcome.columns, as_tuples)

        merged = self._con.execute(plan.merge.sql, dict(plan.merge.params)).fetchall()
        return ExecutionResult(
            columns=plan.columns,
            column_meta=plan.column_meta,
            rows=merged,
        )

    def iter_rows(self, plan: FederatedPlan) -> Iterator[dict[str, Any]]:
        """Run ``plan`` and yield every row as a ``{column: value}`` dict.

        Meant for callers feeding the result into a templating layer or
        a JSON envelope.
        """
        outcome = self.run(plan)
        names = outcome.columns
        for values in outcome.rows:
            yield dict(zip(names, values, strict=True))

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _reset_frag_tables(self, n: int) -> None:
        """Drop ``frag_0`` .. ``frag_<k-1>`` where ``k = max(n, 32)``.

        Dropping at least 32 tables regardless of ``n`` also clears
        leftovers from an earlier plan that used more fragments than the
        current one, so the merge never reads stale data.
        """
        table_count = max(n, 32)
        for slot in range(table_count):
            self._con.execute(f"DROP TABLE IF EXISTS frag_{slot}")

    def _load_fragment(
        self,
        index: int,
        columns: list[str],
        rows: list[tuple[Any, ...]],
    ) -> None:
        """Create ``frag_<index>`` and insert ``rows`` into it.

        Column types come from :func:`_infer_column_types` (first
        non-NULL value per column). With no rows every column is
        declared ``VARCHAR`` and nothing is inserted.
        """
        table = f"frag_{index}"
        quoted = [_quote(name) for name in columns]
        inferred = _infer_column_types(columns, rows)
        declarations = ", ".join(
            f"{ident} {sql_type}" for ident, sql_type in zip(quoted, inferred, strict=True)
        )
        self._con.execute(f"CREATE TABLE {table} ({declarations})")
        if rows:
            markers = ", ".join("?" for _ in columns)
            self._con.executemany(
                f"INSERT INTO {table} ({', '.join(quoted)}) VALUES ({markers})",
                rows,
            )
173
+
174
+
175
+ def _infer_column_types(columns: list[str], rows: list[tuple[Any, ...]]) -> list[str]:
176
+ """Pick a DuckDB type per column from the first non-NULL value.
177
+
178
+ Falls back to ``VARCHAR`` for fully-NULL columns and unknown types
179
+ — DuckDB will widen on insert if the data is heterogeneous, and
180
+ callers wanting strict types should cast on the source side."""
181
+ types: list[str] = []
182
+ for col_idx in range(len(columns)):
183
+ chosen = "VARCHAR"
184
+ for row in rows:
185
+ v = row[col_idx]
186
+ if v is None:
187
+ continue
188
+ chosen = _duckdb_type_for(v)
189
+ break
190
+ types.append(chosen)
191
+ return types
192
+
193
+
194
+ def _duckdb_type_for(value: Any) -> str: # noqa: ANN401 — any row value
195
+ """Map a Python value to a DuckDB type literal.
196
+
197
+ Order matters: ``bool`` is a subclass of ``int`` in Python, check
198
+ it first."""
199
+ import datetime as _dt
200
+
201
+ if isinstance(value, bool):
202
+ return "BOOLEAN"
203
+ if isinstance(value, int):
204
+ return "BIGINT"
205
+ if isinstance(value, float):
206
+ return "DOUBLE"
207
+ if isinstance(value, str):
208
+ return "VARCHAR"
209
+ if isinstance(value, _dt.datetime):
210
+ return "TIMESTAMP"
211
+ if isinstance(value, _dt.date):
212
+ return "DATE"
213
+ if isinstance(value, _dt.time):
214
+ return "TIME"
215
+ if isinstance(value, bytes):
216
+ return "BLOB"
217
+ return "VARCHAR"
218
+
219
+
220
+ def _quote(name: str) -> str:
221
+ """DuckDB identifier quoting; matches semql.federate."""
222
+ return f'"{name}"'
223
+
224
+
225
+ # ---------------------------------------------------------------------------
226
+ # Single-fragment fast path
227
+ # ---------------------------------------------------------------------------
228
+ #
229
+ # For a 1-fragment plan, the merge SQL is structurally trivial: it
230
+ # selects from ``frag_0`` with column renames, an identity SUM (or
231
+ # NULLIF(SUM/SUM) for AVG) since each group has exactly one row in the
232
+ # fragment, plus optional ORDER BY / LIMIT / OFFSET. There is no
233
+ # cross-fragment join to do.
234
+ #
235
+ # Going through DuckDB still works but pays for: a CREATE TABLE +
236
+ # INSERT roundtrip that copies every row, plus a second full pass when
237
+ # the merge SELECT scans frag_0. For large raw-row results this
238
+ # overhead is real. The fast path parses the merge SQL once, builds a
239
+ # tiny *MergeProgram* (per-output-column transform + sort + slice),
240
+ # and applies it to the adapter rows directly — no DuckDB touched.
241
+ #
242
+ # Detection is conservative: any merge shape we don't immediately
243
+ # recognise (HAVING, complex expressions, cross-fragment JOIN, etc.)
244
+ # returns ``None`` from ``_try_build_program`` and we fall through to
245
+ # the DuckDB path. Correctness > coverage.
246
+
247
+
248
+ @dataclass(frozen=True)
249
+ class _ColTransform:
250
+ """How to compute one output column from a fragment row.
251
+
252
+ ``kind`` is one of:
253
+ - ``"identity"`` — copy ``f0.<src>`` to the output. Covers both
254
+ bare dimension references and ``SUM(col)`` over single-row groups
255
+ (SUM of one element is identity, modulo NULL handling: SUM
256
+ ignores NULLs whereas the identity path passes them through;
257
+ single-fragment plans don't produce per-group NULL aggregates
258
+ so this is safe).
259
+ - ``"avg_div"`` — emit ``sum_col / count_col`` when ``count_col``
260
+ is non-zero / non-null; else NULL. Mirrors the
261
+ ``NULLIF(SUM(sum_col), 0)`` shape the merge SQL uses for AVG
262
+ decomposition.
263
+ """
264
+
265
+ kind: Literal["identity", "avg_div"]
266
+ src_col: str = "" # for identity
267
+ sum_col: str = "" # for avg_div
268
+ count_col: str = "" # for avg_div
269
+
270
+
271
+ @dataclass(frozen=True)
272
+ class _MergeProgram:
273
+ """Recipe for executing a single-fragment merge in Python.
274
+
275
+ ``transforms`` is parallel to the plan's output columns. ``order``
276
+ is a list of ``(output_col_index, descending)`` pairs applied as a
277
+ stable Python sort. ``limit`` and ``offset`` apply after sort.
278
+ """
279
+
280
+ transforms: list[_ColTransform]
281
+ order: list[tuple[int, bool]] = dc_field(default_factory=lambda: [])
282
+ limit: int | None = None
283
+ offset: int | None = None
284
+
285
+
286
def _try_build_program(
    merge_sql: str,
    fragment_columns: list[str],
    output_columns: list[str],
) -> _MergeProgram | None:
    """Parse a single-fragment merge SQL into an executable program.

    Returns ``None`` whenever the statement is anything other than the
    trivial single-fragment shape (projection of fragment columns,
    identity ``SUM``, AVG recombination, ``ORDER BY`` on output columns,
    literal ``LIMIT`` / ``OFFSET``). The caller then falls back to the
    DuckDB materialisation path, which is always correct, so every
    doubtful case here resolves to ``None``.

    In particular the program cannot express row filtering, joins,
    windowing, or an explicit ``NULLS FIRST`` ordering, so statements
    carrying any of those are rejected rather than silently executed
    without them.
    """
    try:
        tree = sqlglot.parse_one(merge_sql, dialect="duckdb")
    except sqlglot.errors.ParseError:
        return None
    if not isinstance(tree, exp.Select):
        return None

    # Clauses the Python program has no way to honour. ``with_`` is
    # checked alongside ``with`` because sqlglot releases differ in the
    # key used for CTEs; looking up an absent key is harmless.
    unsupported_clauses = (
        "having",
        "distinct",
        "with",
        "with_",
        "where",
        "joins",
        "laterals",
        "pivots",
        "qualify",
        "windows",
        "sample",
    )
    for clause in unsupported_clauses:
        present = tree.args.get(clause)
        # List-valued args (e.g. ``joins``) may be present but empty.
        if present is None or (isinstance(present, list) and not present):
            continue
        return None

    select_exprs = tree.expressions

    # Star: ``SELECT * FROM frag_0`` — identity over every fragment
    # column in fragment order. Only valid when the fragment's column
    # list already equals the plan's output list.
    transforms: list[_ColTransform] = []
    fragment_set = set(fragment_columns)
    if len(select_exprs) == 1 and isinstance(select_exprs[0], exp.Star):
        if fragment_columns != output_columns:
            return None
        transforms = [_ColTransform(kind="identity", src_col=c) for c in fragment_columns]
    else:
        if len(select_exprs) != len(output_columns):
            return None
        for expression in select_exprs:
            t = _try_parse_select_item(expression, fragment_set)
            if t is None:
                return None
            transforms.append(t)

    # ORDER BY — unqualified output-column references only.
    order_pairs: list[tuple[int, bool]] = []
    order_node = tree.args.get("order")
    if order_node is not None:
        for ordered in order_node.expressions:
            if not isinstance(ordered, exp.Ordered):
                return None
            # The program records direction only; an explicit NULLS FIRST
            # cannot be represented, so let DuckDB handle it.
            if ordered.args.get("nulls_first"):
                return None
            col = ordered.this
            if not isinstance(col, exp.Column) or col.table:
                return None
            name = col.name
            if name not in output_columns:
                return None
            order_pairs.append((output_columns.index(name), bool(ordered.args.get("desc"))))

    # LIMIT / OFFSET — integer literals only (parameters or expressions
    # would need the merge params, which the fast path does not bind).
    bounds: dict[str, int | None] = {"limit": None, "offset": None}
    for clause in ("limit", "offset"):
        node = tree.args.get(clause)
        if node is None:
            continue
        literal = getattr(node, "expression", None)
        if not (isinstance(literal, exp.Literal) and literal.is_int):
            return None
        bounds[clause] = int(literal.this)

    return _MergeProgram(
        transforms=transforms,
        order=order_pairs,
        limit=bounds["limit"],
        offset=bounds["offset"],
    )
370
+
371
+
372
def _try_parse_select_item(
    expression: Any,  # noqa: ANN401 — sqlglot Expression isn't in the public `expressions` __all__
    fragment_cols: set[str],
) -> _ColTransform | None:
    """Translate one merge-SQL SELECT item into a column transform.

    Three shapes are accepted (an ``AS`` alias, if any, is ignored):
    ``f0.col``, ``SUM(f0.col)``, and
    ``SUM(f0.sum_col) / NULLIF(SUM(f0.count_col), 0)``. Every referenced
    column must be in ``fragment_cols``. Anything else yields ``None``.
    """
    node = expression.unalias()

    def _known_column(candidate: Any) -> str | None:  # noqa: ANN401
        # Name of a column reference that exists in the fragment, else None.
        if isinstance(candidate, exp.Column) and candidate.name in fragment_cols:
            return candidate.name
        return None

    def _summed_column(candidate: Any) -> str | None:  # noqa: ANN401
        # Fragment column wrapped directly in SUM(...), else None.
        if isinstance(candidate, exp.Sum):
            return _known_column(candidate.this)
        return None

    if isinstance(node, exp.Div):
        # AVG recombination: SUM(sum_col) / NULLIF(SUM(count_col), 0).
        guard = node.expression
        if not isinstance(guard, exp.Nullif):
            return None
        zero = guard.expression
        if not isinstance(zero, exp.Literal) or zero.this != "0":
            return None
        total_name = _summed_column(node.this)
        count_name = _summed_column(guard.this)
        if total_name is None or count_name is None:
            return None
        return _ColTransform(kind="avg_div", sum_col=total_name, count_col=count_name)

    if isinstance(node, exp.Column):
        # Plain reference, qualified or not.
        source = _known_column(node)
    elif isinstance(node, exp.Sum):
        # SUM over a one-row group is treated as a copy of the value.
        source = _summed_column(node)
    else:
        return None

    if source is None:
        return None
    return _ColTransform(kind="identity", src_col=source)
417
+
418
+
419
+ def _project_row(
420
+ row: Sequence[Any],
421
+ col_index: dict[str, int],
422
+ transforms: list[_ColTransform],
423
+ ) -> tuple[Any, ...]:
424
+ """Apply a MergeProgram's column transforms to one fragment row."""
425
+ out: list[Any] = []
426
+ for t in transforms:
427
+ if t.kind == "identity":
428
+ out.append(row[col_index[t.src_col]])
429
+ else:
430
+ sum_val = row[col_index[t.sum_col]]
431
+ count_val = row[col_index[t.count_col]]
432
+ if sum_val is None or count_val in (None, 0):
433
+ out.append(None)
434
+ else:
435
+ out.append(sum_val / count_val)
436
+ return tuple(out)
437
+
438
+
439
def _apply_program(
    program: _MergeProgram,
    fragment_columns: list[str],
    rows: Iterable[Sequence[Any]],
) -> list[tuple[Any, ...]]:
    """Execute the MergeProgram against the fragment rowset.

    Projects every row, sorts by ``program.order``, then applies
    ``offset`` followed by ``limit``. The whole result is materialised
    as a list because sorting and slicing need all rows.

    NULL placement follows DuckDB's default for an ``ORDER BY`` without
    a NULLS modifier: NULLs come last for both ascending and descending
    keys. That keeps this path consistent with running the same merge
    SQL through DuckDB.
    """
    col_index = {c: i for i, c in enumerate(fragment_columns)}
    projected = [_project_row(r, col_index, program.transforms) for r in rows]

    # Sort by the least-significant key first. ``list.sort`` is stable,
    # also with ``reverse=True``, so earlier passes survive as tie-breaks.
    for col_idx, descending in reversed(program.order):

        def _key(
            row: tuple[Any, ...],
            _i: int = col_idx,
            _desc: bool = descending,
        ) -> tuple[bool, _OrderKey]:
            value = row[_i]
            # Ascending: flag is True for NULL, so NULLs sort after values.
            # Descending: ``reverse=True`` flips the comparison, so the
            # flag is inverted to keep NULLs at the end there as well.
            null_flag = (value is None) != _desc
            return (null_flag, _OrderKey(value))

        projected.sort(key=_key, reverse=descending)

    if program.offset:
        projected = projected[program.offset :]
    if program.limit is not None:
        projected = projected[: program.limit]
    return projected
463
+
464
+
465
+ class _OrderKey:
466
+ """Wrapper so Python sort handles NULLs without raising.
467
+
468
+ Python ``sorted`` over a list containing ``None`` and numbers
469
+ raises ``TypeError``. SQL ORDER BY treats NULL as low (ASC) by
470
+ default. We mirror that — NULLs sort first; non-NULLs follow
471
+ in natural Python order."""
472
+
473
+ __slots__ = ("v",)
474
+
475
+ def __init__(self, v: Any) -> None: # noqa: ANN401
476
+ self.v = v
477
+
478
+ def __lt__(self, other: _OrderKey) -> bool:
479
+ a, b = self.v, other.v
480
+ if a is None and b is None:
481
+ return False
482
+ if a is None:
483
+ return True
484
+ if b is None:
485
+ return False
486
+ return bool(a < b)
487
+
488
+
489
+ _FRAG_TABLE_RE = re.compile(r"\bfrag_(\d+)\b")
490
+
491
+
492
class AsyncEngine:
    """Async counterpart to :class:`Engine`.

    Fragments of a plan are awaited together through
    :func:`asyncio.gather` and the results are then merged in DuckDB.
    Fragments are per-backend sub-queries whose join lives in the merge
    SQL, so running them concurrently is safe.

    :meth:`iter_run` streams the merge result in chunks of
    ``chunk_rows``. For a single-fragment plan whose merge SQL is
    recognised by ``_try_build_program``, the merge is done in Python
    and DuckDB is not used at all; every other plan merges in DuckDB.

    ``last_iter_run_used_fast_path`` records which path the most recent
    ``iter_run`` call took. It exists for tests and observability and is
    not part of the wire protocol.

    The ``frag_*`` tables are dropped and rebuilt only after every
    adapter has returned, with no ``await`` in between, so overlapping
    :meth:`run` calls on one engine cannot interleave between each
    other's reset, load and merge steps.
    NOTE(review): :meth:`iter_run` keeps fetching from a cursor on the
    shared connection across awaits; it presumably must not overlap with
    other calls on the same engine — confirm against DuckDB's connection
    semantics.
    """

    def __init__(self, duckdb_connection: Any | None = None) -> None:  # noqa: ANN401
        # A private in-memory database is opened when none is supplied.
        self._con: Any = duckdb_connection or duckdb.connect(":memory:")
        self._adapters: dict[Backend, AsyncAdapter] = {}
        self.last_iter_run_used_fast_path: bool = False

    def register(self, backend: Backend, adapter: AsyncAdapter) -> None:
        """Bind an async adapter to a backend. Replacing an existing
        registration is allowed."""
        self._adapters[backend] = adapter

    async def run(self, plan: FederatedPlan) -> ExecutionResult:
        """Execute a :class:`FederatedPlan` end-to-end on an event loop.

        All fragments are started concurrently, so one slow adapter does
        not hold up the others. When every fragment has returned, the
        rows are loaded into DuckDB and the merge SQL produces the final
        rows.

        Raises :class:`EngineError` for missing adapters or column
        mismatches.
        """
        self._adapters_present(plan)
        await self._fetch_and_load(plan)

        # No await between the load above and this fetch: the merge sees
        # exactly the tables this call just created.
        merge_cursor = self._con.execute(plan.merge.sql, dict(plan.merge.params))
        rows = merge_cursor.fetchall()
        return ExecutionResult(
            columns=plan.columns,
            column_meta=plan.column_meta,
            rows=rows,
        )

    async def iter_run(
        self,
        plan: FederatedPlan,
        *,
        chunk_rows: int = 10_000,
    ) -> AsyncIterator[list[tuple[Any, ...]]]:
        """Run ``plan`` and yield merge result rows in chunks.

        Two paths:

        - **Single-fragment fast path** — ``plan.fragments`` has one
          entry and ``_try_build_program`` recognises the merge SQL. The
          merge runs in Python without DuckDB and
          ``last_iter_run_used_fast_path`` is set to ``True``.
        - **DuckDB merge** — every other plan. Fragments are loaded into
          ``frag_*`` tables and the merge cursor is drained with
          ``fetchmany`` so memory use stays bounded by ``chunk_rows``.

        Each yielded item is a non-empty list of row tuples; when there
        are no more rows the iterator simply ends.

        Raises :class:`EngineError` when ``chunk_rows`` is not positive,
        an adapter is missing, or returned columns do not match. As this
        is an async generator, those checks run on first iteration.
        """
        if chunk_rows <= 0:
            raise EngineError(f"iter_run: chunk_rows must be positive, got {chunk_rows!r}.")
        self._adapters_present(plan)
        self.last_iter_run_used_fast_path = False

        if len(plan.fragments) == 1:
            fragment = plan.fragments[0]
            program = _try_build_program(
                plan.merge.sql,
                fragment.columns,
                plan.columns,
            )
            if program is not None:
                self.last_iter_run_used_fast_path = True
                adapter = self._adapters[fragment.backend]
                result = await adapter.execute(fragment.sql, fragment.params)
                self._check_columns(0, fragment, result)
                # Index by the adapter's own column order, which may
                # differ from the fragment's declared order.
                rows = _apply_program(program, result.columns, result.rows)
                for start in range(0, len(rows), chunk_rows):
                    yield rows[start : start + chunk_rows]
                return

        await self._fetch_and_load(plan)

        cursor = self._con.execute(plan.merge.sql, dict(plan.merge.params))
        while True:
            # fetchmany runs in a worker thread so the loop stays responsive.
            chunk = await asyncio.to_thread(cursor.fetchmany, chunk_rows)
            if not chunk:
                return
            yield [tuple(row) for row in chunk]

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _adapters_present(self, plan: FederatedPlan) -> None:
        """Raise :class:`EngineError` if any fragment's backend has no adapter."""
        for frag in plan.fragments:
            if frag.backend not in self._adapters:
                raise EngineError(
                    f"No adapter registered for backend "
                    f"{frag.backend.value!r}. Call AsyncEngine.register("
                    f"Backend.{frag.backend.name}, your_adapter) before "
                    f"running this plan."
                )

    async def _fetch_and_load(self, plan: FederatedPlan) -> None:
        """Await every fragment, then rebuild the ``frag_*`` tables.

        The tables are reset only after the adapters have returned.
        Resetting before the await would let a second concurrent call
        reset too, after which whichever call loads second fails on
        ``CREATE TABLE`` because the first has already created the table.
        """
        results = await asyncio.gather(
            *(
                self._adapters[frag.backend].execute(frag.sql, frag.params)
                for frag in plan.fragments
            )
        )
        self._reset_frag_tables(len(plan.fragments))
        for i, (fragment, result) in enumerate(zip(plan.fragments, results, strict=True)):
            self._load_result(i, fragment, result)

    def _check_columns(self, index: int, fragment: Any, result: AdapterResult) -> None:  # noqa: ANN401
        """Raise :class:`EngineError` unless the adapter returned exactly
        the set of column names the fragment declares (order is ignored)."""
        if set(result.columns) != set(fragment.columns):
            raise EngineError(
                f"Fragment {index} (backend {fragment.backend.value!r}) "
                f"adapter returned columns {result.columns!r} but the "
                f"fragment declares {fragment.columns!r}. Adapter "
                f"must preserve the SELECT-list aliases."
            )

    def _load_result(self, index: int, fragment: Any, result: AdapterResult) -> None:  # noqa: ANN401
        """Validate one adapter result and load it into ``frag_<index>``."""
        self._check_columns(index, fragment, result)
        materialised: list[tuple[Any, ...]] = [tuple(r) for r in result.rows]
        # Reuse Engine's loader; it only touches ``self._con``.
        Engine._load_fragment(self, index, result.columns, materialised)  # type: ignore[arg-type]

    def _reset_frag_tables(self, n: int) -> None:
        """Drop leftover ``frag_*`` tables via Engine's implementation."""
        Engine._reset_frag_tables(self, n)  # type: ignore[arg-type]
653
+
654
+
655
+ __all__ = ["AsyncEngine", "Engine", "EngineError", "ExecutionResult"]
semql_engine/py.typed ADDED
File without changes
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: semql-engine
3
+ Version: 0.2.1
4
+ Summary: In-process executor for semql FederatedPlans — runs per-backend fragments via caller-supplied adapters and merges results in DuckDB.
5
+ Author: Nikhil Pallamreddy
6
+ Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
7
+ License-Expression: BSD-3-Clause
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Database
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Typing :: Typed
18
+ Requires-Dist: duckdb>=1.5.3
19
+ Requires-Dist: semql>=0.2.1,<0.3
20
+ Requires-Python: >=3.12
21
+ Project-URL: Homepage, https://github.com/npalladium/semql
22
+ Project-URL: Repository, https://github.com/npalladium/semql
23
+ Project-URL: Issues, https://github.com/npalladium/semql/issues
24
+ Description-Content-Type: text/markdown
25
+
26
+ # semql-engine
27
+
28
+ In-process executor for [`semql`](https://github.com/npalladium/semql)
29
+ `FederatedPlan` results. Runs each per-backend fragment via a
30
+ caller-supplied `Adapter`, materialises the rows into in-memory DuckDB,
31
+ then runs the plan's merge SQL against the assembled tables.
32
+
33
+ `semql` core stays sans-io. `semql-engine` is the opt-in package that
34
+ turns a `FederatedPlan` into result rows when you want the cross-source
35
+ execution done for you.
36
+
37
+ ## Quickstart
38
+
39
+ ```python
40
+ import duckdb
41
+ from semql import Catalog, compile_federated_query
42
+ from semql_engine import DuckDBAdapter, Engine
43
+
44
+ catalog = Catalog([...]) # cubes spanning multiple backends
45
+ plan = compile_federated_query(query, catalog.as_dict())
46
+
47
+ engine = Engine()
48
+ engine.register(Backend.POSTGRES, my_pg_adapter)
49
+ engine.register(Backend.BIGQUERY, my_bq_adapter)
50
+ rows = list(engine.run(plan))
51
+ ```
52
+
53
+ ## What it does
54
+
55
+ For every fragment in the plan, the engine calls the adapter registered
56
+ for that backend with `(sql, params)`. It loads the resulting rows into
57
+ a DuckDB table named `frag_<i>` (matching `FederatedPlan.fragments`
58
+ indices) and finally runs `plan.merge.sql` to produce the merged shape.
59
+
60
+ Single-fragment plans (single-backend queries that went through
61
+ `compile_federated_query` anyway) work transparently — the merge is a
62
+ pass-through.
63
+
64
+ ## Adapters
65
+
66
+ An `Adapter` is anything with `execute(sql, params) -> AdapterResult`
67
+ where `AdapterResult` carries `columns: list[str]` and an iterable of
68
+ positional row sequences (matching `columns` order). Built-ins:
69
+
70
+ - `DuckDBAdapter(con)` — runs the SQL inside an existing DuckDB
71
+ connection. Useful for local CSV / Parquet enrichment cubes.
72
+ - `DBAPIAdapter(con)` — wraps any PEP-249 connection (psycopg, mysql,
73
+ sqlite, etc).
74
+
75
+ Bring your own for warehouses that need a vendor SDK.
76
+
77
+ ## Scope
78
+
79
+ v1 mirrors `compile_federated_query` v1:
80
+
81
+ - Sum / count / avg supported (avg is decomposed at compile and
82
+ recomposed in the merge SQL); other aggregations are refused by the
83
+ compiler before the engine ever sees them.
84
+ - Equality bridge joins only.
85
+ - No `compare` mode, no boolean `where` tree across backends.
86
+
87
+ The engine itself is small; most of the federation logic lives in
88
+ `semql.federate`.
@@ -0,0 +1,8 @@
1
+ semql_engine/__init__.py,sha256=2Crj7ozo6mfsXevCjpxTFiVP6BZYhWe6S1r5s8MhiU0,564
2
+ semql_engine/adapter.py,sha256=6FZfq_t2bXI8UqW-JhSeCPfR0nVpSjDcra5n98uUoTU,6454
3
+ semql_engine/engine.py,sha256=9gFkYBtntqC9PtHcSjawOGiladWFJb5sT33nY6pAblE,25576
4
+ semql_engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ semql_engine-0.2.1.dist-info/licenses/LICENSE,sha256=AdcAzanKVr3cVSrhBpG6gytjG0Ss1SBTQDAavLe0CRc,1505
6
+ semql_engine-0.2.1.dist-info/WHEEL,sha256=wXwAVsgVaOZ_pwDFqQm5Rd6PID-Fc74nkLc8X8gHiDo,81
7
+ semql_engine-0.2.1.dist-info/METADATA,sha256=X1F_AWYB25_O_vNK4fLGTazcLrVrvIXH7l8cGEXfOtA,3245
8
+ semql_engine-0.2.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.19
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Nikhil Pallamreddy
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.