semql-engine 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: semql-engine
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: In-process executor for semql FederatedPlans — runs per-backend fragments via caller-supplied adapters and merges results in DuckDB.
5
5
  Author: Nikhil Pallamreddy
6
6
  Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
@@ -16,7 +16,7 @@ Classifier: Topic :: Database
16
16
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
17
  Classifier: Typing :: Typed
18
18
  Requires-Dist: duckdb>=1.5.3
19
- Requires-Dist: semql>=0.2.1,<0.3
19
+ Requires-Dist: semql>=0.3.0,<0.4
20
20
  Requires-Python: >=3.12
21
21
  Project-URL: Homepage, https://github.com/npalladium/semql
22
22
  Project-URL: Repository, https://github.com/npalladium/semql
@@ -34,6 +34,12 @@ then runs the plan's merge SQL against the assembled tables.
34
34
  turns a `FederatedPlan` into result rows when you want the cross-source
35
35
  execution done for you.
36
36
 
37
+ ## Install
38
+
39
+ ```sh
40
+ pip install semql-engine
41
+ ```
42
+
37
43
  ## Quickstart
38
44
 
39
45
  ```python
@@ -45,8 +51,8 @@ catalog = Catalog([...]) # cubes spanning multiple backends
45
51
  plan = compile_federated_query(query, catalog.as_dict())
46
52
 
47
53
  engine = Engine()
48
- engine.register(Backend.POSTGRES, my_pg_adapter)
49
- engine.register(Backend.BIGQUERY, my_bq_adapter)
54
+ engine.register(Dialect.POSTGRES, my_pg_adapter)
55
+ engine.register(Dialect.BIGQUERY, my_bq_adapter)
50
56
  rows = list(engine.run(plan))
51
57
  ```
52
58
 
@@ -86,3 +92,9 @@ v1 mirrors `compile_federated_query` v1:
86
92
 
87
93
  The engine itself is small; most of the federation logic lives in
88
94
  `semql.federate`.
95
+
96
+ ## Status
97
+
98
+ Early development. The `Adapter` contract is stable; the federation
99
+ shape mirrors `compile_federated_query` v1 (sum / count / avg and
100
+ equality bridge joins only).
@@ -9,6 +9,12 @@ then runs the plan's merge SQL against the assembled tables.
9
9
  turns a `FederatedPlan` into result rows when you want the cross-source
10
10
  execution done for you.
11
11
 
12
+ ## Install
13
+
14
+ ```sh
15
+ pip install semql-engine
16
+ ```
17
+
12
18
  ## Quickstart
13
19
 
14
20
  ```python
@@ -20,8 +26,8 @@ catalog = Catalog([...]) # cubes spanning multiple backends
20
26
  plan = compile_federated_query(query, catalog.as_dict())
21
27
 
22
28
  engine = Engine()
23
- engine.register(Backend.POSTGRES, my_pg_adapter)
24
- engine.register(Backend.BIGQUERY, my_bq_adapter)
29
+ engine.register(Dialect.POSTGRES, my_pg_adapter)
30
+ engine.register(Dialect.BIGQUERY, my_bq_adapter)
25
31
  rows = list(engine.run(plan))
26
32
  ```
27
33
 
@@ -61,3 +67,9 @@ v1 mirrors `compile_federated_query` v1:
61
67
 
62
68
  The engine itself is small; most of the federation logic lives in
63
69
  `semql.federate`.
70
+
71
+ ## Status
72
+
73
+ Early development. The `Adapter` contract is stable; the federation
74
+ shape mirrors `compile_federated_query` v1 (sum / count / avg and
75
+ equality bridge joins only).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "semql-engine"
3
- version = "0.2.1"
3
+ version = "0.3.0"
4
4
  description = "In-process executor for semql FederatedPlans — runs per-backend fragments via caller-supplied adapters and merges results in DuckDB."
5
5
  readme = "README.md"
6
6
  license = "BSD-3-Clause"
@@ -11,7 +11,7 @@ authors = [
11
11
  requires-python = ">=3.12"
12
12
  dependencies = [
13
13
  "duckdb>=1.5.3",
14
- "semql>=0.2.1,<0.3",
14
+ "semql>=0.3.0,<0.4",
15
15
  ]
16
16
  classifiers = [
17
17
  "Development Status :: 3 - Alpha",
@@ -0,0 +1,55 @@
1
+ """Public surface of semql-engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from semql_engine.adapter import (
6
+ Adapter,
7
+ AdapterResult,
8
+ AsyncAdapter,
9
+ AsyncBigQueryAdapter,
10
+ AsyncClickHouseAdapter,
11
+ AsyncDBAPIAdapter,
12
+ AsyncDuckDBAdapter,
13
+ DBAPIAdapter,
14
+ DuckDBAdapter,
15
+ to_async_adapter,
16
+ )
17
+ from semql_engine.engine import (
18
+ AsyncEngine,
19
+ AsyncMergeEngine,
20
+ DuckDBMergeEngine,
21
+ Engine,
22
+ EngineError,
23
+ ExecutionResult,
24
+ MergeEngine,
25
+ to_async_merge_engine,
26
+ )
27
+ from semql_engine.rows import (
28
+ InMemoryRowAdapter,
29
+ RowCapableAdapter,
30
+ execute_entity,
31
+ )
32
+
33
# Public re-exports of the package, kept in ASCII sort order so additions
# land in a predictable place.
__all__ = [
    "Adapter",
    "AdapterResult",
    "AsyncAdapter",
    "AsyncBigQueryAdapter",
    "AsyncClickHouseAdapter",
    "AsyncDBAPIAdapter",
    "AsyncDuckDBAdapter",
    "AsyncEngine",
    "AsyncMergeEngine",
    "DBAPIAdapter",
    "DuckDBAdapter",
    "DuckDBMergeEngine",
    "Engine",
    "EngineError",
    "ExecutionResult",
    "InMemoryRowAdapter",
    "MergeEngine",
    "RowCapableAdapter",
    "execute_entity",
    "to_async_adapter",
    "to_async_merge_engine",
]
@@ -0,0 +1,368 @@
1
+ # pyright: reportAttributeAccessIssue=false
2
+ # pyright: reportUnknownArgumentType=false
3
+ # pyright: reportUnknownMemberType=false
4
+ # pyright: reportUnknownVariableType=false
5
+ # The BigQuery adapter reaches into the google.cloud.bigquery module
6
+ # at runtime (lazy import) so pyright can't see the types.
7
+ """Adapter protocols for the in-process executor.
8
+
9
+ An ``Adapter`` is the glue between a backend's connection and the
10
+ semql executor. Given a ``(sql, params)`` pair, an adapter runs the
11
+ SQL on its backend and yields the result as an :class:`AdapterResult`
12
+ — a typed pair of ``columns: list[str]`` and ``rows: Iterable[Sequence
13
+ [Any]]`` (positional, matching ``columns`` order).
14
+
15
+ Two parallel protocols ship:
16
+
17
+ - :class:`Adapter` — sync ``execute``. Wired up to :class:`semql_engine.Engine`.
18
+ - :class:`AsyncAdapter` — same shape, ``async def execute``. Wired up
19
+ to :class:`semql_engine.AsyncEngine`. Production deployments running
20
+ on asyncio (FastAPI / Litestar / aiohttp) avoid the per-call
21
+ ``asyncio.to_thread`` boilerplate.
22
+
23
+ :func:`to_async_adapter` wraps any sync ``Adapter`` so it satisfies the
24
+ async protocol — useful when only a sync driver is available and the
25
+ caller is otherwise async-first.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import asyncio
31
+ from collections.abc import Callable, Iterable, Mapping, Sequence
32
+ from dataclasses import dataclass
33
+ from typing import Any, Protocol
34
+
35
+
36
@dataclass
class AdapterResult:
    """Tabular outcome of one adapter call: column names plus row data.

    Rows are positional — the value at index ``i`` of a row belongs to
    ``columns[i]``. ``rows`` may be any iterable, including a one-shot
    iterator; by design the engine is expected to walk it a single time.
    """

    # Column names in result order.
    columns: list[str]
    # Row values; every row is a sequence aligned with ``columns``.
    rows: Iterable[Sequence[Any]]
48
+
49
+
50
class Adapter(Protocol):
    """Synchronous contract a backend connection has to satisfy.

    An implementation runs one SQL string on its backend and hands back
    the column names together with the rows. Binding ``params`` into the
    statement is the implementation's job: the placeholders in ``sql``
    are emitted for the backend's dialect (for instance ``$1``-style or
    named placeholders on Postgres), so the adapter must feed them to a
    driver that understands that form.
    """

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` with ``params`` bound and return the result set."""
        ...
64
+
65
+
66
class DBAPIAdapter:
    """Synchronous adapter over any PEP-249 (DB-API 2.0) connection.

    Every ``execute`` call opens its own cursor and closes it before
    returning. ``params`` is forwarded to ``cursor.execute`` as a plain
    ``dict``; no placeholder rewriting happens here, so the SQL must
    already use the named-parameter form the driver expects.

    The adapter is read-only by design: it never commits and never closes
    the connection, whose lifecycle stays with the caller.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — any PEP-249 conn
        self._conn = connection

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` on a fresh cursor and return all rows, fully fetched."""
        cur = self._conn.cursor()
        try:
            # Leave the parameter argument out entirely when there is
            # nothing to bind, rather than passing an empty dict.
            bound_args = (dict(params),) if params else ()
            cur.execute(sql, *bound_args)
            # ``description`` is read before fetching; it may be ``None``.
            names = [str(entry[0]) for entry in (cur.description or [])]
            fetched = list(cur.fetchall())
        finally:
            cur.close()
        return AdapterResult(columns=names, rows=fetched)
94
+
95
+
96
class DuckDBAdapter:
    """Synchronous adapter around an existing ``duckdb.DuckDBPyConnection``.

    Named parameters use DuckDB's ``$name`` placeholder form, which the
    compiler already emits for DuckDB targets, so ``params`` is handed to
    the connection without any rewriting.

    Typical uses: enrichment cubes backed by local CSV / Parquet files
    (DuckDB reads them natively), in-memory test fixtures, and a single
    unified backend for callers who would rather not juggle several
    connections.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — duckdb conn
        self._conn = connection

    def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` on the wrapped connection and return all rows."""
        # An empty mapping is sent as ``None`` instead of an empty dict.
        bound = dict(params) if params else None
        relation = self._conn.execute(sql, bound)
        # Column names are read before the rows are fetched.
        names = [str(entry[0]) for entry in (relation.description or [])]
        return AdapterResult(columns=names, rows=relation.fetchall())
118
+
119
+
120
class AsyncAdapter(Protocol):
    """Awaitable twin of :class:`Adapter`.

    ``execute`` is a coroutine resolving to the same
    :class:`AdapterResult` shape as the sync protocol. Implementations
    should tolerate concurrent calls, because
    :class:`semql_engine.AsyncEngine` is documented to run every fragment
    of a federated plan in parallel through ``asyncio.gather``.
    """

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` with ``params`` bound and return the result set."""
        ...
134
+
135
+
136
class _SyncAsAsyncAdapter:
    """Thread-offloading shim handed out by :func:`to_async_adapter`."""

    def __init__(self, inner: Adapter) -> None:
        self._inner = inner

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        # The blocking call runs on a worker thread, which leaves the
        # event loop free for sibling fragments even though the wrapped
        # adapter is plain synchronous Python.
        blocking_call = self._inner.execute
        outcome = await asyncio.to_thread(blocking_call, sql, params)
        return outcome


def to_async_adapter(adapter: Adapter) -> AsyncAdapter:
    """Adapt a sync :class:`Adapter` to the :class:`AsyncAdapter` protocol.

    Each ``execute`` on the returned object is shipped to a worker thread
    with ``asyncio.to_thread``, so the event loop keeps scheduling other
    work while that thread blocks on I/O. Reach for a native async
    adapter whenever the driver offers one; this bridge is meant for
    drivers that only expose a synchronous API.
    """
    bridged = _SyncAsAsyncAdapter(adapter)
    return bridged
160
+
161
+
162
class AsyncDuckDBAdapter:
    """Async facade over an existing ``duckdb.DuckDBPyConnection``.

    DuckDB offers no native async API, so every ``execute`` is delegated
    to a :class:`DuckDBAdapter` running on a worker thread via
    ``asyncio.to_thread``. Handy for single-fragment async plans, for
    in-memory test fixtures, and for letting async-first code avoid the
    sync ``Engine``.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — duckdb conn
        self._inner = DuckDBAdapter(connection)

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run ``sql`` through the wrapped sync adapter on a worker thread."""
        run_sync = self._inner.execute
        return await asyncio.to_thread(run_sync, sql, params)
176
+
177
+
178
class AsyncDBAPIAdapter:
    """Async facade over any PEP-249 (DB-API 2.0) connection.

    DB-API drivers — ``psycopg2``, ``psycopg`` in sync mode, ``pymysql``,
    ``mysql.connector``, ``snowflake-connector-python`` — are all
    synchronous. Genuinely async drivers (asyncpg, ``psycopg`` in async
    mode) are not DB-API and should implement :class:`AsyncAdapter`
    themselves.

    Each ``execute`` is pushed onto a worker thread with
    :func:`asyncio.to_thread`, so the event loop can keep scheduling
    sibling coroutines while the driver call blocks. Parameters travel as
    a ``dict`` and the SQL is passed through verbatim: this class is
    dialect-blind, so the caller must make sure the placeholder syntax in
    the SQL (e.g. ``%(name)s``) is the one the driver understands.

    Read-only by design: the adapter never commits and never closes the
    connection, whose lifecycle stays with the caller.
    """

    def __init__(self, connection: Any) -> None:  # noqa: ANN401 — any PEP-249 conn
        self._conn = connection

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run :meth:`_sync_execute` on a worker thread and await its result."""
        blocking_call = self._sync_execute
        return await asyncio.to_thread(blocking_call, sql, params)

    def _sync_execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Blocking part: fresh cursor, execute, fetch everything, close."""
        cur = self._conn.cursor()
        try:
            # Leave the parameter argument out entirely when there is
            # nothing to bind, rather than passing an empty dict.
            bound_args = (dict(params),) if params else ()
            cur.execute(sql, *bound_args)
            # ``description`` is read before fetching; it may be ``None``.
            names = [str(entry[0]) for entry in (cur.description or [])]
            fetched = list(cur.fetchall())
        finally:
            cur.close()
        return AdapterResult(columns=names, rows=fetched)
219
+
220
+
221
# Ordered lookup table scanned top to bottom; the first match wins.
# ``bool`` MUST come before ``int``: ``bool`` is a subclass of ``int``, so
# with INT64 listed first ``True`` / ``False`` would be typed as INT64.
_BQ_TYPE_HINTS: tuple[tuple[str, tuple[type, ...]], ...] = (
    ("BOOL", (bool,)),
    ("INT64", (int,)),
    ("FLOAT64", (float,)),
    ("STRING", (str,)),
)


def _bq_type_for(value: Any) -> str:  # noqa: ANN401 — duck-typed on BQ param shape
    """Map a Python value to BQ's parameter ``type_`` string.

    The value is matched against ``_BQ_TYPE_HINTS`` in order, so booleans
    resolve to ``"BOOL"`` rather than ``"INT64"``. Any value of a type
    that is not in the table (including ``None``) falls back to
    ``"STRING"``.
    """
    for type_name, py_types in _BQ_TYPE_HINTS:
        if isinstance(value, py_types):
            return type_name
    return "STRING"
238
+
239
+
240
def _bq_array_type_for(value: Any) -> str:  # noqa: ANN401 — duck-typed on BQ param shape
    """Return the BQ ``ARRAY<T>`` type string for a Python list.

    ``T`` is inferred from the first element only; later elements are
    not inspected, so a mixed-type list takes the type of its head.
    Anything that is not a non-empty ``list`` (an empty list, a tuple,
    ``None``, ...) maps to ``ARRAY<STRING>``.
    """
    if isinstance(value, list) and value:
        head_type = _bq_type_for(value[0])
        return f"ARRAY<{head_type}>"
    return "ARRAY<STRING>"
249
+
250
+
251
class AsyncBigQueryAdapter:
    """Async adapter for ``google.cloud.bigquery.Client``.

    The BQ client is synchronous, so this adapter:

    1. Maps the SemQL params ``dict`` to a job config through a
       *translator* — a function ``(params) -> job_config``. The default
       translator builds BQ's structured query parameters
       (``ScalarQueryParameter`` / ``ArrayQueryParameter``) so the job is
       *parameterised* rather than string-interpolated, which keeps
       user-supplied values out of the SQL text.
    2. Dispatches the whole ``.query()`` call, including row
       materialisation, to a worker thread via :func:`asyncio.to_thread`
       so the event loop can schedule sibling coroutines meanwhile.

    Pass ``translator=...`` to ``__init__`` to override the default; tests
    use this to inject a stub that builds duck-typed fake objects.

    The ``google-cloud-bigquery`` package is *not* an import-time
    dependency of this class: only the default translator imports it,
    lazily, on first use.
    """

    def __init__(
        self,
        client: Any,  # noqa: ANN401 — google-cloud-bigquery
        translator: Callable[[Mapping[str, Any]], Any] | None = None,
    ) -> None:
        self._client = client
        self._translator = translator or _default_bq_translator

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Run :meth:`_sync_execute` on a worker thread and await its result."""
        return await asyncio.to_thread(self._sync_execute, sql, params)

    def _sync_execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Blocking part: translate params, run the query, collect the rows.

        Column names come from ``result.schema.names`` when that attribute
        is present and non-empty; otherwise they fall back to the
        positional indices ``"0"``, ``"1"``, ... of the first row. With
        neither names nor rows, ``columns`` is empty.
        """
        job_config = self._translator(params)
        result = self._client.query(sql, job_config=job_config)
        schema = getattr(result, "schema", None)
        column_names: list[str] = list(getattr(schema, "names", []) or [])
        # Iterate the result exactly once. Peeking at the first row through
        # a separate ``iter(result)`` would drop that row for one-shot
        # iterators and fail outright for iterators that refuse a second
        # pass, so the fallback names are derived from the materialised rows.
        rows: list[Sequence[Any]] = [tuple(r) for r in result]
        if not column_names and rows:
            column_names = [str(i) for i in range(len(rows[0]))]
        return AdapterResult(columns=column_names, rows=rows)
299
+
300
+
301
def _default_bq_translator(params: Mapping[str, Any]) -> Any:  # noqa: ANN401 — BQ types are duck-typed
    """Translate the params ``dict`` into a ``QueryJobConfig``.

    One query parameter is created per entry: ``list`` values become an
    ``ArrayQueryParameter``, everything else a ``ScalarQueryParameter``.
    Parameter types are inferred from the Python values with
    ``_bq_type_for``.

    ``google.cloud.bigquery`` is imported lazily, on first use.

    Raises:
        RuntimeError: if ``google-cloud-bigquery`` is not installed.
    """
    try:
        from google.cloud import bigquery as _bq  # type: ignore[import-not-found]
    except ImportError as exc:
        raise RuntimeError(
            "AsyncBigQueryAdapter needs `google-cloud-bigquery` to translate "
            "parameters into structured-query-parameter form. Install with "
            "`uv add google-cloud-bigquery`."
        ) from exc
    query_parameters: list[Any] = []
    for name, value in params.items():
        if isinstance(value, list):
            # ``ArrayQueryParameter`` takes the type of the array ELEMENTS
            # (e.g. "INT64"), not an "ARRAY<INT64>" string. An empty list
            # carries no element to inspect, so it defaults to STRING.
            element_type = _bq_type_for(value[0]) if value else "STRING"
            query_parameters.append(_bq.ArrayQueryParameter(name, element_type, value))
        else:
            query_parameters.append(
                _bq.ScalarQueryParameter(name, _bq_type_for(value), value)
            )
    # Hand the parameters over through the constructor: appending to the
    # list returned by the ``query_parameters`` property does not reliably
    # write back into the job config.
    return _bq.QueryJobConfig(query_parameters=query_parameters)
326
+
327
+
328
class AsyncClickHouseAdapter:
    """Async adapter for ``clickhouse_connect.asynch_client.AsyncClient``.

    The client's ``.query()`` is awaited directly — no worker thread is
    involved. ClickHouse's named-parameter form ``{name:Type}`` is handled
    by the client, so the params mapping is simply copied into a ``dict``
    and passed as the ``parameters=`` keyword argument.

    ``clickhouse-connect`` is *not* an import-time dependency: the adapter
    duck-types against ``query(query, parameters=...)``, so a real or a
    fake client object both work.

    The result uses the same :class:`AdapterResult` shape as the sync
    adapters: ``columns`` from the result's ``column_names`` and ``rows``
    as positional tuples built from its ``result_rows``. A missing or
    empty attribute yields an empty list.
    """

    def __init__(self, client: Any) -> None:  # noqa: ANN401 — clickhouse-connect
        self._client = client

    async def execute(self, sql: str, params: Mapping[str, Any]) -> AdapterResult:
        """Await the client's query and repackage it as an ``AdapterResult``."""
        reply = await self._client.query(sql, parameters=dict(params))
        names: list[str] = list(getattr(reply, "column_names", []) or [])
        # ``result_rows`` holds positional rows; the dict-shaped
        # ``named_results()`` view is not needed here.
        positional = getattr(reply, "result_rows", []) or []
        as_tuples: list[Sequence[Any]] = [tuple(row) for row in positional]
        return AdapterResult(columns=names, rows=as_tuples)
358
+
359
+
360
# Every public adapter defined in this module, including the three async
# adapters that the package ``__init__`` re-exports.
__all__ = [
    "Adapter",
    "AdapterResult",
    "AsyncAdapter",
    "AsyncBigQueryAdapter",
    "AsyncClickHouseAdapter",
    "AsyncDBAPIAdapter",
    "AsyncDuckDBAdapter",
    "DBAPIAdapter",
    "DuckDBAdapter",
    "to_async_adapter",
]