duckbill 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. duckbill-0.1.0/.gitignore +11 -0
  2. duckbill-0.1.0/PKG-INFO +18 -0
  3. duckbill-0.1.0/README.md +316 -0
  4. duckbill-0.1.0/duckbill/__init__.py +15 -0
  5. duckbill-0.1.0/duckbill/backends/__init__.py +48 -0
  6. duckbill-0.1.0/duckbill/backends/base.py +262 -0
  7. duckbill-0.1.0/duckbill/backends/duckdb.py +96 -0
  8. duckbill-0.1.0/duckbill/backends/mysql.py +69 -0
  9. duckbill-0.1.0/duckbill/backends/postgres.py +61 -0
  10. duckbill-0.1.0/duckbill/backends/snowflake.py +82 -0
  11. duckbill-0.1.0/duckbill/backends/sqlite.py +73 -0
  12. duckbill-0.1.0/duckbill/bundle.py +73 -0
  13. duckbill-0.1.0/duckbill/cli.py +70 -0
  14. duckbill-0.1.0/duckbill/core.py +188 -0
  15. duckbill-0.1.0/duckbill/docs.py +21 -0
  16. duckbill-0.1.0/duckbill/loader.py +44 -0
  17. duckbill-0.1.0/duckbill/page.py +1179 -0
  18. duckbill-0.1.0/duckbill/prune.py +157 -0
  19. duckbill-0.1.0/duckbill/questions.py +57 -0
  20. duckbill-0.1.0/duckbill/server.py +162 -0
  21. duckbill-0.1.0/duckbill/server_bundle.py +581 -0
  22. duckbill-0.1.0/examples/rds_slowq.py +377 -0
  23. duckbill-0.1.0/pyproject.toml +22 -0
  24. duckbill-0.1.0/tests/conftest.py +2 -0
  25. duckbill-0.1.0/tests/test_backends_factory.py +42 -0
  26. duckbill-0.1.0/tests/test_backends_mysql.py +62 -0
  27. duckbill-0.1.0/tests/test_backends_postgres.py +68 -0
  28. duckbill-0.1.0/tests/test_backends_scan.py +101 -0
  29. duckbill-0.1.0/tests/test_backends_snowflake.py +70 -0
  30. duckbill-0.1.0/tests/test_backends_sqlite.py +62 -0
  31. duckbill-0.1.0/tests/test_bundle_cli.py +56 -0
  32. duckbill-0.1.0/tests/test_bundle_guard.py +14 -0
  33. duckbill-0.1.0/tests/test_duckbill.py +311 -0
  34. duckbill-0.1.0/tests/test_prune.py +142 -0
  35. duckbill-0.1.0/tests/test_server_bundle.py +230 -0
@@ -0,0 +1,11 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.egg-info/
5
+ .pytest_cache/
6
+ examples/questions/
7
+ .vendor/
8
+ # standalone bundles are regenerable build artifacts (duckbill bundle -> a single
9
+ # uv-run .py); they embed the warehouse data and are too large to track.
10
+ /dashboard.py
11
+ /slowq.py
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: duckbill
3
+ Version: 0.1.0
4
+ Summary: Live, query-backed dashboards over a local DuckDB, declared as Python data.
5
+ Requires-Python: >=3.9
6
+ Requires-Dist: duckdb>=0.10
7
+ Requires-Dist: pytz
8
+ Requires-Dist: sqlglot>=20
9
+ Provides-Extra: all
10
+ Requires-Dist: psycopg[binary]>=3; extra == 'all'
11
+ Requires-Dist: pymysql; extra == 'all'
12
+ Requires-Dist: snowflake-connector-python; extra == 'all'
13
+ Provides-Extra: mysql
14
+ Requires-Dist: pymysql; extra == 'mysql'
15
+ Provides-Extra: postgres
16
+ Requires-Dist: psycopg[binary]>=3; extra == 'postgres'
17
+ Provides-Extra: snowflake
18
+ Requires-Dist: snowflake-connector-python; extra == 'snowflake'
@@ -0,0 +1,316 @@
1
+ # duckbill
2
+
3
+ Live, query-backed dashboards declared as Python data. A dashboard is a Python
4
+ file that defines charts as dicts; the server runs each chart's SQL on every
5
+ request, so the page is live -- it re-queries on interaction and reflects the
6
+ current warehouse. No build step, three dependencies (`duckdb`, `sqlglot`, `pytz`); network
7
+ backends are opt-in extras. Single process.
8
+
9
+ > Part of **duckpond**, a two-part local-DuckDB toolkit. **duckbill** (this) serves
10
+ > and shares a warehouse as a live dashboard; its sibling **ducktail** pulls
11
+ > scattered sources into one. duckbill works against any DuckDB/SQLite store --
12
+ > ducktail-built or not.
13
+
14
+ ```
15
+ pip install -e .
16
+ duckbill serve examples/rds_slowq.py --db /path/to/warehouse.duckdb
17
+ ```
18
+
19
+ ## Backends
20
+
21
+ The `--db` flag accepts a DSN or a bare file path:
22
+
23
+ | DSN form | backend |
24
+ |---|---|
25
+ | `/path/to/file.duckdb` or `duckdb:///path/to/file.duckdb` | DuckDB (local file) |
26
+ | `sqlite:///path/to/file.db` | SQLite (local file) |
27
+ | `postgresql://user:pass@host/db` | Postgres |
28
+ | `mysql://user:pass@host/db` | MySQL |
29
+ | `snowflake://user@account/db/schema?warehouse=W&role=R` | Snowflake |
30
+
31
+ Secret values (passwords, tokens) should not be written into command lines or
32
+ dashboard files. Use `${VAR}` in the DSN; duckbill expands it from the
33
+ environment before connecting:
34
+
35
+ ```
36
+ duckbill serve dash.py --db "postgresql://ro_user:${DB_PASS}@db-host:5432/warehouse"
37
+ ```
38
+
39
+ Network backends (Postgres, MySQL, Snowflake) are opt-in extras; the base
40
+ install only pulls in `duckdb`, `sqlglot`, and `pytz`:
41
+
42
+ ```
43
+ pip install duckbill[postgres] # psycopg
44
+ pip install duckbill[mysql] # pymysql
45
+ pip install duckbill[snowflake] # snowflake-connector-python
46
+ pip install duckbill[all] # all three
47
+ ```
48
+
49
+ All connections are read-only. For DuckDB and SQLite that is enforced at the
50
+ driver level. For network backends, use a read-only role or user -- Snowflake
51
+ has no session-level read-only toggle, so this is especially important there.
52
+
53
+ `--pool` sets the connection pool size for network backends (default 4); it has
54
+ no effect on DuckDB or SQLite.
55
+
56
+ `$name` parameter binding is uniform across all backends -- the server translates
57
+ the dashboard's `$name` placeholders to the native paramstyle before executing.
58
+ SQL dialect (functions, casts, date arithmetic) is the author's responsibility:
59
+ write SQL that matches the backend you deploy against.
60
+
61
+ Bundles (`duckbill bundle`) are DuckDB/SQLite-only. A bundle embeds the warehouse
62
+ tables as Parquet in a self-contained `uv run` server script; network backends are
63
+ serve-only (they can't export to Parquet). A bundle never contains credentials --
64
+ the DSN is used at build time to export data and is not written into the output.
65
+
66
+ ## Declaring a dashboard
67
+
68
+ A dashboard module defines `charts`, and optionally `params`, `title`, and a
69
+ `readme` (see [Documentation](#documentation)):
70
+
71
+ ```python
72
+ title = "my warehouse"
73
+
74
+ params = [
75
+ {"name": "window", "control": "timespan", "default": "31d",
76
+ "presets": ["6h", "24h", "7d", "31d"]}, # binds $start and $end
77
+ {"name": "kind", "control": "select", "default": "all",
78
+ "choices_sql": "SELECT DISTINCT kind FROM warehouse.t ORDER BY 1"},
79
+ {"name": "id", "default": "", "control": "none"}, # set by a drill click
80
+ ]
81
+
82
+ charts = [
83
+ {"id": "volume", "section": "Overview", "title": "Volume", "type": "line",
84
+ "brush": "timespan", # drag the x-axis to zoom
85
+ "sql": """SELECT to_timestamp(ts) AS t, n FROM warehouse.t
86
+ WHERE to_timestamp(ts) >= $start::TIMESTAMPTZ
87
+ AND to_timestamp(ts) < $end::TIMESTAMPTZ
88
+ AND ($kind = 'all' OR kind = $kind) ORDER BY t""",
89
+ "encoding": {"x": {"field": "t", "type": "temporal"},
90
+ "y": {"field": "n", "type": "quantitative"}}},
91
+ ]
92
+ ```
93
+
94
+ ### Charts
95
+
96
+ | key | meaning |
97
+ |------------|---------|
98
+ | `id` | unique identifier (required) |
99
+ | `title` | card heading (required) |
100
+ | `type` | `line` / `bar` / `stacked-bar` / `area` / `point` / `table` / `metric` / `leaderboard` / `spec` (required) |
101
+ | (table col) | a `table` query column named `_*` is data-only: kept for drill values, not displayed |
102
+ | `sql` | the query; may reference `$param` (required) |
103
+ | `section` | groups cards under a heading (default `Overview`) |
104
+ | `encoding` | Vega-Lite encoding for the built-in types |
105
+ | `spec` | raw Vega-Lite spec -- the escape hatch, used when `type` is `spec` |
106
+ | `drill` | bars: `{"param": p, "field": col}` -- click a mark to open param `p`'s detail page. tables: `{column: param}` or `{column: {param, value}}` -- click a cell to drill (a `value` column, or one named `_*`, supplies the param value when it differs from the displayed text) |
107
+ | `brush` | `"timespan"`: drag the x-axis to set the window |
108
+ | `markers` | `true` (all marker sets) or `["id", ...]`: overlay marker rules on this chart |
109
+ | `span` | `"full"` -- the card spans the whole row; an integer `N` -- it spans `N` columns (clamped to the columns that fit, so it degrades to full width on a narrow window) |
110
+
111
+ `sql` is bound, not interpolated -- the server passes `$param` values to the backend as
112
+ parameters, so control and drill input is safe. The dashboard module is your own
113
+ trusted code; its SQL runs as written.
114
+
115
+ ### Metric cards
116
+
117
+ A `metric` chart is a strip of hero figures rather than a plot. Its SQL returns
118
+ **one row**; each column becomes a figure -- the value shown large and compacted
119
+ (`30.2k`, `1.2M`), the column name as the label. Use a quoted alias to control
120
+ the label, and pair it with `"span": "full"` for a hero row across the top:
121
+
122
+ ```python
123
+ {"id": "summary", "section": "Overview", "title": "Slow-log summary", "type": "metric", "span": "full",
124
+ "sql": f"""SELECT count(*) AS "slow entries", round(sum(query_time_s)) AS "total query time (s)",
125
+ count(DISTINCT fingerprint_hash) AS "fingerprints"
126
+ FROM warehouse.entries WHERE {{window}}"""}
127
+ ```
128
+
129
+ When the query references the timespan (`$start`/`$end`), each numeric figure
130
+ also shows its change versus the previous equal-length window, as a signed
131
+ percent colored by whether the move is good or bad. `good` declares the good
132
+ direction -- `"up"` (higher is better), `"down"` (lower is better), or
133
+ `"neutral"` (no judgment, gray) -- either for all figures or per figure:
134
+
135
+ ```python
136
+ "good": {"slow entries": "down", "total query time (s)": "down", "fingerprints": "neutral"}
137
+ ```
138
+
139
+ Figures not listed default to `"up"`. The delta is omitted when there's no prior
140
+ window or the previous value is zero/absent.
141
+
142
+ A metric may also carry a `spark` query -- SQL returning a temporal column plus
143
+ one column per figure (aliases matched by name) -- and each figure gets an inline
144
+ sparkline of that trend:
145
+
146
+ ```python
147
+ "spark": f"""SELECT date_trunc('hour', t) AS hour, count(*) AS "slow entries", ...
148
+ FROM warehouse.entries WHERE {{window}} GROUP BY 1 ORDER BY 1"""
149
+ ```
150
+
151
+ ### Leaderboards
152
+
153
+ A `leaderboard` is a ranked list for top-N-by-dimension: SQL returns rows whose
154
+ first text column is the label and first numeric column the value, drawn with an
155
+ inline magnitude bar behind each value. It drills like a bar (`{"param", "field"}`,
156
+ clicking a row navigates to that param's detail page; a `_`-prefixed column can
157
+ carry a hidden drill value, e.g. show readable text but drill on a hash). Denser
158
+ and more scannable than a bar chart for a long ranking.
159
+
160
+ ### Compare
161
+
162
+ The **Compare** toggle (in the timespan control) overlays the previous
163
+ equal-length window: a faded previous-period series on single-series time charts,
164
+ and a `Δ%` per row on windowed leaderboards. The prev/next arrows step the window
165
+ back/forward by its own length.
166
+
167
+ ### Enlarge and explore
168
+
169
+ Every card has an expand icon (top-right, on hover). Clicking it opens the chart
170
+ in a large modal where you can flip between the **Chart** and the raw **Data**
171
+ table, and **Open in Ask** to drop the chart's query (with the current params
172
+ substituted) into the Ask workbench for ad-hoc exploration.
173
+
174
+ ### Params
175
+
176
+ A param drives a control and binds into SQL by its `name`. Controls:
177
+
178
+ - `select` -- a dropdown; options come from `choices` (a list) or `choices_sql`.
179
+ - `timespan` -- a time-range picker (presets + custom from/to + brush-to-zoom).
180
+ It binds `$start` and `$end` (ISO timestamps), not a param of its own name.
181
+ - `none` -- no control; the param is set only by a drill click.
182
+
183
+ `type` is `str` (default), `int`, or `float`.
184
+
185
+ ### Markers
186
+
187
+ A `markers` list declares overlay queries -- the canonical case is deploy
188
+ markers, a recurring motif. Each marker is `{"id", "sql", "field"}` plus optional
189
+ `label` and `color`; the `sql` returns timestamps (referencing `$param` like any
190
+ query), and any chart with `markers: true` gets those timestamps drawn as rules.
191
+
192
+ ```python
193
+ markers = [
194
+ {"id": "deploys", "field": "t", "label": "label", "color": "#b9c2cc",
195
+ "sql": "SELECT to_timestamp(build_time) AS t, version AS label FROM warehouse.deploys "
196
+ "WHERE to_timestamp(build_time) >= $start::TIMESTAMPTZ "
197
+ " AND to_timestamp(build_time) < $end::TIMESTAMPTZ"},
198
+ ]
199
+ ```
200
+
201
+ Window the marker query on `$start`/`$end` so rules stay inside the chart's time
202
+ axis. Markers re-run when the window changes.
203
+
204
+ ## Documentation
205
+
206
+ A warehouse documents itself from two sources, both surfaced in the header's
207
+ **About** tab and by `duckbill docs`:
208
+
209
+ - the dashboard's `readme` -- a Markdown string for the narrative: what the
210
+ warehouse is, how the pieces fit, how to read the dashboard.
211
+ - DuckDB `COMMENT`s -- per-table and per-column descriptions that live in the
212
+ warehouse catalog, so the schema reference is generated, not hand-maintained.
213
+
214
+ ```python
215
+ readme = """\
216
+ This warehouse stitches together the slow log, Performance Insights, and ALB
217
+ access logs so a slow query can be traced out to the request that issued it.
218
+
219
+ Times are stored as epoch seconds (`logged_at`); the charts convert with
220
+ `to_timestamp`.
221
+ """
222
+ ```
223
+
224
+ Set the `COMMENT`s where the warehouse is built, so they survive a rebuild:
225
+
226
+ ```sql
227
+ COMMENT ON TABLE warehouse.entries IS 'One row per slow-query log entry, fingerprinted.';
228
+ COMMENT ON COLUMN warehouse.entries.logged_at IS 'When the statement was logged, epoch seconds (UTC).';
229
+ ```
230
+
231
+ The About view renders the `readme` and a schema reference (each table's comment
232
+ and its columns with types and comments); the Ask sidebar hangs the same comments
233
+ off tables and columns as tooltips. To emit a `WAREHOUSE.md` for the repo:
234
+
235
+ ```
236
+ duckbill docs examples/rds_slowq.py --db warehouse.duckdb -o WAREHOUSE.md
237
+ ```
238
+
239
+ ## Ask (ad-hoc queries)
240
+
241
+ The header's **Ask** tab is a query workbench, like Metabase's native query: a
242
+ schema sidebar (click to insert), a CodeMirror SQL editor with schema-aware
243
+ autocomplete, and a Run button (⌘/Ctrl+Enter). Results show as a table, or pick a
244
+ chart type + x/y/color to visualize through the same chart engine (tooltips,
245
+ hover crosshair, interactive legend included). The query is read-only -- the
246
+ connection is `read_only`, so it's SELECT-only -- and results are row-capped.
247
+
248
+ **Save** names a question and writes it to a file -- one JSON per question under
249
+ `questions/` next to the dashboard (override with `--questions <dir>`), so they're
250
+ git-friendly and hand-editable. The **Saved** dropdown reopens or deletes them,
251
+ and each has a stable link (`#q=<slug>`). **Copy link** is the no-save path: it
252
+ encodes the SQL and chart choice into the URL (`#ask=…`).
253
+
254
+ ## Standalone bundle
255
+
256
+ Wrap a dashboard and its data into one self-contained file for sharing or
257
+ archiving:
258
+
259
+ ```
260
+ duckbill bundle examples/rds_slowq.py --db warehouse.duckdb -o dashboard.py
261
+ # -> dashboard.py (run it with: uv run dashboard.py)
262
+ ```
263
+
264
+ `bundle` prunes the warehouse first -- only the tables and columns the charts
265
+ actually reference are included -- exports them to zstd Parquet, and embeds that
266
+ (b85) in a single `uv run`-able Python script. The recipient runs `uv run
267
+ dashboard.py`; uv resolves the deps from the PEP 723 header, the script extracts
268
+ its embedded Parquet to a content-keyed temp dir on first run (later runs reuse
269
+ it), an in-memory DuckDB exposes each table as a view `warehouse.<table>`, and a
270
+ browser renders the dashboard. No duckbill install, no static host, no sibling
271
+ files -- just one script and uv.
272
+
273
+ Queries run server-side (it's a tiny localhost http server), so it works in every
274
+ browser and the whole dashboard stays live: drill-down, the timespan brush, legend
275
+ filters, and the Ask view all work. The only degradation is that saved questions
276
+ are read-only -- the ones embedded at build time are loadable, but new ones can't
277
+ be persisted into the bundle.
278
+
279
+ Bundles are DuckDB/SQLite-only -- a bundle embeds the data as Parquet, which a
280
+ network backend can't export. A bundle never contains credentials; the DSN is used
281
+ only at build time.
282
+
283
+ ## How it works
284
+
285
+ - **The dashboard module** is pure data -- charts and params, plus whatever Python
286
+ you want for shared SQL fragments and computed defaults.
287
+ - **The server** holds one read-only DuckDB connection behind a lock and serves
288
+ `/` (the page), `/meta` (params + chart metadata), `/q` (run one chart's SQL),
289
+ and `/docs` (the readme + catalog comments). It binds only the params each
290
+ query references.
291
+ - **The page** builds controls from the params, draws each chart with Vega-Lite,
292
+ and re-queries only the charts that reference a changed param.
293
+
294
+ ### Pages and drill-down
295
+
296
+ Every section is a page. Sections not driven by a drill are the home page; each
297
+ drill param has its own detail page -- the section whose charts all reference it.
298
+ Clicking a drill mark navigates to that detail page (the current page lives in
299
+ the URL hash, so the browser back button and shareable links work); the home
300
+ page's other charts stay put. A detail page populates from its drill value, or,
301
+ on a direct link with no value, from a SQL default like
302
+ `COALESCE(NULLIF($route, ''), (SELECT ... LIMIT 1))`.
303
+
304
+ A control appears only when a chart on the current page references its param, so
305
+ a home-only filter hides on a detail page that ignores it. The header is pinned.
306
+
307
+ A chart with a color series gets an interactive legend: click an entry to focus
308
+ that series (the rest dim), click again to clear, shift-click for several. This
309
+ filters within the chart and never drills.
310
+
311
+ ## Not in scope
312
+
313
+ Client-side crossfilter (that's [Mosaic](https://github.com/uwdata/mosaic)),
314
+ multi-user/auth/sharing (loopback, single user), and static export (the point is
315
+ to stay live). Charts that need a layered/transformed view use the `spec` escape
316
+ hatch; reference overlays (deploys, incidents) use `markers`.
@@ -0,0 +1,15 @@
1
+ """duckbill -- live, query-backed dashboards over a local DuckDB.
2
+
3
+ A dashboard is a plain Python module that defines `charts` (and optionally
4
+ `params` and `title`) as data. The server runs each chart's SQL per request, so
5
+ the page is live: it re-queries on every interaction and reflects the current
6
+ warehouse.
7
+ """
8
+
9
+ from .core import Dashboard, Warehouse, params_in
10
+ from .loader import DashboardError, load_dashboard
11
+ from .server import serve
12
+
13
# Public API of the package: the names re-exported from the submodules above.
__all__ = [
    "Dashboard",
    "Warehouse",
    "params_in",
    "load_dashboard",
    "DashboardError",
    "serve",
]
__version__ = "0.1.0"
@@ -0,0 +1,48 @@
1
+ """open_backend: pick a Backend from a DSN, expanding ${VAR} from the environment.
2
+
3
+ A bare path (no scheme) is treated as a DuckDB file (`--db /x.duckdb`).
4
+ Network drivers are imported lazily inside their module, so a missing extra
5
+ errors only when that backend is selected.
6
+ """
7
+
8
+ import os
9
+ import re
10
+ from urllib.parse import urlparse
11
+
12
+ from .base import Backend # re-exported for callers
13
+
14
# A `${NAME}` reference; group 1 captures the environment variable name.
_VAR = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")


def _expand(dsn):
    """Return `dsn` with each `${NAME}` replaced from the process environment.

    A variable that is not set expands to the empty string. Text that does not
    match the `${NAME}` form (for example a bare `$NAME`) is left untouched.
    """

    def lookup(match):
        return os.environ.get(match.group(1), "")

    return _VAR.sub(lookup, dsn)
19
+
20
+
21
def _local_path(dsn):
    """Strip a leading `scheme://` (or a bare `scheme:`) from a file DSN."""
    _, sep, rest = dsn.partition("://")
    if sep:
        return rest
    return dsn.partition(":")[2]


def open_backend(dsn, *, read_only=True, pool=4):
    """Open the Backend selected by the scheme of `dsn`.

    `${VAR}` references in `dsn` are expanded from the environment first. A DSN
    with no scheme is treated as a DuckDB file path, as are the `duckdb` and
    `file` schemes. `sqlite`, `postgres`/`postgresql`, `mysql` and `snowflake`
    select their backend, whose module is imported only when it is chosen.

    `read_only` is forwarded to every backend; `pool` only to the network ones.

    Raises:
        ValueError: the scheme names no known backend.
    """
    raw = dsn
    dsn = _expand(dsn)
    scheme = urlparse(dsn).scheme

    # urlparse reads the drive letter of a bare Windows path (C:\x.duckdb) as a
    # one-letter scheme; no backend uses one, so treat it as "no scheme".
    if len(scheme) == 1:
        scheme = ""

    if scheme in ("", "duckdb", "file"):
        from .duckdb import DuckDBBackend
        path = _local_path(dsn) if scheme else dsn
        return DuckDBBackend(path, read_only=read_only)

    if scheme == "sqlite":
        from .sqlite import SQLiteBackend
        return SQLiteBackend(_local_path(dsn), read_only=read_only)

    if scheme in ("postgres", "postgresql"):
        from .postgres import PostgresBackend
        return PostgresBackend(dsn, read_only=read_only, pool=pool)

    if scheme == "mysql":
        from .mysql import MySQLBackend
        return MySQLBackend(dsn, read_only=read_only, pool=pool)

    if scheme == "snowflake":
        from .snowflake import SnowflakeBackend
        return SnowflakeBackend(dsn, read_only=read_only, pool=pool)

    # Report the DSN as the caller wrote it: the expanded form carries whatever
    # secrets were substituted for ${VAR}, which must not end up in error output.
    raise ValueError(f"unknown backend scheme {scheme!r} in {raw!r}")
@@ -0,0 +1,262 @@
1
+ """Backend surface plus the dialect-aware parameter scan shared by every backend.
2
+
3
+ `$name` is the one author-facing bind placeholder. Discovery and translation are
4
+ the same scan: sqlglot tokenizes the SQL for the backend's dialect so we know the
5
+ source spans of string literals, quoted identifiers, and dollar-quoted bodies;
6
+ comment spans we add ourselves (guarded by those string spans). A `$name` counts
7
+ only when it falls outside every protected span. We use sqlglot to find non-code
8
+ regions, not to interpret `$name` -- which is a duckbill convention, not native
9
+ to each dialect.
10
+ """
11
+
12
+ import queue
13
+ import re
14
+ import threading
15
+ from contextlib import contextmanager
16
+ from datetime import date, datetime, time
17
+ from decimal import Decimal
18
+
19
+ import sqlglot
20
+
21
# The author-facing `$name` placeholder; group 1 captures the name.
_PARAM = re.compile(r"\$([A-Za-z_][A-Za-z0-9_]*)")

# sqlglot token types whose source span is non-code: string literals (incl.
# Postgres/Snowflake dollar-quoting -> HEREDOC/RAW) and quoted identifiers.
_PROTECTED_TOKENS = {
    "STRING",
    "HEREDOC_STRING",
    "RAW_STRING",
    "NATIONAL_STRING",
    "BYTE_STRING",
    "HEX_STRING",
    "BIT_STRING",
    "IDENTIFIER",
}

# Line-comment markers per dialect; only MySQL adds '#'. Dialects not listed
# here fall back to '--' alone. Block comments /* */ are universal.
_LINE_COMMENTS = {"mysql": ("--", "#")}

# paramstyle key -> callable rendering a parameter name as a driver placeholder.
_STYLE = {
    "duckdb": "${}".format,       # native; passthrough
    "sqlite": ":{}".format,       # sqlite3 named paramstyle
    "pyformat": "%({})s".format,  # psycopg / PyMySQL / snowflake-connector
}
37
+
38
+
39
def _in(spans, i):
    """True when index `i` lies inside any `(start, end)` span, both ends inclusive."""
    for start, end in spans:
        if start <= i <= end:
            return True
    return False
41
+
42
+
43
def _comment_spans(sql, dialect, str_spans):
    """Locate the comments in `sql` as inclusive `(start, end)` index spans.

    Block comments run from `/*` through the closing `*/`; line comments run
    from one of the dialect's markers up to (not including) the newline. A
    comment left unterminated extends to the end of `sql`. A marker that sits
    inside one of `str_spans` is skipped, since it is not a comment there.
    """
    line_markers = _LINE_COMMENTS.get(dialect, ("--",))
    last = len(sql) - 1
    found = []
    pos = 0
    while pos <= last:
        if _in(str_spans, pos):
            # inside a string/identifier: nothing here can open a comment
            pos += 1
        elif sql.startswith("/*", pos):
            close = sql.find("*/", pos + 2)
            stop = last if close < 0 else close + 1  # include the '/' of '*/'
            found.append((pos, stop))
            pos = stop + 1
        elif sql.startswith(line_markers, pos):
            newline = sql.find("\n", pos)
            stop = last if newline < 0 else newline - 1  # stop before the newline
            found.append((pos, stop))
            pos = stop + 1
        else:
            pos += 1
    return found
64
+
65
+
66
def _protected(sql, dialect):
    """Return every span of `sql` in which a `$name` must not count as a bind.

    The spans of string-like and quoted-identifier tokens come from sqlglot's
    tokenizer for `dialect`; comment spans are then appended to them.
    """
    try:
        tokens = sqlglot.tokenize(sql, dialect=dialect)
    except Exception:  # a tokenize failure must not blank the chart -- protect nothing
        tokens = []
    literal_spans = []
    for tok in tokens:
        if tok.token_type.name in _PROTECTED_TOKENS:
            literal_spans.append((tok.start, tok.end))
    return literal_spans + _comment_spans(sql, dialect, literal_spans)
73
+
74
+
75
def referenced_params(sql, dialect="duckdb"):
    """Return the set of `$name` placeholders that `sql` actually references.

    A placeholder that starts inside a string, quoted identifier, comment, or
    dollar-quoted body is ignored.
    """
    protected = _protected(sql, dialect)
    names = set()
    for match in _PARAM.finditer(sql):
        if not _in(protected, match.start()):
            names.add(match.group(1))
    return names
80
+
81
+
82
def bind(sql, args, dialect, paramstyle):
    """Rewrite `$name` into the driver's paramstyle and select the bound values.

    Returns `(translated_sql, params)`, where `params` holds only the entries
    of `args` whose names the query references. For the 'pyformat' style every
    literal '%' outside a placeholder is doubled to '%%', so LIKE patterns
    survive the driver's own substitution. A query that references no
    parameter is returned verbatim with an empty dict.
    """
    protected = _protected(sql, dialect)
    render = _STYLE[paramstyle]
    doubles_percent = paramstyle == "pyformat"

    def literal(text):
        return text.replace("%", "%%") if doubles_percent else text

    pieces = []
    names = set()
    cursor = 0
    for match in _PARAM.finditer(sql):
        if _in(protected, match.start()):
            continue
        pieces.append(literal(sql[cursor:match.start()]))
        pieces.append(render(match.group(1)))
        names.add(match.group(1))
        cursor = match.end()

    if not names:
        # no binds -> run() calls execute(q) with no driver %-substitution,
        # so the SQL must stay verbatim
        return sql, {}

    pieces.append(literal(sql[cursor:]))
    bound = {key: value for key, value in args.items() if key in names}
    return "".join(pieces), bound
105
+
106
+
107
def jsonable(v):
    """Convert a driver value into something the JSON encoder and Vega accept.

    Binary values become a hex string, dates/times/datetimes their ISO-8601
    text, and `Decimal` a float; any other value is returned unchanged.
    """
    if isinstance(v, (bytes, bytearray, memoryview)):
        return bytes(v).hex()
    if isinstance(v, (datetime, date, time)):
        return v.isoformat()
    return float(v) if isinstance(v, Decimal) else v
116
+
117
+
118
def jsonable_row(row):
    """Return a list holding the `jsonable` form of each value in `row`."""
    return list(map(jsonable, row))
120
+
121
+
122
class Backend:
    """The interface the server and the bundler talk to; subclasses fill it in.

    Class attributes:
        dialect:    sqlglot dialect name used for the parameter scan
        paramstyle: key into _STYLE used by bind()
        bundleable: whether `duckbill bundle` can embed this backend's data
    """

    dialect = "duckdb"
    paramstyle = "duckdb"
    bundleable = False

    def _serve_only(self):
        """Build the error raised by the bundling hooks on a serve-only backend."""
        return NotImplementedError(f"{type(self).__name__} is serve-only (not bundleable)")

    def run(self, sql, args):
        """Run a query with `$name` parameters taken from `args`; subclass hook."""
        raise NotImplementedError()

    def query(self, sql, limit=2000):
        """Run a query given a row `limit`; subclass hook."""
        raise NotImplementedError()

    def docs(self):
        """Subclass hook; not implemented on the base class."""
        raise NotImplementedError()

    def schema(self):
        """Subclass hook; not implemented on the base class."""
        raise NotImplementedError()

    def table_columns(self):
        """Columns per table, keyed by the same qualified names as `schema()`:
        `{<schema>.<name>: [col, ...]}`. The bundler's column pruner feeds this to
        sqlglot as a schema map. Serve-only backends don't implement it."""
        raise self._serve_only()

    def export_parquet(self, qualified, columns=None, compression="snappy"):
        """Bundling hook; serve-only backends don't implement it."""
        raise self._serve_only()

    def close(self):
        """Release backend resources; the base class holds none."""
        return None
157
+
158
+
159
# Parquet codecs DuckDB writes and reads back. Restricted to an allowlist because
# the value is interpolated into a COPY statement (it can't be a bind parameter).
_PARQUET_CODECS = frozenset({"snappy", "zstd", "gzip", "uncompressed"})


def parquet_codec(compression):
    """Validate a Parquet compression name and return it lower-cased.

    The match is case-insensitive. Anything outside the allowlist is rejected,
    including non-string values such as None, which would otherwise fail with
    an unrelated AttributeError before the allowlist check.

    Raises:
        ValueError: `compression` is not one of the allowed codec names.
    """
    codec = compression.lower() if isinstance(compression, str) else None
    if codec not in _PARQUET_CODECS:
        raise ValueError(
            f"unsupported Parquet compression {compression!r}; "
            f"expected one of {sorted(_PARQUET_CODECS)}")
    return codec
172
+
173
+
174
class Pool:
    """A tiny bounded connection pool for network backends.

    At most `size` borrowers hold a connection at once; further borrowers block
    until one finishes. Connections are created lazily by `factory` and idle
    ones are reused LIFO, so a small working set stays warm.

    Capacity is tracked with a semaphore of borrower slots rather than by
    waiting on the idle queue. A borrower that fails closes its connection
    instead of returning it, so a thread waiting on the queue alone would never
    be woken; releasing a slot always wakes one waiter, which then reuses an
    idle connection or makes a fresh one.
    """

    def __init__(self, factory, size=4):
        self._factory = factory
        self._free = queue.LifoQueue()   # idle connections, most recent first
        self._made = 0                   # live connections (idle + borrowed)
        self._size = max(1, size)
        self._lock = threading.Lock()    # guards _made
        self._slots = threading.BoundedSemaphore(self._size)

    @contextmanager
    def borrow(self):
        """Yield a connection for the duration of the `with` block.

        The connection goes back to the pool when the block exits normally. If
        the block raises, the connection is closed and dropped instead, since
        its state can no longer be trusted.
        """
        con = self._acquire()
        ok = False
        try:
            yield con
            ok = True
        finally:
            try:
                if ok:
                    self._free.put(con)
                else:
                    self._discard(con)
            finally:
                # Release last, so the next borrower sees the returned connection.
                self._slots.release()

    def _discard(self, con):
        """Close `con` (best effort) and forget it."""
        try:
            con.close()
        except Exception:
            pass  # the connection is being thrown away; a failed close changes nothing
        with self._lock:
            self._made -= 1

    def _acquire(self):
        """Take a slot, then return an idle connection or a newly made one."""
        self._slots.acquire()  # blocks while `size` borrowers are active
        try:
            try:
                return self._free.get_nowait()
            except queue.Empty:
                pass
            con = self._factory()
        except BaseException:
            self._slots.release()  # nothing was handed out; give the slot back
            raise
        with self._lock:
            self._made += 1
        return con

    def close(self):
        """Close every idle connection. Borrowed connections are not touched."""
        while True:
            try:
                con = self._free.get_nowait()
            except queue.Empty:
                return
            self._discard(con)
228
+
229
+
230
class DBAPIBackend(Backend):
    """Shared run/query for PEP-249 drivers: translate $name via bind(), borrow a
    pooled connection, coerce rows. Subclasses set dialect/paramstyle and
    implement `_connect` (returns a new read-only DBAPI connection), `docs`,
    `schema`."""

    def __init__(self, *, pool=4):
        self._pool = Pool(self._connect, size=pool)

    def _connect(self):
        """Return a new DBAPI connection; subclasses must implement this."""
        raise NotImplementedError

    def _fetch(self, sql, params=None, limit=None):
        """Execute `sql` on a pooled connection and return (cols, rows, truncated).

        `rows` are raw driver rows. With `limit` set, at most `limit` rows are
        kept and `truncated` reports whether more were available; otherwise
        every row is fetched. The cursor is always closed before the
        connection goes back to the pool.
        """
        with self._pool.borrow() as con:
            cur = con.cursor()
            try:
                if params:
                    cur.execute(sql, params)
                else:
                    cur.execute(sql)
                if cur.description is None:
                    # the statement produced no result set -- nothing to fetch
                    return [], [], False
                cols = [d[0] for d in cur.description]
                if limit is None:
                    return cols, cur.fetchall(), False
                rows = cur.fetchmany(limit + 1)  # one extra row reveals truncation
                return cols, rows[:limit], len(rows) > limit
            finally:
                try:
                    cur.close()
                except Exception:
                    pass  # best effort; must not mask an error from the query itself

    def run(self, sql, args):
        """Run a chart query, binding the `$name` parameters it references.

        Returns `(cols, rows)` with each row a dict keyed by column name.
        """
        q, p = bind(sql, args, self.dialect, self.paramstyle)
        cols, rows, _ = self._fetch(q, p)
        return cols, [dict(zip(cols, jsonable_row(r))) for r in rows]

    def query(self, sql, limit=2000):
        """Run `sql` as written (no parameter binding), capped at `limit` rows.

        Returns `(cols, rows, truncated)` with each row a dict keyed by column
        name; `truncated` is True when more than `limit` rows were available.
        """
        cols, rows, truncated = self._fetch(sql, limit=limit)
        return cols, [dict(zip(cols, jsonable_row(r))) for r in rows], truncated

    def close(self):
        """Close the pool's idle connections."""
        self._pool.close()