vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Extract column statistics from DuckDB tables.
|
|
4
|
+
|
|
5
|
+
Provides a helper to query a DuckDB connection and produce
|
|
6
|
+
:class:`~vgi.catalog.descriptors.ColumnStatisticsInput` dicts
|
|
7
|
+
ready for use in ``Table(statistics=...)``.
|
|
8
|
+
|
|
9
|
+
Example::
|
|
10
|
+
|
|
11
|
+
import duckdb
|
|
12
|
+
from vgi.catalog.duckdb_statistics import statistics_from_duckdb
|
|
13
|
+
|
|
14
|
+
conn = duckdb.connect("my_data.duckdb")
|
|
15
|
+
stats = statistics_from_duckdb(conn, "my_table")
|
|
16
|
+
|
|
17
|
+
Table(
|
|
18
|
+
name="my_table",
|
|
19
|
+
columns=...,
|
|
20
|
+
statistics=stats,
|
|
21
|
+
statistics_cache_max_age_seconds=3600,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
Geometry columns are handled specially: instead of meaningless ``min``/``max``
|
|
25
|
+
of the raw WKB blobs, the helper computes the spatial bounding box of the
|
|
26
|
+
dataset and sends two corner points so that DuckDB's ``GeometryStats`` can
|
|
27
|
+
reconstruct the correct spatial extent for filter pushdown.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
from typing import TYPE_CHECKING
|
|
33
|
+
|
|
34
|
+
import pyarrow as pa
|
|
35
|
+
|
|
36
|
+
from vgi.catalog.catalog_interface import ColumnStatistics
|
|
37
|
+
from vgi.catalog.descriptors import ColumnStatisticsInput
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
import duckdb
|
|
41
|
+
|
|
42
|
+
__all__ = ["column_statistics_from_duckdb", "statistics_from_duckdb"]
|
|
43
|
+
|
|
44
|
+
# Names DuckDB's ``typeof()`` reports for spatial columns. A column whose type
# name is in this set is treated as geometry rather than as an opaque blob.
_GEOMETRY_TYPE_NAMES = frozenset(
    (
        "GEOMETRY",
        "POINT_2D",
        "LINESTRING_2D",
        "POLYGON_2D",
        "BOX_2D",
    )
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_geometry_column(conn: duckdb.DuckDBPyConnection, qualified: str, col: str) -> bool:
    """Report whether ``col`` holds a DuckDB spatial type.

    Samples ``typeof()`` from the first non-NULL value of the column and checks
    the reported name against ``_GEOMETRY_TYPE_NAMES``.

    Args:
        conn: Connection used to run the probe query.
        qualified: Table reference, already quoted for use in SQL.
        col: Column reference, already quoted for use in SQL.

    Returns:
        ``True`` only when a non-NULL value exists and its type name is a
        known geometry type. ``False`` when the column has no non-NULL rows
        or when the probe fails for any reason (best-effort detection).
    """
    probe_sql = f"SELECT typeof({col}) FROM {qualified} WHERE {col} IS NOT NULL LIMIT 1"
    try:
        sample = conn.execute(probe_sql).fetchone()
        if sample is None:
            # No non-NULL value to inspect, so the type cannot be sampled.
            return False
        return sample[0] in _GEOMETRY_TYPE_NAMES
    except Exception:
        # Detection is advisory; any failure means "not geometry".
        return False
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _geometry_stats(
    conn: duckdb.DuckDBPyConnection,
    qualified: str,
    col: str,
) -> tuple[pa.Scalar | None, pa.Scalar | None]:  # type: ignore[type-arg]
    """Return two corner-point geometries spanning the column's bounding box.

    A plain ``min``/``max`` over raw geometry values says nothing about spatial
    extent, so the bounding box is aggregated per axis and expressed as a
    "lowest corner" point and a "highest corner" point.

    The vertex type of the points follows the dimensions found in the data:

    - X/Y only: ``POINT(xmin ymin)`` / ``POINT(xmax ymax)``
    - with Z: ``POINT Z(xmin ymin zmin)`` / ``POINT Z(xmax ymax zmax)``
    - with M: ``POINT M(xmin ymin mmin)`` / ``POINT M(xmax ymax mmax)``
    - with Z and M: ``POINT ZM(xmin ymin zmin mmin)`` / ``POINT ZM(xmax ymax zmax mmax)``

    Args:
        conn: Connection used to run the aggregation queries.
        qualified: Table reference, already quoted for use in SQL.
        col: Geometry column reference, already quoted for use in SQL.

    Returns:
        ``(min_point, max_point)`` taken from the first row of the Arrow result
        of ``ST_GeomFromText(...)::GEOMETRY``. ``(None, None)`` when no bounds
        are available (no non-NULL geometries) or when any query fails, e.g.
        because the ``ST_*`` functions are unavailable.
    """
    try:
        # Both aggregation queries read the same non-NULL subset of the column.
        source = f" FROM {qualified} WHERE {col} IS NOT NULL"

        # A dimension counts as present if its ST_*Min is non-NULL on any row.
        flags = conn.execute(
            "SELECT"
            f" bool_or(ST_ZMin({col}) IS NOT NULL) AS has_z,"
            f" bool_or(ST_MMin({col}) IS NOT NULL) AS has_m"
            f"{source}"
        ).fetchone()
        if flags is None:
            return None, None

        axes = ["X", "Y"]
        if bool(flags[0]):
            axes.append("Z")
        if bool(flags[1]):
            axes.append("M")

        # One (min, max) pair of aggregates per axis, in axis order, e.g.
        # "min(ST_XMin(c)) AS xmin, max(ST_XMax(c)) AS xmax, ...".
        select_list = ", ".join(
            f"{agg}(ST_{axis}{agg.capitalize()}({col})) AS {axis.lower()}{agg}"
            for axis in axes
            for agg in ("min", "max")
        )
        bounds = conn.execute(f"SELECT {select_list}{source}").fetchone()
        if bounds is None:
            return None, None

        # The result row alternates low/high per axis: even slots are minima.
        lows = [bounds[2 * pos] for pos in range(len(axes))]
        highs = [bounds[2 * pos + 1] for pos in range(len(axes))]
        if lows[0] is None:
            # Aggregates over zero rows are NULL: nothing to describe.
            return None, None

        # WKT dimension tag: "", " Z", " M" or " ZM".
        extra_dims = "".join(axes[2:])
        tag = f" {extra_dims}" if extra_dims else ""
        min_wkt = f"POINT{tag}({' '.join(f'{value}' for value in lows)})"
        max_wkt = f"POINT{tag}({' '.join(f'{value}' for value in highs)})"

        # Let DuckDB build the geometry values so they come back as Arrow scalars.
        corners = conn.execute(
            "SELECT"
            f" ST_GeomFromText('{min_wkt}')::GEOMETRY AS min_pt,"
            f" ST_GeomFromText('{max_wkt}')::GEOMETRY AS max_pt"
        ).to_arrow_table()

        low_corner = corners.column("min_pt")[0]
        high_corner = corners.column("max_pt")[0]
        return (
            low_corner if low_corner.is_valid else None,
            high_corner if high_corner.is_valid else None,
        )
    except Exception:
        # Spatial extension not loaded, or column type doesn't support ST_ functions
        return None, None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _list_stats(
    conn: duckdb.DuckDBPyConnection,
    qualified: str,
    col: str,
    arrow_type: pa.DataType,
) -> tuple[pa.Scalar | None, pa.Scalar | None]:  # type: ignore[type-arg]
    """Summarise a list column by the extremes of its child elements.

    Comparing whole lists gives no useful bounds, so the smallest and largest
    *elements* across all lists are computed with ``list_min``/``list_max``
    and each is returned wrapped in a one-element list scalar.

    Args:
        conn: Connection used to run the aggregation query.
        qualified: Table reference, already quoted for use in SQL.
        col: List column reference, already quoted for use in SQL.
        arrow_type: Arrow type of the column. Fixed-size list types are
            replaced by a variable-length list of the same value type, because
            a one-element scalar cannot carry the original fixed size.

    Returns:
        ``(min_list, max_list)`` as Arrow list scalars, or ``(None, None)`` if
        there is no non-null child element or the query/conversion fails.
    """
    try:
        extremes = conn.execute(
            "SELECT"
            f" [min(list_min({col}))] AS min_val,"
            f" [max(list_max({col}))] AS max_val"
            f" FROM {qualified}"
            f" WHERE {col} IS NOT NULL"
        ).to_arrow_table()
        low_wrapped = extremes.column("min_val")[0].as_py()
        high_wrapped = extremes.column("max_val")[0].as_py()

        # ``[None]`` means the aggregate found no element (all lists empty).
        for wrapped in (low_wrapped, high_wrapped):
            if wrapped is None or wrapped == [None]:
                return None, None

        if pa.types.is_fixed_size_list(arrow_type):
            # A 1-element scalar cannot be typed as a fixed-size ARRAY.
            scalar_type = pa.list_(arrow_type.value_type)
        else:
            scalar_type = arrow_type
        return pa.scalar(low_wrapped, type=scalar_type), pa.scalar(high_wrapped, type=scalar_type)
    except Exception:
        return None, None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def statistics_from_duckdb(
    conn: duckdb.DuckDBPyConnection,
    table_name: str,
    *,
    schema_name: str | None = None,
) -> dict[str, ColumnStatisticsInput]:
    """Extract column statistics from a DuckDB table.

    Queries the table for min, max, approximate distinct count, and null counts
    per column. Returns a dict mapping column names to
    :class:`ColumnStatisticsInput` instances with properly typed PyArrow scalars.

    Table, schema and column names are emitted as quoted SQL identifiers with
    embedded double quotes doubled, so names containing ``"`` are handled
    correctly instead of producing malformed SQL.

    Special column type handling:

    - **Geometry**: computes the spatial bounding box and sends two corner-point
      geometries so that DuckDB's ``GeometryStats`` can reconstruct the correct
      extent for spatial filter pushdown.
    - **List**: uses ``list_min``/``list_max`` to find child element extremes,
      then wraps them in single-element lists so DuckDB's ``ListStats`` tracks
      the correct child element bounds.
    - **Dictionary-encoded** (e.g. ENUM): min/max are unwrapped to the
      dictionary's value type, and string-length checks cast to ``VARCHAR``.

    Args:
        conn: An open DuckDB connection.
        table_name: Name of the table to query.
        schema_name: Optional schema name. If provided, the table is referenced
            as ``schema_name.table_name``.

    Returns:
        Dict mapping column names to ``ColumnStatisticsInput``, suitable for
        passing directly to ``Table(statistics=...)``.

    """

    def _quote(identifier: str) -> str:
        # Inside a quoted identifier a literal '"' is written as '""'.
        return '"' + identifier.replace('"', '""') + '"'

    qualified = f"{_quote(schema_name)}.{_quote(table_name)}" if schema_name else _quote(table_name)

    # Get the table schema via a zero-row Arrow query
    schema: pa.Schema = conn.execute(f"SELECT * FROM {qualified} LIMIT 0").to_arrow_table().schema

    result: dict[str, ColumnStatisticsInput] = {}

    for field in schema:
        col = _quote(field.name)

        # Count nulls/non-nulls and distinct values (works for all types)
        count_table = conn.execute(
            f"SELECT"
            f" approx_count_distinct({col}) AS distinct_count,"
            f" count({col}) AS non_null_count,"
            f" (count(*) - count({col})) AS null_count"
            f" FROM {qualified}"
        ).to_arrow_table()
        distinct_count: int = count_table.column("distinct_count")[0].as_py()
        non_null_count: int = count_table.column("non_null_count")[0].as_py()
        null_count: int = count_table.column("null_count")[0].as_py()

        # Compute min/max — dispatch by column type
        min_val: pa.Scalar | None = None  # type: ignore[type-arg]
        max_val: pa.Scalar | None = None  # type: ignore[type-arg]

        is_geom = _is_geometry_column(conn, qualified, col)
        if is_geom:
            min_val, max_val = _geometry_stats(conn, qualified, col)
        elif (
            pa.types.is_list(field.type)
            or pa.types.is_large_list(field.type)
            or pa.types.is_fixed_size_list(field.type)
        ):
            min_val, max_val = _list_stats(conn, qualified, col, field.type)
        else:
            minmax_table = conn.execute(
                f"SELECT min({col}) AS min_val, max({col}) AS max_val FROM {qualified}"
            ).to_arrow_table()
            min_scalar = minmax_table.column("min_val")[0]
            max_scalar = minmax_table.column("max_val")[0]
            # A NULL aggregate (empty or all-NULL column) is reported as "no bound".
            min_val = min_scalar if min_scalar.is_valid else None
            max_val = max_scalar if max_scalar.is_valid else None

        # Unwrap dictionary-encoded scalars (e.g. from ENUM columns) to their
        # value type so that statistics report actual values, not dictionary indices.
        if min_val is not None and pa.types.is_dictionary(min_val.type):
            min_val = pa.scalar(min_val.as_py(), type=min_val.type.value_type)
        if max_val is not None and pa.types.is_dictionary(max_val.type):
            max_val = pa.scalar(max_val.as_py(), type=max_val.type.value_type)

        # Compute max_string_length for string/binary columns (including
        # dictionary-encoded columns with string value types like ENUMs).
        # Skip geometry columns — their Arrow type is binary but strlen/octet_length
        # don't apply to the DuckDB GEOMETRY type.
        max_string_length: int | None = None
        is_dict = pa.types.is_dictionary(field.type)
        effective_type = field.type.value_type if is_dict else field.type
        if not is_geom and (
            pa.types.is_string(effective_type)
            or pa.types.is_large_string(effective_type)
            or pa.types.is_binary(effective_type)
            or pa.types.is_large_binary(effective_type)
        ):
            # strlen returns byte length for VARCHAR; octet_length for BLOB.
            # ENUM columns need a cast to VARCHAR first.
            if pa.types.is_binary(effective_type) or pa.types.is_large_binary(effective_type):
                len_expr = f"octet_length({col})"
            elif is_dict:
                len_expr = f"strlen({col}::VARCHAR)"
            else:
                len_expr = f"strlen({col})"
            len_row = conn.execute(f"SELECT max({len_expr}) AS max_len FROM {qualified}").fetchone()
            if len_row is not None and len_row[0] is not None:
                max_string_length = int(len_row[0])

        # Compute contains_unicode for string columns: true if any value has
        # characters outside ASCII (byte length > character length).
        # Stays None for non-string columns.
        contains_unicode: bool | None = None
        if pa.types.is_string(effective_type) or pa.types.is_large_string(effective_type):
            if is_dict:
                unicode_expr = f"strlen({col}::VARCHAR) != length({col}::VARCHAR)"
            else:
                unicode_expr = f"strlen({col}) != length({col})"
            uni_row = conn.execute(f"SELECT bool_or({unicode_expr}) AS has_unicode FROM {qualified}").fetchone()
            contains_unicode = bool(uni_row[0]) if uni_row is not None and uni_row[0] is not None else False

        # Keyed by the raw (unquoted) column name.
        result[field.name] = ColumnStatisticsInput(
            min=min_val,
            max=max_val,
            has_null=null_count > 0,
            has_not_null=non_null_count > 0,
            distinct_count=distinct_count,
            max_string_length=max_string_length,
            contains_unicode=contains_unicode,
        )

    return result
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def column_statistics_from_duckdb(
    conn: duckdb.DuckDBPyConnection,
    table_name: str,
    *,
    schema_name: str | None = None,
) -> list[ColumnStatistics]:
    """Extract resolved column statistics from a DuckDB table.

    Like :func:`statistics_from_duckdb`, but returns fully resolved
    :class:`ColumnStatistics` objects with typed PyArrow scalars — ready
    to be returned from ``table_column_statistics_get()`` wrapped in a
    :class:`TableColumnStatisticsResult`.

    The schema lookup issued here quotes the table and schema names as SQL
    identifiers with embedded double quotes doubled.

    Example usage in a dynamic catalog::

        def table_column_statistics_get(self, *, attach_opaque_data, transaction_opaque_data, schema_name, name):
            conn = self._get_connection(attach_opaque_data)
            return TableColumnStatisticsResult(
                statistics=column_statistics_from_duckdb(conn, name, schema_name=schema_name),
                cache_max_age_seconds=60,
            )

    Args:
        conn: An open DuckDB connection.
        table_name: Name of the table to query.
        schema_name: Optional schema name.

    Returns:
        List of resolved ``ColumnStatistics`` objects, in table column order.

    """
    # Inside a quoted identifier a literal '"' is written as '""'.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    if schema_name:
        quoted_schema = '"' + schema_name.replace('"', '""') + '"'
        qualified = f"{quoted_schema}.{quoted_table}"
    else:
        qualified = quoted_table

    # Zero-row query: only the Arrow schema (names and types) is needed to
    # resolve each column's statistics against its field type.
    schema: pa.Schema = conn.execute(f"SELECT * FROM {qualified} LIMIT 0").to_arrow_table().schema
    stats_dict = statistics_from_duckdb(conn, table_name, schema_name=schema_name)
    return [stats_dict[field.name].resolve(field.name, field.type) for field in schema if field.name in stats_dict]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Secret type descriptor for declarative worker secret type definitions.
|
|
4
|
+
|
|
5
|
+
This module provides the SecretTypeSpec class for defining secret types
|
|
6
|
+
that are registered with DuckDB's SecretManager during ATTACH.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import ClassVar, Self, cast
|
|
11
|
+
|
|
12
|
+
import pyarrow as pa
|
|
13
|
+
from vgi_rpc.utils import serialize_record_batch_bytes
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"SecretTypeSpec",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class SecretTypeSpec:
    """Declarative description of a custom secret type registered at ATTACH.

    A spec carries the secret type's name, a description, and an Arrow schema
    whose fields are the secret's parameters (field name -> value type).
    A field is flagged for redaction in SHOW SECRETS by putting
    {"redact": "true"} in its Arrow field metadata.

    Attributes:
        name: The secret type name (e.g., "vgi_example").
        description: Human-readable description.
        schema: Arrow schema defining the secret's key-value parameters.

    Example:
        SecretTypeSpec(
            name="vgi_example",
            description="Example VGI secret for testing",
            schema=pa.schema([
                pa.field("secret_string", pa.string(), metadata={"redact": "true"}),
                pa.field("api_key", pa.string(), metadata={"redact": "true"}),
                pa.field("port", pa.int32()),
                pa.field("use_ssl", pa.bool_()),
                pa.field("timeout", pa.float64()),
            ]),
        )

    """

    name: str
    description: str
    schema: pa.Schema

    # Wire layout of one serialized spec: a single row whose
    # ``parameters_schema`` column holds the IPC-serialized parameter schema.
    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("name", pa.string(), nullable=False),
            pa.field("description", pa.string(), nullable=False),
            pa.field("parameters_schema", pa.binary(), nullable=False),
        ]  # type: ignore[arg-type]  # PyArrow field metadata typing limitation
    )

    def serialize(self) -> bytes:
        """Encode this spec as a single-row record batch in Arrow IPC bytes."""
        record = {
            "name": self.name,
            "description": self.description,
            # Serializing the schema keeps its field metadata (the redact keys).
            "parameters_schema": self.schema.serialize().to_pybytes(),
        }
        single_row = pa.RecordBatch.from_pylist([record], schema=self.ARROW_SCHEMA)
        return serialize_record_batch_bytes(single_row)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Rebuild a spec from a single-row Arrow RecordBatch."""
        from vgi_rpc.utils import _validate_single_row_batch

        fields = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["name", "description", "parameters_schema"],
        )
        # The parameter schema travels as IPC bytes; decode it back to a schema.
        raw_schema = cast(bytes, fields["parameters_schema"])
        decoded_schema = pa.ipc.read_schema(pa.py_buffer(raw_schema))

        return cls(
            name=cast(str, fields["name"]),
            description=cast(str, fields["description"]),
            schema=decoded_schema,
        )
|