vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,377 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Extract column statistics from DuckDB tables.
4
+
5
+ Provides a helper to query a DuckDB connection and produce
6
+ :class:`~vgi.catalog.descriptors.ColumnStatisticsInput` dicts
7
+ ready for use in ``Table(statistics=...)``.
8
+
9
+ Example::
10
+
11
+ import duckdb
12
+ from vgi.catalog.duckdb_statistics import statistics_from_duckdb
13
+
14
+ conn = duckdb.connect("my_data.duckdb")
15
+ stats = statistics_from_duckdb(conn, "my_table")
16
+
17
+ Table(
18
+ name="my_table",
19
+ columns=...,
20
+ statistics=stats,
21
+ statistics_cache_max_age_seconds=3600,
22
+ )
23
+
24
+ Geometry columns are handled specially: instead of meaningless ``min``/``max``
25
+ of the raw WKB blobs, the helper computes the spatial bounding box of the
26
+ dataset and sends two corner points so that DuckDB's ``GeometryStats`` can
27
+ reconstruct the correct spatial extent for filter pushdown.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ from typing import TYPE_CHECKING
33
+
34
+ import pyarrow as pa
35
+
36
+ from vgi.catalog.catalog_interface import ColumnStatistics
37
+ from vgi.catalog.descriptors import ColumnStatisticsInput
38
+
39
+ if TYPE_CHECKING:
40
+ import duckdb
41
+
42
__all__ = ["column_statistics_from_duckdb", "statistics_from_duckdb"]

# DuckDB type names that indicate a spatial column requiring special handling.
# ``_is_geometry_column`` compares the result of ``typeof(<column>)`` against
# this set; matching columns get bounding-box corner points from
# ``_geometry_stats`` instead of a plain ``min``/``max``.
_GEOMETRY_TYPE_NAMES = frozenset({"GEOMETRY", "POINT_2D", "LINESTRING_2D", "POLYGON_2D", "BOX_2D"})
46
+
47
+
48
+ def _is_geometry_column(conn: duckdb.DuckDBPyConnection, qualified: str, col: str) -> bool:
49
+ """Check if a column is a geometry type by querying DuckDB's typeof()."""
50
+ try:
51
+ row = conn.execute(f"SELECT typeof({col}) FROM {qualified} WHERE {col} IS NOT NULL LIMIT 1").fetchone()
52
+ return row is not None and row[0] in _GEOMETRY_TYPE_NAMES
53
+ except Exception:
54
+ return False
55
+
56
+
57
+ def _geometry_stats(
58
+ conn: duckdb.DuckDBPyConnection,
59
+ qualified: str,
60
+ col: str,
61
+ ) -> tuple[pa.Scalar | None, pa.Scalar | None]: # type: ignore[type-arg]
62
+ """Compute min/max geometry scalars as bounding-box corner points.
63
+
64
+ For geometry columns, ``min``/``max`` of the raw WKB is meaningless for
65
+ spatial filtering. Instead, we compute the spatial bounding box and return
66
+ two corner-point geometries whose union covers the full extent.
67
+
68
+ Handles all vertex types:
69
+
70
+ - **XY** (2D): ``POINT(xmin ymin)`` / ``POINT(xmax ymax)``
71
+ - **XYZ** (3D): ``POINT Z(xmin ymin zmin)`` / ``POINT Z(xmax ymax zmax)``
72
+ - **XYM**: ``POINT M(xmin ymin mmin)`` / ``POINT M(xmax ymax mmax)``
73
+ - **XYZM**: ``POINT ZM(xmin ymin zmin mmin)`` / ``POINT ZM(xmax ymax zmax mmax)``
74
+
75
+ When the C++ side calls ``GeometryStats::Update`` on each, the resulting
76
+ ``GeometryExtent`` is the correct overall bounding box in all dimensions.
77
+
78
+ Returns (min_point, max_point) as Arrow binary scalars (WKB), or
79
+ (None, None) if the column has no non-null geometries or the spatial
80
+ extension is not loaded.
81
+ """
82
+ try:
83
+ # Detect which dimensions are present by checking if Z/M functions
84
+ # return non-NULL for any row
85
+ dim_row = conn.execute(
86
+ f"SELECT"
87
+ f" bool_or(ST_ZMin({col}) IS NOT NULL) AS has_z,"
88
+ f" bool_or(ST_MMin({col}) IS NOT NULL) AS has_m"
89
+ f" FROM {qualified}"
90
+ f" WHERE {col} IS NOT NULL"
91
+ ).fetchone()
92
+
93
+ if dim_row is None:
94
+ return None, None
95
+
96
+ has_z = bool(dim_row[0])
97
+ has_m = bool(dim_row[1])
98
+
99
+ # Build the aggregation query for all present dimensions
100
+ agg_parts = [
101
+ f"min(ST_XMin({col})) AS xmin",
102
+ f"max(ST_XMax({col})) AS xmax",
103
+ f"min(ST_YMin({col})) AS ymin",
104
+ f"max(ST_YMax({col})) AS ymax",
105
+ ]
106
+ if has_z:
107
+ agg_parts += [f"min(ST_ZMin({col})) AS zmin", f"max(ST_ZMax({col})) AS zmax"]
108
+ if has_m:
109
+ agg_parts += [f"min(ST_MMin({col})) AS mmin", f"max(ST_MMax({col})) AS mmax"]
110
+
111
+ bounds = conn.execute(f"SELECT {', '.join(agg_parts)} FROM {qualified} WHERE {col} IS NOT NULL").fetchone()
112
+
113
+ if bounds is None:
114
+ return None, None
115
+
116
+ xmin, xmax, ymin, ymax = bounds[0], bounds[1], bounds[2], bounds[3]
117
+ if xmin is None:
118
+ return None, None
119
+
120
+ # Build WKT for the corner points with the correct vertex type
121
+ idx = 4
122
+ if has_z and has_m:
123
+ zmin, zmax = bounds[idx], bounds[idx + 1]
124
+ mmin, mmax = bounds[idx + 2], bounds[idx + 3]
125
+ dim_label = "ZM"
126
+ min_coords = f"{xmin} {ymin} {zmin} {mmin}"
127
+ max_coords = f"{xmax} {ymax} {zmax} {mmax}"
128
+ elif has_z:
129
+ zmin, zmax = bounds[idx], bounds[idx + 1]
130
+ dim_label = "Z"
131
+ min_coords = f"{xmin} {ymin} {zmin}"
132
+ max_coords = f"{xmax} {ymax} {zmax}"
133
+ elif has_m:
134
+ mmin, mmax = bounds[idx], bounds[idx + 1]
135
+ dim_label = "M"
136
+ min_coords = f"{xmin} {ymin} {mmin}"
137
+ max_coords = f"{xmax} {ymax} {mmax}"
138
+ else:
139
+ dim_label = ""
140
+ min_coords = f"{xmin} {ymin}"
141
+ max_coords = f"{xmax} {ymax}"
142
+
143
+ dim_suffix = f" {dim_label}" if dim_label else ""
144
+ min_wkt = f"POINT{dim_suffix}({min_coords})"
145
+ max_wkt = f"POINT{dim_suffix}({max_coords})"
146
+
147
+ arrow_table = conn.execute(
148
+ f"SELECT"
149
+ f" ST_GeomFromText('{min_wkt}')::GEOMETRY AS min_pt,"
150
+ f" ST_GeomFromText('{max_wkt}')::GEOMETRY AS max_pt"
151
+ ).to_arrow_table()
152
+
153
+ min_scalar = arrow_table.column("min_pt")[0]
154
+ max_scalar = arrow_table.column("max_pt")[0]
155
+ return (
156
+ min_scalar if min_scalar.is_valid else None,
157
+ max_scalar if max_scalar.is_valid else None,
158
+ )
159
+ except Exception:
160
+ # Spatial extension not loaded, or column type doesn't support ST_ functions
161
+ return None, None
162
+
163
+
164
+ def _list_stats(
165
+ conn: duckdb.DuckDBPyConnection,
166
+ qualified: str,
167
+ col: str,
168
+ arrow_type: pa.DataType,
169
+ ) -> tuple[pa.Scalar | None, pa.Scalar | None]: # type: ignore[type-arg]
170
+ """Compute min/max for list columns using child element extremes.
171
+
172
+ For list columns, ``min``/``max`` of the list values themselves is not useful
173
+ for statistics. Instead, we compute the min/max of the child elements across
174
+ all lists using ``list_min``/``list_max``, then wrap them in single-element
175
+ lists so that ``FromConstant([child_min])`` + ``Merge(FromConstant([child_max]))``
176
+ produces the correct ``ListStats`` with child element bounds.
177
+
178
+ Returns (min_list, max_list) as Arrow list scalars, or (None, None) if there
179
+ are no non-null child elements.
180
+ """
181
+ try:
182
+ arrow_table = conn.execute(
183
+ f"SELECT"
184
+ f" [min(list_min({col}))] AS min_val,"
185
+ f" [max(list_max({col}))] AS max_val"
186
+ f" FROM {qualified}"
187
+ f" WHERE {col} IS NOT NULL"
188
+ ).to_arrow_table()
189
+ min_scalar = arrow_table.column("min_val")[0]
190
+ max_scalar = arrow_table.column("max_val")[0]
191
+ # Check if the inner element is null (all lists were empty)
192
+ min_inner = min_scalar.as_py()
193
+ max_inner = max_scalar.as_py()
194
+ if min_inner is None or min_inner == [None]:
195
+ return None, None
196
+ if max_inner is None or max_inner == [None]:
197
+ return None, None
198
+ # Wrap child extremes in a regular list type (works for both LIST and ARRAY columns).
199
+ # For fixed-size ARRAY types, we can't create a 1-element scalar with the original
200
+ # type (size mismatch), so we use a variable-length list instead. DuckDB's
201
+ # FromConstant handles both LIST_STATS and ARRAY_STATS identically for child bounds.
202
+ list_type = pa.list_(arrow_type.value_type) if pa.types.is_fixed_size_list(arrow_type) else arrow_type
203
+ return (
204
+ pa.scalar(min_inner, type=list_type),
205
+ pa.scalar(max_inner, type=list_type),
206
+ )
207
+ except Exception:
208
+ return None, None
209
+
210
+
211
def _quote_identifier(name: str) -> str:
    """Return ``name`` as a double-quoted SQL identifier with embedded quotes doubled."""
    return '"' + name.replace('"', '""') + '"'


def statistics_from_duckdb(
    conn: duckdb.DuckDBPyConnection,
    table_name: str,
    *,
    schema_name: str | None = None,
) -> dict[str, ColumnStatisticsInput]:
    """Extract column statistics from a DuckDB table.

    Queries the table for min, max, approximate distinct count, and null counts
    per column. Returns a dict mapping column names to
    :class:`ColumnStatisticsInput` instances with properly typed PyArrow scalars.

    Table, schema and column names are always emitted as double-quoted
    identifiers with embedded ``"`` characters doubled, so a name containing a
    quote is treated as a name and cannot alter the generated SQL.

    Special column type handling:

    - **Geometry**: computes the spatial bounding box and sends two corner-point
      geometries so that DuckDB's ``GeometryStats`` can reconstruct the correct
      extent for spatial filter pushdown.
    - **List**: uses ``list_min``/``list_max`` to find child element extremes,
      then wraps them in single-element lists so DuckDB's ``ListStats`` tracks
      the correct child element bounds.
    - **Dictionary-encoded** (e.g. ENUM): min/max are unwrapped to the value
      type, and string-length checks cast the column to ``VARCHAR`` first.

    Args:
        conn: An open DuckDB connection.
        table_name: Name of the table to query.
        schema_name: Optional schema name. If provided (and non-empty), the
            table is referenced as ``schema_name.table_name``.

    Returns:
        Dict mapping column names to ``ColumnStatisticsInput``, suitable for
        passing directly to ``Table(statistics=...)``. Empty when the table has
        no columns.

    Raises:
        Exception: Errors from the schema, count, min/max and length queries
            propagate from ``conn.execute``. Only the geometry and list helpers
            swallow their own failures.

    """
    table_ref = _quote_identifier(table_name)
    qualified = f"{_quote_identifier(schema_name)}.{table_ref}" if schema_name else table_ref

    # Get the table schema via a zero-row Arrow query
    schema: pa.Schema = conn.execute(f"SELECT * FROM {qualified} LIMIT 0").to_arrow_table().schema

    result: dict[str, ColumnStatisticsInput] = {}

    for field in schema:
        col = _quote_identifier(field.name)

        # Count nulls/non-nulls and distinct values (works for all types)
        count_table = conn.execute(
            f"SELECT"
            f" approx_count_distinct({col}) AS distinct_count,"
            f" count({col}) AS non_null_count,"
            f" (count(*) - count({col})) AS null_count"
            f" FROM {qualified}"
        ).to_arrow_table()
        distinct_count: int = count_table.column("distinct_count")[0].as_py()
        non_null_count: int = count_table.column("non_null_count")[0].as_py()
        null_count: int = count_table.column("null_count")[0].as_py()

        # Compute min/max — dispatch by column type
        min_val: pa.Scalar | None = None  # type: ignore[type-arg]
        max_val: pa.Scalar | None = None  # type: ignore[type-arg]

        is_geom = _is_geometry_column(conn, qualified, col)
        if is_geom:
            min_val, max_val = _geometry_stats(conn, qualified, col)
        elif (
            pa.types.is_list(field.type)
            or pa.types.is_large_list(field.type)
            or pa.types.is_fixed_size_list(field.type)
        ):
            min_val, max_val = _list_stats(conn, qualified, col, field.type)
        else:
            minmax_table = conn.execute(
                f"SELECT min({col}) AS min_val, max({col}) AS max_val FROM {qualified}"
            ).to_arrow_table()
            min_scalar = minmax_table.column("min_val")[0]
            max_scalar = minmax_table.column("max_val")[0]
            # An invalid (NULL) scalar means the column has no non-NULL values.
            min_val = min_scalar if min_scalar.is_valid else None
            max_val = max_scalar if max_scalar.is_valid else None

        # Unwrap dictionary-encoded scalars (e.g. from ENUM columns) to their
        # value type so that statistics report actual values, not dictionary indices.
        if min_val is not None and pa.types.is_dictionary(min_val.type):
            min_val = pa.scalar(min_val.as_py(), type=min_val.type.value_type)
        if max_val is not None and pa.types.is_dictionary(max_val.type):
            max_val = pa.scalar(max_val.as_py(), type=max_val.type.value_type)

        # Compute max_string_length for string/binary columns (including
        # dictionary-encoded columns with string value types like ENUMs).
        # Skip geometry columns — their Arrow type is binary but strlen/octet_length
        # don't apply to the DuckDB GEOMETRY type.
        max_string_length: int | None = None
        is_dict = pa.types.is_dictionary(field.type)
        effective_type = field.type.value_type if is_dict else field.type
        is_text = pa.types.is_string(effective_type) or pa.types.is_large_string(effective_type)
        is_blob = pa.types.is_binary(effective_type) or pa.types.is_large_binary(effective_type)
        if not is_geom and (is_text or is_blob):
            # strlen returns byte length for VARCHAR; octet_length for BLOB.
            # ENUM columns need a cast to VARCHAR first.
            if is_blob:
                len_expr = f"octet_length({col})"
            elif is_dict:
                len_expr = f"strlen({col}::VARCHAR)"
            else:
                len_expr = f"strlen({col})"
            len_row = conn.execute(f"SELECT max({len_expr}) AS max_len FROM {qualified}").fetchone()
            if len_row is not None and len_row[0] is not None:
                max_string_length = int(len_row[0])

        # Compute contains_unicode for string columns: true if any value has
        # characters outside ASCII (byte length > character length).
        contains_unicode: bool | None = None
        if is_text:
            if is_dict:
                unicode_expr = f"strlen({col}::VARCHAR) != length({col}::VARCHAR)"
            else:
                unicode_expr = f"strlen({col}) != length({col})"
            uni_row = conn.execute(f"SELECT bool_or({unicode_expr}) AS has_unicode FROM {qualified}").fetchone()
            # bool_or over zero non-NULL rows yields NULL; report that as False.
            contains_unicode = bool(uni_row[0]) if uni_row is not None and uni_row[0] is not None else False

        result[field.name] = ColumnStatisticsInput(
            min=min_val,
            max=max_val,
            has_null=null_count > 0,
            has_not_null=non_null_count > 0,
            distinct_count=distinct_count,
            max_string_length=max_string_length,
            contains_unicode=contains_unicode,
        )

    return result
341
+
342
+
343
def column_statistics_from_duckdb(
    conn: duckdb.DuckDBPyConnection,
    table_name: str,
    *,
    schema_name: str | None = None,
) -> list[ColumnStatistics]:
    """Extract resolved column statistics from a DuckDB table.

    Like :func:`statistics_from_duckdb`, but returns fully resolved
    :class:`ColumnStatistics` objects with typed PyArrow scalars — ready
    to be returned from ``table_column_statistics_get()`` wrapped in a
    :class:`TableColumnStatisticsResult`.

    The table and schema names are emitted as double-quoted identifiers with
    embedded ``"`` characters doubled, so a name containing a quote cannot
    alter the schema query issued here.

    Example usage in a dynamic catalog::

        def table_column_statistics_get(self, *, attach_opaque_data, transaction_opaque_data, schema_name, name):
            conn = self._get_connection(attach_opaque_data)
            return TableColumnStatisticsResult(
                statistics=column_statistics_from_duckdb(conn, name, schema_name=schema_name),
                cache_max_age_seconds=60,
            )

    Args:
        conn: An open DuckDB connection.
        table_name: Name of the table to query.
        schema_name: Optional schema name.

    Returns:
        List of resolved ``ColumnStatistics`` objects, in table column order.

    """

    def _quoted(identifier: str) -> str:
        # Double embedded quotes so the name stays a single SQL identifier.
        return '"' + identifier.replace('"', '""') + '"'

    table_ref = _quoted(table_name)
    qualified = f"{_quoted(schema_name)}.{table_ref}" if schema_name else table_ref

    # The zero-row query supplies each column's Arrow type for resolve().
    schema: pa.Schema = conn.execute(f"SELECT * FROM {qualified} LIMIT 0").to_arrow_table().schema
    stats_dict = statistics_from_duckdb(conn, table_name, schema_name=schema_name)
    return [stats_dict[field.name].resolve(field.name, field.type) for field in schema if field.name in stats_dict]
@@ -0,0 +1,96 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Secret type descriptor for declarative worker secret type definitions.
4
+
5
+ This module provides the SecretTypeSpec class for defining secret types
6
+ that are registered with DuckDB's SecretManager during ATTACH.
7
+ """
8
+
9
+ from dataclasses import dataclass
10
+ from typing import ClassVar, Self, cast
11
+
12
+ import pyarrow as pa
13
+ from vgi_rpc.utils import serialize_record_batch_bytes
14
+
15
+ __all__ = [
16
+ "SecretTypeSpec",
17
+ ]
18
+
19
+
20
@dataclass(frozen=True)
class SecretTypeSpec:
    """Declarative description of a custom secret type registered at ATTACH.

    A spec carries the secret type's name, a description, and an Arrow schema
    whose fields are the secret's parameters (field name is the key, field
    type is the value type). A parameter that must be hidden in SHOW SECRETS
    is flagged with ``{"redact": "true"}`` in its Arrow field metadata.

    Attributes:
        name: The secret type name (e.g., "vgi_example").
        description: Human-readable description.
        schema: Arrow schema defining the secret's key-value parameters.

    Example:
        SecretTypeSpec(
            name="vgi_example",
            description="Example VGI secret for testing",
            schema=pa.schema([
                pa.field("secret_string", pa.string(), metadata={"redact": "true"}),
                pa.field("api_key", pa.string(), metadata={"redact": "true"}),
                pa.field("port", pa.int32()),
                pa.field("use_ssl", pa.bool_()),
                pa.field("timeout", pa.float64()),
            ]),
        )

    """

    name: str
    description: str
    schema: pa.Schema

    # Wire layout of one serialized spec: a single row of three non-nullable
    # columns, with the parameter schema carried as serialized bytes.
    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field(column, column_type, nullable=False)
            for column, column_type in (
                ("name", pa.string()),
                ("description", pa.string()),
                ("parameters_schema", pa.binary()),
            )
        ]  # type: ignore[arg-type]  # PyArrow field metadata typing limitation
    )

    def serialize(self) -> bytes:
        """Encode this spec as the bytes of a one-row Arrow record batch."""
        record = {
            "name": self.name,
            "description": self.description,
            # The schema is serialized whole, so field metadata (the redact
            # markers) travels with it.
            "parameters_schema": self.schema.serialize().to_pybytes(),
        }
        single_row = pa.RecordBatch.from_pylist([record], schema=self.ARROW_SCHEMA)
        return serialize_record_batch_bytes(single_row)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Rebuild a spec from a record batch produced by :meth:`serialize`."""
        from vgi_rpc.utils import _validate_single_row_batch

        fields = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["name", "description", "parameters_schema"],
        )
        # The parameter schema arrives as serialized bytes; decode it back to a Schema.
        raw_schema = cast(bytes, fields["parameters_schema"])
        decoded_schema = pa.ipc.read_schema(pa.py_buffer(raw_schema))

        return cls(
            name=cast(str, fields["name"]),
            description=cast(str, fields["description"]),
            schema=decoded_schema,
        )