cudf-polars-cu13 25.10.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/experimental/io.py
@@ -0,0 +1,943 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition IO Logic."""
+
+from __future__ import annotations
+
+import dataclasses
+import enum
+import functools
+import itertools
+import math
+import statistics
+from collections import defaultdict
+from enum import IntEnum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import polars as pl
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.ir import IR, DataFrameScan, Empty, Scan, Sink, Union
+from cudf_polars.experimental.base import (
+    ColumnSourceInfo,
+    ColumnStat,
+    ColumnStats,
+    DataSourceInfo,
+    DataSourcePair,
+    PartitionInfo,
+    UniqueStats,
+    get_key_name,
+)
+from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable, MutableMapping
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.expr import NamedExpr
+    from cudf_polars.experimental.base import StatsCollector
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.typing import Schema
+    from cudf_polars.utils.config import (
+        ConfigOptions,
+        ParquetOptions,
+        StatsPlanningOptions,
+        StreamingExecutor,
+    )
+
+
+@lower_ir_node.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    config_options = rec.state["config_options"]
+
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_tasks'"
+    )
+
+    rows_per_partition = config_options.executor.max_rows_per_partition
+    nrows = max(ir.df.shape()[0], 1)
+    count = math.ceil(nrows / rows_per_partition)
+
+    if count > 1:
+        length = math.ceil(nrows / count)
+        slices = [
+            DataFrameScan(
+                ir.schema,
+                ir.df.slice(offset, length),
+                ir.projection,
+            )
+            for offset in range(0, nrows, length)
+        ]
+        new_node = Union(ir.schema, None, *slices)
+        return new_node, {slice: PartitionInfo(count=1) for slice in slices} | {
+            new_node: PartitionInfo(count=count)
+        }
+
+    return ir, {ir: PartitionInfo(count=1)}
+
+
+class ScanPartitionFlavor(IntEnum):
+    """Flavor of Scan partitioning."""
+
+    SINGLE_FILE = enum.auto()  # 1:1 mapping between files and partitions
+    SPLIT_FILES = enum.auto()  # Split each file into >1 partition
+    FUSED_FILES = enum.auto()  # Fuse multiple files into each partition
+
+
+class ScanPartitionPlan:
+    """
+    Scan partitioning plan.
+
+    Notes
+    -----
+    The meaning of `factor` depends on the value of `flavor`:
+      - SINGLE_FILE: `factor` must be `1`.
+      - SPLIT_FILES: `factor` is the number of partitions per file.
+      - FUSED_FILES: `factor` is the number of files per partition.
+    """
+
+    __slots__ = ("factor", "flavor")
+    factor: int
+    flavor: ScanPartitionFlavor
+
+    def __init__(self, factor: int, flavor: ScanPartitionFlavor) -> None:
+        if (
+            flavor == ScanPartitionFlavor.SINGLE_FILE and factor != 1
+        ):  # pragma: no cover
+            raise ValueError(f"Expected factor == 1 for {flavor}, got: {factor}")
+        self.factor = factor
+        self.flavor = flavor
+
+    @staticmethod
+    def from_scan(
+        ir: Scan, stats: StatsCollector, config_options: ConfigOptions
+    ) -> ScanPartitionPlan:
+        """Extract the partitioning plan of a Scan operation."""
+        if ir.typ == "parquet":
+            # TODO: Use system info to set default blocksize
+            assert config_options.executor.name == "streaming", (
+                "'in-memory' executor not supported in 'generate_ir_tasks'"
+            )
+
+            blocksize: int = config_options.executor.target_partition_size
+            column_stats = stats.column_stats.get(ir, {})
+            column_sizes: list[int] = []
+            for cs in column_stats.values():
+                storage_size = cs.source_info.storage_size
+                if storage_size.value is not None:
+                    column_sizes.append(storage_size.value)
+
+            if (file_size := sum(column_sizes)) > 0:
+                if file_size > blocksize:
+                    # Split large files
+                    return ScanPartitionPlan(
+                        math.ceil(file_size / blocksize),
+                        ScanPartitionFlavor.SPLIT_FILES,
+                    )
+                else:
+                    # Fuse small files
+                    return ScanPartitionPlan(
+                        max(blocksize // int(file_size), 1),
+                        ScanPartitionFlavor.FUSED_FILES,
+                    )
+
+        # TODO: Use file sizes for csv and json
+        return ScanPartitionPlan(1, ScanPartitionFlavor.SINGLE_FILE)
+
+
+class SplitScan(IR):
+    """
+    Input from a split file.
+
+    This class wraps a single-file `Scan` object. At
+    IO/evaluation time, this class will only perform
+    a partial read of the underlying file. The range
+    (skip_rows and n_rows) is calculated at IO time.
+    """
+
+    __slots__ = (
+        "base_scan",
+        "parquet_options",
+        "schema",
+        "split_index",
+        "total_splits",
+    )
+    _non_child = (
+        "schema",
+        "base_scan",
+        "split_index",
+        "total_splits",
+        "parquet_options",
+    )
+    base_scan: Scan
+    """Scan operation this node is based on."""
+    split_index: int
+    """Index of the current split."""
+    total_splits: int
+    """Total number of splits."""
+    parquet_options: ParquetOptions
+    """Parquet-specific options."""
+
+    def __init__(
+        self,
+        schema: Schema,
+        base_scan: Scan,
+        split_index: int,
+        total_splits: int,
+        parquet_options: ParquetOptions,
+    ):
+        self.schema = schema
+        self.base_scan = base_scan
+        self.split_index = split_index
+        self.total_splits = total_splits
+        self._non_child_args = (
+            split_index,
+            total_splits,
+            *base_scan._non_child_args,
+        )
+        self.parquet_options = parquet_options
+        self.children = ()
+        if base_scan.typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(
+                f"Unhandled Scan type for file splitting: {base_scan.typ}"
+            )
+
+    @classmethod
+    def do_evaluate(
+        cls,
+        split_index: int,
+        total_splits: int,
+        schema: Schema,
+        typ: str,
+        reader_options: dict[str, Any],
+        paths: list[str],
+        with_columns: list[str] | None,
+        skip_rows: int,
+        n_rows: int,
+        row_index: tuple[str, int] | None,
+        include_file_paths: str | None,
+        predicate: NamedExpr | None,
+        parquet_options: ParquetOptions,
+    ) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        if typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}")
+
+        if len(paths) > 1:  # pragma: no cover
+            raise ValueError(f"Expected a single path, got: {paths}")
+
+        # Parquet logic:
+        # - We are one of "total_splits" SplitScan nodes
+        #   assigned to the same file.
+        # - We know our index within this file ("split_index")
+        # - We can also use parquet metadata to query the
+        #   total number of rows in each row-group of the file.
+        # - We can use all this information to calculate the
+        #   "skip_rows" and "n_rows" options to use locally.
+
+        rowgroup_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(paths)
+        ).rowgroup_metadata()
+        total_row_groups = len(rowgroup_metadata)
+        if total_splits <= total_row_groups:
+            # We have enough row-groups in the file to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Calculate which row-groups to include
+            # in the current read, and use metadata to translate
+            # the row-group indices to "skip_rows" and "n_rows".
+            rg_stride = total_row_groups // total_splits
+            skip_rgs = rg_stride * split_index
+            skip_rows = sum(rg["num_rows"] for rg in rowgroup_metadata[:skip_rgs])
+            n_rows = sum(
+                rg["num_rows"]
+                for rg in rowgroup_metadata[skip_rgs : skip_rgs + rg_stride]
+            )
+        else:
+            # There are not enough row-groups to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Use metadata to directly calculate
+            # "skip_rows" and "n_rows" for the current read.
+            total_rows = sum(rg["num_rows"] for rg in rowgroup_metadata)
+            n_rows = total_rows // total_splits
+            skip_rows = n_rows * split_index
+
+        # Last split should always read to end of file
+        if split_index == (total_splits - 1):
+            n_rows = -1
+
+        # Perform the partial read
+        return Scan.do_evaluate(
+            schema,
+            typ,
+            reader_options,
+            paths,
+            with_columns,
+            skip_rows,
+            n_rows,
+            row_index,
+            include_file_paths,
+            predicate,
+            parquet_options,
+        )
+
+
+@lower_ir_node.register(Empty)
+def _(
+    ir: Empty, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
+
+
+@lower_ir_node.register(Scan)
+def _(
+    ir: Scan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    partition_info: MutableMapping[IR, PartitionInfo]
+    config_options = rec.state["config_options"]
+    if (
+        ir.typ in ("csv", "parquet", "ndjson")
+        and ir.n_rows == -1
+        and ir.skip_rows == 0
+        and ir.row_index is None
+    ):
+        plan = ScanPartitionPlan.from_scan(ir, rec.state["stats"], config_options)
+        paths = list(ir.paths)
+        if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
+            # Disable chunked reader when splitting files
+            parquet_options = dataclasses.replace(
+                config_options.parquet_options,
+                chunked=False,
+            )
+
+            slices: list[SplitScan] = []
+            for path in paths:
+                base_scan = Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    [path],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.include_file_paths,
+                    ir.predicate,
+                    parquet_options,
+                )
+                slices.extend(
+                    SplitScan(
+                        ir.schema, base_scan, sindex, plan.factor, parquet_options
+                    )
+                    for sindex in range(plan.factor)
+                )
+            new_node = Union(ir.schema, None, *slices)
+            partition_info = {slice: PartitionInfo(count=1) for slice in slices} | {
+                new_node: PartitionInfo(count=len(slices))
+            }
+        else:
+            groups: list[Scan] = [
+                Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    paths[i : i + plan.factor],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.include_file_paths,
+                    ir.predicate,
+                    config_options.parquet_options,
+                )
+                for i in range(0, len(paths), plan.factor)
+            ]
+            new_node = Union(ir.schema, None, *groups)
+            partition_info = {group: PartitionInfo(count=1) for group in groups} | {
+                new_node: PartitionInfo(count=len(groups))
+            }
+        return new_node, partition_info
+
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
+
+
+class StreamingSink(IR):
+    """Sink a dataframe in streaming mode."""
+
+    __slots__ = ("executor_options", "sink")
+    _non_child = ("schema", "sink", "executor_options")
+
+    sink: Sink
+    executor_options: StreamingExecutor
+
+    def __init__(
+        self,
+        schema: Schema,
+        sink: Sink,
+        executor_options: StreamingExecutor,
+        df: IR,
+    ):
+        self.schema = schema
+        self.sink = sink
+        self.executor_options = executor_options
+        self.children = (df,)
+
+    def get_hashable(self) -> Hashable:
+        """Hashable representation of the node."""
+        return (type(self), self.sink, *self.children)
+
+
+@lower_ir_node.register(Sink)
+def _(
+    ir: Sink, rec: LowerIRTransformer
+) -> tuple[StreamingSink, MutableMapping[IR, PartitionInfo]]:
+    child, partition_info = rec(ir.children[0])
+    executor_options = rec.state["config_options"].executor
+
+    assert executor_options.name == "streaming", (
+        "'in-memory' executor not supported in 'lower_ir_node'"
+    )
+
+    # TODO: Support cloud storage
+    if Path(ir.path).exists() and executor_options.sink_to_directory:
+        raise NotImplementedError(
+            "Writing to an existing path is not supported when sinking "
+            "to a directory. If you are using the 'distributed' scheduler, "
+            "please remove the target directory before calling 'collect'. "
+        )
+
+    new_node = StreamingSink(
+        ir.schema,
+        ir.reconstruct([child]),
+        executor_options,
+        child,
+    )
+    partition_info[new_node] = partition_info[child]
+    return new_node, partition_info
+
+
+def _prepare_sink_directory(path: str) -> None:
+    """Prepare for a multi-partition sink."""
+    # TODO: Support cloud storage
+    Path(path).mkdir(parents=True)
+
+
+def _sink_to_directory(
+    schema: Schema,
+    kind: str,
+    path: str,
+    parquet_options: ParquetOptions,
+    options: dict[str, Any],
+    df: DataFrame,
+    ready: None,
+) -> DataFrame:
+    """Sink a partition to a new file."""
+    return Sink.do_evaluate(schema, kind, path, parquet_options, options, df)
+
+
+def _sink_to_parquet_file(
+    path: str,
+    options: dict[str, Any],
+    finalize: bool,  # noqa: FBT001
+    writer: plc.io.parquet.ChunkedParquetWriter | None,
+    df: DataFrame,
+) -> plc.io.parquet.ChunkedParquetWriter | DataFrame:
+    """Sink a partition to an open Parquet file."""
+    # Set up a new chunked Parquet writer if necessary.
+    if writer is None:
+        metadata = Sink._make_parquet_metadata(df)
+        sink = plc.io.types.SinkInfo([path])
+        builder = Sink._apply_parquet_writer_options(
+            plc.io.parquet.ChunkedParquetWriterOptions.builder(sink), options
+        )
+        writer_options = builder.metadata(metadata).build()
+        writer = plc.io.parquet.ChunkedParquetWriter.from_options(writer_options)
+
+    # Append to the open Parquet file.
+    assert isinstance(writer, plc.io.parquet.ChunkedParquetWriter), (
+        "ChunkedParquetWriter is required."
+    )
+    writer.write(df.table)
+
+    # Finalize or return active writer.
+    if finalize:
+        writer.close([])
+        return df
+    else:
+        return writer
+
+
+def _sink_to_file(
+    kind: str,
+    path: str,
+    options: dict[str, Any],
+    finalize: bool,  # noqa: FBT001
+    writer_state: Any,
+    df: DataFrame,
+) -> Any:
+    """Sink a partition to an open file."""
+    if kind == "Parquet":
+        # Parquet writer will pass along a
+        # ChunkedParquetWriter "writer state".
+        return _sink_to_parquet_file(
+            path,
+            options,
+            finalize,
+            writer_state,
+            df,
+        )
+    elif kind == "Csv":
+        use_options = options.copy()
+        if writer_state is None:
+            mode = "wb"
+        else:
+            mode = "ab"
+            use_options["include_header"] = False
+        with Path.open(Path(path), mode) as f:
+            sink = plc.io.types.SinkInfo([f])
+            Sink._write_csv(sink, use_options, df)
+    elif kind == "Json":
+        mode = "wb" if writer_state is None else "ab"
+        with Path.open(Path(path), mode) as f:
+            sink = plc.io.types.SinkInfo([f])
+            Sink._write_json(sink, df)
+    else:  # pragma: no cover; Shouldn't get here.
+        raise NotImplementedError(f"{kind} not yet supported in _sink_to_file")
+
+    # Default return type is bool | DataFrame.
+    # We only return a DataFrame for the final sink task.
+    # The other tasks return a "ready" signal of True.
+    return df if finalize else True
+
+
+def _file_sink_graph(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """Sink to a single file."""
+    name = get_key_name(ir)
+    count = partition_info[ir].count
+    child_name = get_key_name(ir.children[0])
+    sink = ir.sink
+    if count == 1:
+        return {
+            (name, 0): (
+                sink.do_evaluate,
+                *sink._non_child_args,
+                (child_name, 0),
+            )
+        }
+
+    sink_name = get_key_name(sink)
+    graph: MutableMapping[Any, Any] = {
+        (sink_name, i): (
+            _sink_to_file,
+            sink.kind,
+            sink.path,
+            sink.options,
+            i == count - 1,  # Whether to finalize
+            None if i == 0 else (sink_name, i - 1),  # Writer state
+            (child_name, i),
+        )
+        for i in range(count)
+    }
+
+    # Make sure final tasks point to empty DataFrame output
+    graph.update({(name, i): (sink_name, count - 1) for i in range(count)})
+    return graph
+
+
+def _directory_sink_graph(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """Sink to a directory of files."""
+    name = get_key_name(ir)
+    count = partition_info[ir].count
+    child_name = get_key_name(ir.children[0])
+    sink = ir.sink
+
+    setup_name = f"setup-{name}"
+    suffix = sink.kind.lower()
+    width = math.ceil(math.log10(count))
+    graph: MutableMapping[Any, Any] = {
+        (name, i): (
+            _sink_to_directory,
+            sink.schema,
+            sink.kind,
+            f"{sink.path}/part.{str(i).zfill(width)}.{suffix}",
+            sink.parquet_options,
+            sink.options,
+            (child_name, i),
+            setup_name,
+        )
+        for i in range(count)
+    }
+    graph[setup_name] = (_prepare_sink_directory, sink.path)
+    return graph
+
+
+@generate_ir_tasks.register(StreamingSink)
+def _(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    if ir.executor_options.sink_to_directory:
+        return _directory_sink_graph(ir, partition_info)
+    else:
+        return _file_sink_graph(ir, partition_info)
+
+
+class ParquetMetadata:
+    """
+    Parquet metadata container.
+
+    Parameters
+    ----------
+    paths
+        Parquet-dataset paths.
+    max_footer_samples
+        Maximum number of file footers to sample metadata from.
+    """
+
+    __slots__ = (
+        "column_names",
+        "max_footer_samples",
+        "mean_size_per_file",
+        "num_row_groups_per_file",
+        "paths",
+        "row_count",
+        "sample_paths",
+    )
+
+    paths: tuple[str, ...]
+    """Parquet-dataset paths."""
+    max_footer_samples: int
+    """Maximum number of file footers to sample metadata from."""
+    row_count: ColumnStat[int]
+    """Total row-count estimate."""
+    num_row_groups_per_file: tuple[int, ...]
+    """Number of row groups in each sampled file."""
+    mean_size_per_file: dict[str, ColumnStat[int]]
+    """Average column storage size in a single file."""
+    column_names: tuple[str, ...]
+    """All column names found it the dataset."""
+    sample_paths: tuple[str, ...]
+    """Sampled file paths."""
+
+    def __init__(self, paths: tuple[str, ...], max_footer_samples: int):
+        self.paths = paths
+        self.max_footer_samples = max_footer_samples
+        self.row_count = ColumnStat[int]()
+        self.num_row_groups_per_file = ()
+        self.mean_size_per_file = {}
+        self.column_names = ()
+        stride = (
+            max(1, int(len(paths) / max_footer_samples)) if max_footer_samples else 1
+        )
+        self.sample_paths = paths[: stride * max_footer_samples : stride]
+
+        if not self.sample_paths:
+            # No paths to sample from
+            return
+
+        total_file_count = len(self.paths)
+        sampled_file_count = len(self.sample_paths)
+        exact: bool = False
+        sample_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(list(self.sample_paths))
+        )
+
+        if total_file_count == sampled_file_count:
+            # We know the "exact" row_count from our sample
+            row_count = sample_metadata.num_rows()
+            exact = True
+        else:
+            # We must estimate/extrapolate the row_count from our sample
+            num_rows_per_sampled_file = int(
+                sample_metadata.num_rows() / sampled_file_count
+            )
+            row_count = num_rows_per_sampled_file * total_file_count
+
+        num_row_groups_per_sampled_file = sample_metadata.num_rowgroups_per_file()
+        rowgroup_offsets_per_file = list(
+            itertools.accumulate(num_row_groups_per_sampled_file, initial=0)
+        )
+
+        column_sizes_per_file = {
+            name: [
+                sum(uncompressed_sizes[start:end])
+                for (start, end) in itertools.pairwise(rowgroup_offsets_per_file)
+            ]
+            for name, uncompressed_sizes in sample_metadata.columnchunk_metadata().items()
+        }
+
+        self.column_names = tuple(column_sizes_per_file)
+        self.mean_size_per_file = {
+            name: ColumnStat[int](value=int(statistics.mean(sizes)))
+            for name, sizes in column_sizes_per_file.items()
+        }
+        self.num_row_groups_per_file = tuple(num_row_groups_per_sampled_file)
+        self.row_count.value = row_count
+        self.row_count.exact = exact
+
+
+class ParquetSourceInfo(DataSourceInfo):
+    """
+    Parquet datasource information.
+
+    Parameters
+    ----------
+    paths
+        Parquet-dataset paths.
+    max_footer_samples
+        Maximum number of file footers to sample metadata from.
+    max_row_group_samples
+        Maximum number of row-groups to sample data from.
+    stats_planning
+        Statistics planning options.
+    """
+
+    def __init__(
+        self,
+        paths: tuple[str, ...],
+        max_footer_samples: int,
+        max_row_group_samples: int,
+        stats_planning: StatsPlanningOptions,
+    ):
+        self.paths = paths
+        self.max_footer_samples = max_footer_samples
+        self.max_row_group_samples = max_row_group_samples
+        self._stats_planning = stats_planning
+        self._unique_stats_columns = set()
+        # Helper attributes
+        self._key_columns: set[str] = set()  # Used to fuse lazy row-group sampling
+        self._unique_stats: dict[str, UniqueStats] = {}
+
+    @functools.cached_property
+    def metadata(self) -> ParquetMetadata:
+        """Return Parquet metadata."""
+        return ParquetMetadata(self.paths, self.max_footer_samples)
+
+    @property
+    def row_count(self) -> ColumnStat[int]:
+        """Data source row-count estimate."""
+        return self.metadata.row_count
+
+    def _sample_row_groups(self) -> None:
+        """Estimate unique-value statistics from a row-group sample."""
+        if (
+            self.max_row_group_samples < 1
+            or not self._stats_planning.use_sampling
+            or not (sample_paths := self.metadata.sample_paths)
+        ):
+            # No sampling allowed or no row-groups to sample from
+            return
+
+        column_names = self.metadata.column_names
+        if not (
+            key_columns := [key for key in self._key_columns if key in column_names]
+        ):  # pragma: no cover; should never get here
+            # No key columns found in the file
+            raise ValueError(f"None of {self._key_columns} in {column_names}")
+
+        sampled_file_count = len(sample_paths)
+        num_row_groups_per_file = self.metadata.num_row_groups_per_file
+        if (
+            self.row_count.value is None
+            or len(num_row_groups_per_file) != sampled_file_count
+        ):
+            raise ValueError("Parquet metadata sampling failed.")  # pragma: no cover
+
+        n = 0
+        samples: defaultdict[str, list[int]] = defaultdict(list)
+        for path, num_rgs in zip(sample_paths, num_row_groups_per_file, strict=True):
+            for rg_id in range(num_rgs):
+                n += 1
+                samples[path].append(rg_id)
+                if n == self.max_row_group_samples:
+                    break
+            if n == self.max_row_group_samples:
+                break
+
+        exact = sampled_file_count == len(
+            self.paths
+        ) and self.max_row_group_samples >= sum(num_row_groups_per_file)
+
+        options = plc.io.parquet.ParquetReaderOptions.builder(
+            plc.io.SourceInfo(list(samples))
+        ).build()
+        options.set_columns(key_columns)
+        options.set_row_groups(list(samples.values()))
+        tbl_w_meta = plc.io.parquet.read_parquet(options)
+        row_group_num_rows = tbl_w_meta.tbl.num_rows()
+        for name, column in zip(
+            tbl_w_meta.column_names(), tbl_w_meta.columns, strict=True
+        ):
+            row_group_unique_count = plc.stream_compaction.distinct_count(
+                column,
+                plc.types.NullPolicy.INCLUDE,
+                plc.types.NanPolicy.NAN_IS_NULL,
+            )
+            fraction = row_group_unique_count / row_group_num_rows
+            # Assume that if every row is unique then this is a
+            # primary key otherwise it's a foreign key and we
+            # can't use the single row group count estimate.
+            # Example, consider a "foreign" key that has 100
+            # unique values. If we sample from a single row group,
+            # we likely obtain a unique count of 100. But we can't
+            # necessarily deduce that that means that the unique
+            # count is 100 / num_rows_in_group * num_rows_in_file
+            count: int | None = None
+            if exact:
+                count = row_group_unique_count
+            elif row_group_unique_count == row_group_num_rows:
+                count = self.row_count.value
+            self._unique_stats[name] = UniqueStats(
+                ColumnStat[int](value=count, exact=exact),
+                ColumnStat[float](value=fraction, exact=exact),
+            )
+
+    def _update_unique_stats(self, column: str) -> None:
+        if column not in self._unique_stats and column in self.metadata.column_names:
+            self.add_unique_stats_column(column)
+            self._sample_row_groups()
+            self._key_columns = set()
+
+    def unique_stats(self, column: str) -> UniqueStats:
+        """Return unique-value statistics for a column."""
+        self._update_unique_stats(column)
+        return self._unique_stats.get(column, UniqueStats())
+
+    def storage_size(self, column: str) -> ColumnStat[int]:
+        """Return the average column size for a single file."""
+        return self.metadata.mean_size_per_file.get(column, ColumnStat[int]())
+
+    def add_unique_stats_column(self, column: str) -> None:
+        """Add a column needing unique-value information."""
+        self._unique_stats_columns.add(column)
+        if column not in self._key_columns and column not in self._unique_stats:
+            self._key_columns.add(column)
+
+
+@functools.cache
+def _sample_pq_stats(
+    paths: tuple[str, ...],
+    max_footer_samples: int,
+    max_row_group_samples: int,
+    stats_planning: StatsPlanningOptions,
+) -> ParquetSourceInfo:
+    """Return Parquet datasource information."""
+    return ParquetSourceInfo(
+        paths,
+        max_footer_samples,
+        max_row_group_samples,
+        stats_planning,
+    )
+
+
+def _extract_scan_stats(
+    ir: Scan,
+    config_options: ConfigOptions,
+) -> dict[str, ColumnStats]:
+    """Extract base ColumnStats for a Scan node."""
+    if ir.typ == "parquet":
+        assert config_options.executor.name == "streaming", (
+            "Only streaming executor is supported in _extract_scan_stats"
+        )
+        table_source_info = _sample_pq_stats(
+            tuple(ir.paths),
+            config_options.parquet_options.max_footer_samples,
+            config_options.parquet_options.max_row_group_samples,
+            config_options.executor.stats_planning,
+        )
+        return {
+            name: ColumnStats(
+                name=name,
+                source_info=ColumnSourceInfo(DataSourcePair(table_source_info, name)),
+            )
+            for name in ir.schema
+        }
+
+    else:
+        return {name: ColumnStats(name=name) for name in ir.schema}
+
+
+class DataFrameSourceInfo(DataSourceInfo):
+    """
+    In-memory DataFrame source information.
+
+    Parameters
+    ----------
+    df
+        In-memory DataFrame source.
+    stats_planning
+        Statistics planning options.
+    """
+
+    def __init__(
+        self,
+        df: Any,
+        stats_planning: StatsPlanningOptions,
+    ):
+        self._df = df
+        self._stats_planning = stats_planning
+        self._key_columns: set[str] = set()
+        self._unique_stats_columns = set()
+        self._unique_stats: dict[str, UniqueStats] = {}
+
+    @functools.cached_property
+    def row_count(self) -> ColumnStat[int]:
+        """Data source row-count estimate."""
+        return ColumnStat[int](value=self._df.height(), exact=True)
+
+    def _update_unique_stats(self, column: str) -> None:
+        if column not in self._unique_stats and self._stats_planning.use_sampling:
+            row_count = self.row_count.value
+            try:
+                unique_count = (
+                    self._df.get_column(column).approx_n_unique() if row_count else 0
+                )
+            except pl.exceptions.InvalidOperationError:  # pragma: no cover
+                unique_count = self._df.get_column(column).n_unique()
+            unique_fraction = min((unique_count / row_count), 1.0) if row_count else 1.0
+            self._unique_stats[column] = UniqueStats(
+                ColumnStat[int](value=unique_count),
+                ColumnStat[float](value=unique_fraction),
+            )
+
+    def unique_stats(self, column: str) -> UniqueStats:
+        """Return unique-value statistics for a column."""
+        self._update_unique_stats(column)
+        return self._unique_stats.get(column, UniqueStats())
+
+
+def _extract_dataframescan_stats(
+    ir: DataFrameScan, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+    """Extract base ColumnStats for a DataFrameScan node."""
+    assert config_options.executor.name == "streaming", (
+        "Only streaming executor is supported in _extract_dataframescan_stats"
+    )
+    table_source_info = DataFrameSourceInfo(
+        ir.df,
+        config_options.executor.stats_planning,
+    )
+    return {
+        name: ColumnStats(
+            name=name,
+            source_info=ColumnSourceInfo(DataSourcePair(table_source_info, name)),
+        )
+        for name in ir.schema
+    }
+
+
+def _clear_source_info_cache() -> None:
+    """Clear DataSourceInfo caches."""
+    # TODO: Avoid clearing the cache if we can
+    # check that the underlying data hasn't changed.
+
+    # Clear ParquetSourceInfo cache
+    _sample_pq_stats.cache_clear()
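
Illustrative usage (not part of the wheel contents): the module above is exercised when a Polars query is collected with the cudf-polars streaming executor. The sketch below is a minimal, hypothetical example. `pl.GPUEngine` and `collect(engine=...)` are public Polars API, but the `executor`/`executor_options` spelling and the option values are assumptions inferred from the configuration attributes this file reads (`target_partition_size`, `max_rows_per_partition`), not from a documented signature.

import polars as pl

# Hypothetical configuration: option names mirror the config attributes read by
# io.py (config_options.executor.target_partition_size / .max_rows_per_partition).
engine = pl.GPUEngine(
    executor="streaming",
    executor_options={
        # Rough per-partition budget: files larger than this are split into
        # SplitScan nodes, while smaller files are fused into shared partitions.
        "target_partition_size": 1024**3,
        # In-memory DataFrameScan inputs are sliced into Union branches of at
        # most this many rows.
        "max_rows_per_partition": 1_000_000,
    },
)

# "dataset/*.parquet" is a placeholder path; the query is lowered through
# lower_ir_node(Scan) and partitioned per ScanPartitionPlan.from_scan.
q = pl.scan_parquet("dataset/*.parquet").filter(pl.col("value") > 0)
result = q.collect(engine=engine)

As a worked example of ScanPartitionPlan.from_scan with a 1 GiB target_partition_size: a file whose sampled per-column storage sizes sum to 3.2 GiB yields ScanPartitionPlan(math.ceil(3.2) = 4, SPLIT_FILES), i.e. four SplitScan partitions for that file, while files of roughly 200 MiB yield ScanPartitionPlan(1 GiB // 200 MiB = 5, FUSED_FILES), i.e. five files fused into each Scan partition.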