cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects package changes exactly as they appear in the public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/experimental/io.py
@@ -0,0 +1,943 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition IO Logic."""
+
+from __future__ import annotations
+
+import dataclasses
+import enum
+import functools
+import itertools
+import math
+import statistics
+from collections import defaultdict
+from enum import IntEnum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import polars as pl
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.ir import IR, DataFrameScan, Empty, Scan, Sink, Union
+from cudf_polars.experimental.base import (
+    ColumnSourceInfo,
+    ColumnStat,
+    ColumnStats,
+    DataSourceInfo,
+    DataSourcePair,
+    PartitionInfo,
+    UniqueStats,
+    get_key_name,
+)
+from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable, MutableMapping
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.expr import NamedExpr
+    from cudf_polars.experimental.base import StatsCollector
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.typing import Schema
+    from cudf_polars.utils.config import (
+        ConfigOptions,
+        ParquetOptions,
+        StatsPlanningOptions,
+        StreamingExecutor,
+    )
+
+
+@lower_ir_node.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    config_options = rec.state["config_options"]
+
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_tasks'"
+    )
+
+    rows_per_partition = config_options.executor.max_rows_per_partition
+    nrows = max(ir.df.shape()[0], 1)
+    count = math.ceil(nrows / rows_per_partition)
+
+    if count > 1:
+        length = math.ceil(nrows / count)
+        slices = [
+            DataFrameScan(
+                ir.schema,
+                ir.df.slice(offset, length),
+                ir.projection,
+            )
+            for offset in range(0, nrows, length)
+        ]
+        new_node = Union(ir.schema, None, *slices)
+        return new_node, {slice: PartitionInfo(count=1) for slice in slices} | {
+            new_node: PartitionInfo(count=count)
+        }
+
+    return ir, {ir: PartitionInfo(count=1)}
+
+
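As a concrete illustration of the slicing arithmetic in the DataFrameScan lowering above, here is a standalone sketch (the row counts are hypothetical and not taken from the package):

import math

# A 250-row frame with max_rows_per_partition=100 becomes 3 slices.
nrows = 250
rows_per_partition = 100

count = math.ceil(nrows / rows_per_partition)  # 3 partitions
length = math.ceil(nrows / count)              # 84 rows per slice
offsets = list(range(0, nrows, length))        # [0, 84, 168]

# The final slice is shorter (82 rows); DataFrame.slice simply clamps the
# requested length to the rows that remain.
assert count == 3 and offsets == [0, 84, 168]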
+class ScanPartitionFlavor(IntEnum):
+    """Flavor of Scan partitioning."""
+
+    SINGLE_FILE = enum.auto()  # 1:1 mapping between files and partitions
+    SPLIT_FILES = enum.auto()  # Split each file into >1 partition
+    FUSED_FILES = enum.auto()  # Fuse multiple files into each partition
+
+
+class ScanPartitionPlan:
+    """
+    Scan partitioning plan.
+
+    Notes
+    -----
+    The meaning of `factor` depends on the value of `flavor`:
+      - SINGLE_FILE: `factor` must be `1`.
+      - SPLIT_FILES: `factor` is the number of partitions per file.
+      - FUSED_FILES: `factor` is the number of files per partition.
+    """
+
+    __slots__ = ("factor", "flavor")
+    factor: int
+    flavor: ScanPartitionFlavor
+
+    def __init__(self, factor: int, flavor: ScanPartitionFlavor) -> None:
+        if (
+            flavor == ScanPartitionFlavor.SINGLE_FILE and factor != 1
+        ):  # pragma: no cover
+            raise ValueError(f"Expected factor == 1 for {flavor}, got: {factor}")
+        self.factor = factor
+        self.flavor = flavor
+
+    @staticmethod
+    def from_scan(
+        ir: Scan, stats: StatsCollector, config_options: ConfigOptions
+    ) -> ScanPartitionPlan:
+        """Extract the partitioning plan of a Scan operation."""
+        if ir.typ == "parquet":
+            # TODO: Use system info to set default blocksize
+            assert config_options.executor.name == "streaming", (
+                "'in-memory' executor not supported in 'generate_ir_tasks'"
+            )
+
+            blocksize: int = config_options.executor.target_partition_size
+            column_stats = stats.column_stats.get(ir, {})
+            column_sizes: list[int] = []
+            for cs in column_stats.values():
+                storage_size = cs.source_info.storage_size
+                if storage_size.value is not None:
+                    column_sizes.append(storage_size.value)
+
+            if (file_size := sum(column_sizes)) > 0:
+                if file_size > blocksize:
+                    # Split large files
+                    return ScanPartitionPlan(
+                        math.ceil(file_size / blocksize),
+                        ScanPartitionFlavor.SPLIT_FILES,
+                    )
+                else:
+                    # Fuse small files
+                    return ScanPartitionPlan(
+                        max(blocksize // int(file_size), 1),
+                        ScanPartitionFlavor.FUSED_FILES,
+                    )
+
+        # TODO: Use file sizes for csv and json
+        return ScanPartitionPlan(1, ScanPartitionFlavor.SINGLE_FILE)
+
+
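The size heuristic in ScanPartitionPlan.from_scan can be summarized with a standalone sketch; the 1 GiB target partition size below is hypothetical, the real value comes from config_options.executor.target_partition_size:

import math

def choose_plan(file_size: int, blocksize: int) -> tuple[str, int]:
    # Mirrors the branch above: split large files, fuse small ones
    # (assumes file_size > 0, as guarded by the walrus check above).
    if file_size > blocksize:
        return "SPLIT_FILES", math.ceil(file_size / blocksize)
    return "FUSED_FILES", max(blocksize // file_size, 1)

blocksize = 1024**3  # hypothetical 1 GiB target partition size
assert choose_plan(3 * 1024**3, blocksize) == ("SPLIT_FILES", 3)    # 3 partitions per file
assert choose_plan(256 * 1024**2, blocksize) == ("FUSED_FILES", 4)  # 4 files per partition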
+class SplitScan(IR):
+    """
+    Input from a split file.
+
+    This class wraps a single-file `Scan` object. At
+    IO/evaluation time, this class will only perform
+    a partial read of the underlying file. The range
+    (skip_rows and n_rows) is calculated at IO time.
+    """
+
+    __slots__ = (
+        "base_scan",
+        "parquet_options",
+        "schema",
+        "split_index",
+        "total_splits",
+    )
+    _non_child = (
+        "schema",
+        "base_scan",
+        "split_index",
+        "total_splits",
+        "parquet_options",
+    )
+    base_scan: Scan
+    """Scan operation this node is based on."""
+    split_index: int
+    """Index of the current split."""
+    total_splits: int
+    """Total number of splits."""
+    parquet_options: ParquetOptions
+    """Parquet-specific options."""
+
+    def __init__(
+        self,
+        schema: Schema,
+        base_scan: Scan,
+        split_index: int,
+        total_splits: int,
+        parquet_options: ParquetOptions,
+    ):
+        self.schema = schema
+        self.base_scan = base_scan
+        self.split_index = split_index
+        self.total_splits = total_splits
+        self._non_child_args = (
+            split_index,
+            total_splits,
+            *base_scan._non_child_args,
+        )
+        self.parquet_options = parquet_options
+        self.children = ()
+        if base_scan.typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(
+                f"Unhandled Scan type for file splitting: {base_scan.typ}"
+            )
+
+    @classmethod
+    def do_evaluate(
+        cls,
+        split_index: int,
+        total_splits: int,
+        schema: Schema,
+        typ: str,
+        reader_options: dict[str, Any],
+        paths: list[str],
+        with_columns: list[str] | None,
+        skip_rows: int,
+        n_rows: int,
+        row_index: tuple[str, int] | None,
+        include_file_paths: str | None,
+        predicate: NamedExpr | None,
+        parquet_options: ParquetOptions,
+    ) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        if typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}")
+
+        if len(paths) > 1:  # pragma: no cover
+            raise ValueError(f"Expected a single path, got: {paths}")
+
+        # Parquet logic:
+        # - We are one of "total_splits" SplitScan nodes
+        #   assigned to the same file.
+        # - We know our index within this file ("split_index")
+        # - We can also use parquet metadata to query the
+        #   total number of rows in each row-group of the file.
+        # - We can use all this information to calculate the
+        #   "skip_rows" and "n_rows" options to use locally.
+
+        rowgroup_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(paths)
+        ).rowgroup_metadata()
+        total_row_groups = len(rowgroup_metadata)
+        if total_splits <= total_row_groups:
+            # We have enough row-groups in the file to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Calculate which row-groups to include
+            # in the current read, and use metadata to translate
+            # the row-group indices to "skip_rows" and "n_rows".
+            rg_stride = total_row_groups // total_splits
+            skip_rgs = rg_stride * split_index
+            skip_rows = sum(rg["num_rows"] for rg in rowgroup_metadata[:skip_rgs])
+            n_rows = sum(
+                rg["num_rows"]
+                for rg in rowgroup_metadata[skip_rgs : skip_rgs + rg_stride]
+            )
+        else:
+            # There are not enough row-groups to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Use metadata to directly calculate
+            # "skip_rows" and "n_rows" for the current read.
+            total_rows = sum(rg["num_rows"] for rg in rowgroup_metadata)
+            n_rows = total_rows // total_splits
+            skip_rows = n_rows * split_index
+
+        # Last split should always read to end of file
+        if split_index == (total_splits - 1):
+            n_rows = -1
+
+        # Perform the partial read
+        return Scan.do_evaluate(
+            schema,
+            typ,
+            reader_options,
+            paths,
+            with_columns,
+            skip_rows,
+            n_rows,
+            row_index,
+            include_file_paths,
+            predicate,
+            parquet_options,
+        )
+
+
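To make the read-range arithmetic in SplitScan.do_evaluate concrete, here is a standalone sketch of the row-group-aligned branch (the row-group sizes are hypothetical):

# A file with 4 row groups of 100, 100, 100 and 50 rows, read by 2 splits.
rowgroup_num_rows = [100, 100, 100, 50]
total_splits = 2

rg_stride = len(rowgroup_num_rows) // total_splits  # 2 row groups per split
for split_index in range(total_splits):
    skip_rgs = rg_stride * split_index
    skip_rows = sum(rowgroup_num_rows[:skip_rgs])
    n_rows = sum(rowgroup_num_rows[skip_rgs : skip_rgs + rg_stride])
    if split_index == total_splits - 1:
        n_rows = -1  # the last split always reads to the end of the file
    print(split_index, skip_rows, n_rows)
# Prints: "0 0 200" then "1 200 -1"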
+@lower_ir_node.register(Empty)
+def _(
+    ir: Empty, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
+
+
+@lower_ir_node.register(Scan)
+def _(
+    ir: Scan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    partition_info: MutableMapping[IR, PartitionInfo]
+    config_options = rec.state["config_options"]
+    if (
+        ir.typ in ("csv", "parquet", "ndjson")
+        and ir.n_rows == -1
+        and ir.skip_rows == 0
+        and ir.row_index is None
+    ):
+        plan = ScanPartitionPlan.from_scan(ir, rec.state["stats"], config_options)
+        paths = list(ir.paths)
+        if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
+            # Disable chunked reader when splitting files
+            parquet_options = dataclasses.replace(
+                config_options.parquet_options,
+                chunked=False,
+            )
+
+            slices: list[SplitScan] = []
+            for path in paths:
+                base_scan = Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    [path],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.include_file_paths,
+                    ir.predicate,
+                    parquet_options,
+                )
+                slices.extend(
+                    SplitScan(
+                        ir.schema, base_scan, sindex, plan.factor, parquet_options
+                    )
+                    for sindex in range(plan.factor)
+                )
+            new_node = Union(ir.schema, None, *slices)
+            partition_info = {slice: PartitionInfo(count=1) for slice in slices} | {
+                new_node: PartitionInfo(count=len(slices))
+            }
+        else:
+            groups: list[Scan] = [
+                Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    paths[i : i + plan.factor],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.include_file_paths,
+                    ir.predicate,
+                    config_options.parquet_options,
+                )
+                for i in range(0, len(paths), plan.factor)
+            ]
+            new_node = Union(ir.schema, None, *groups)
+            partition_info = {group: PartitionInfo(count=1) for group in groups} | {
+                new_node: PartitionInfo(count=len(groups))
+            }
+        return new_node, partition_info
+
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
+
+
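To see how the two branches above change the partition layout, here is a standalone sketch with hypothetical file names and factors:

# SPLIT_FILES with factor=2: each of the 6 files yields 2 SplitScan partitions.
paths = [f"part-{i}.parquet" for i in range(6)]
split_partitions = [(path, sindex) for path in paths for sindex in range(2)]
assert len(split_partitions) == 12

# FUSED_FILES with factor=3: each group of 3 files becomes one Scan partition.
fused_groups = [paths[i : i + 3] for i in range(0, len(paths), 3)]
assert len(fused_groups) == 2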
+class StreamingSink(IR):
+    """Sink a dataframe in streaming mode."""
+
+    __slots__ = ("executor_options", "sink")
+    _non_child = ("schema", "sink", "executor_options")
+
+    sink: Sink
+    executor_options: StreamingExecutor
+
+    def __init__(
+        self,
+        schema: Schema,
+        sink: Sink,
+        executor_options: StreamingExecutor,
+        df: IR,
+    ):
+        self.schema = schema
+        self.sink = sink
+        self.executor_options = executor_options
+        self.children = (df,)
+
+    def get_hashable(self) -> Hashable:
+        """Hashable representation of the node."""
+        return (type(self), self.sink, *self.children)
+
+
+@lower_ir_node.register(Sink)
+def _(
+    ir: Sink, rec: LowerIRTransformer
+) -> tuple[StreamingSink, MutableMapping[IR, PartitionInfo]]:
+    child, partition_info = rec(ir.children[0])
+    executor_options = rec.state["config_options"].executor
+
+    assert executor_options.name == "streaming", (
+        "'in-memory' executor not supported in 'lower_ir_node'"
+    )
+
+    # TODO: Support cloud storage
+    if Path(ir.path).exists() and executor_options.sink_to_directory:
+        raise NotImplementedError(
+            "Writing to an existing path is not supported when sinking "
+            "to a directory. If you are using the 'distributed' scheduler, "
+            "please remove the target directory before calling 'collect'. "
+        )
+
+    new_node = StreamingSink(
+        ir.schema,
+        ir.reconstruct([child]),
+        executor_options,
+        child,
+    )
+    partition_info[new_node] = partition_info[child]
+    return new_node, partition_info
+
+
+def _prepare_sink_directory(path: str) -> None:
+    """Prepare for a multi-partition sink."""
+    # TODO: Support cloud storage
+    Path(path).mkdir(parents=True)
+
+
+def _sink_to_directory(
+    schema: Schema,
+    kind: str,
+    path: str,
+    parquet_options: ParquetOptions,
+    options: dict[str, Any],
+    df: DataFrame,
+    ready: None,
+) -> DataFrame:
+    """Sink a partition to a new file."""
+    return Sink.do_evaluate(schema, kind, path, parquet_options, options, df)
+
+
+def _sink_to_parquet_file(
+    path: str,
+    options: dict[str, Any],
+    finalize: bool,  # noqa: FBT001
+    writer: plc.io.parquet.ChunkedParquetWriter | None,
+    df: DataFrame,
+) -> plc.io.parquet.ChunkedParquetWriter | DataFrame:
+    """Sink a partition to an open Parquet file."""
+    # Set up a new chunked Parquet writer if necessary.
+    if writer is None:
+        metadata = Sink._make_parquet_metadata(df)
+        sink = plc.io.types.SinkInfo([path])
+        builder = Sink._apply_parquet_writer_options(
+            plc.io.parquet.ChunkedParquetWriterOptions.builder(sink), options
+        )
+        writer_options = builder.metadata(metadata).build()
+        writer = plc.io.parquet.ChunkedParquetWriter.from_options(writer_options)
+
+    # Append to the open Parquet file.
+    assert isinstance(writer, plc.io.parquet.ChunkedParquetWriter), (
+        "ChunkedParquetWriter is required."
+    )
+    writer.write(df.table)
+
+    # Finalize or return active writer.
+    if finalize:
+        writer.close([])
+        return df
+    else:
+        return writer
+
+
+def _sink_to_file(
+    kind: str,
+    path: str,
+    options: dict[str, Any],
+    finalize: bool,  # noqa: FBT001
+    writer_state: Any,
+    df: DataFrame,
+) -> Any:
+    """Sink a partition to an open file."""
+    if kind == "Parquet":
+        # Parquet writer will pass along a
+        # ChunkedParquetWriter "writer state".
+        return _sink_to_parquet_file(
+            path,
+            options,
+            finalize,
+            writer_state,
+            df,
+        )
+    elif kind == "Csv":
+        use_options = options.copy()
+        if writer_state is None:
+            mode = "wb"
+        else:
+            mode = "ab"
+            use_options["include_header"] = False
+        with Path.open(Path(path), mode) as f:
+            sink = plc.io.types.SinkInfo([f])
+            Sink._write_csv(sink, use_options, df)
+    elif kind == "Json":
+        mode = "wb" if writer_state is None else "ab"
+        with Path.open(Path(path), mode) as f:
+            sink = plc.io.types.SinkInfo([f])
+            Sink._write_json(sink, df)
+    else:  # pragma: no cover; Shouldn't get here.
+        raise NotImplementedError(f"{kind} not yet supported in _sink_to_file")
+
+    # Default return type is bool | DataFrame.
+    # We only return a DataFrame for the final sink task.
+    # The other tasks return a "ready" signal of True.
+    return df if finalize else True
+
+
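The Parquet branch above threads a ChunkedParquetWriter from one task to the next, while the CSV and JSON branches reopen the target file in append mode. A minimal sketch of the same chaining pattern, using a plain text file instead of a pylibcudf writer (illustration only; it writes a small local file):

from pathlib import Path

def sink_chunk(path: str, chunk: str, finalize: bool, writer_state):
    mode = "w" if writer_state is None else "a"  # first task creates, later tasks append
    with Path(path).open(mode) as f:
        f.write(chunk)
    return chunk if finalize else True  # only the finalizing task returns the result

state = None
for i, chunk in enumerate(["a\n", "b\n", "c\n"]):
    state = sink_chunk("out.txt", chunk, finalize=(i == 2), writer_state=state)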
+def _file_sink_graph(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """Sink to a single file."""
+    name = get_key_name(ir)
+    count = partition_info[ir].count
+    child_name = get_key_name(ir.children[0])
+    sink = ir.sink
+    if count == 1:
+        return {
+            (name, 0): (
+                sink.do_evaluate,
+                *sink._non_child_args,
+                (child_name, 0),
+            )
+        }
+
+    sink_name = get_key_name(sink)
+    graph: MutableMapping[Any, Any] = {
+        (sink_name, i): (
+            _sink_to_file,
+            sink.kind,
+            sink.path,
+            sink.options,
+            i == count - 1,  # Whether to finalize
+            None if i == 0 else (sink_name, i - 1),  # Writer state
+            (child_name, i),
+        )
+        for i in range(count)
+    }
+
+    # Make sure final tasks point to empty DataFrame output
+    graph.update({(name, i): (sink_name, count - 1) for i in range(count)})
+    return graph
+
+
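For count > 1 the graph above serializes the writes by handing each task's writer state to the next one. A sketch of the resulting graph shape for three partitions, with hypothetical key names and the callables written as strings:

graph = {
    ("sink", 0): ("_sink_to_file", "Parquet", "out.pq", {}, False, None, ("child", 0)),
    ("sink", 1): ("_sink_to_file", "Parquet", "out.pq", {}, False, ("sink", 0), ("child", 1)),
    ("sink", 2): ("_sink_to_file", "Parquet", "out.pq", {}, True, ("sink", 1), ("child", 2)),
    # Every StreamingSink output key aliases the final (finalizing) sink task.
    ("streaming-sink", 0): ("sink", 2),
    ("streaming-sink", 1): ("sink", 2),
    ("streaming-sink", 2): ("sink", 2),
}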
+def _directory_sink_graph(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """Sink to a directory of files."""
+    name = get_key_name(ir)
+    count = partition_info[ir].count
+    child_name = get_key_name(ir.children[0])
+    sink = ir.sink
+
+    setup_name = f"setup-{name}"
+    suffix = sink.kind.lower()
+    width = math.ceil(math.log10(count))
+    graph: MutableMapping[Any, Any] = {
+        (name, i): (
+            _sink_to_directory,
+            sink.schema,
+            sink.kind,
+            f"{sink.path}/part.{str(i).zfill(width)}.{suffix}",
+            sink.parquet_options,
+            sink.options,
+            (child_name, i),
+            setup_name,
+        )
+        for i in range(count)
+    }
+    graph[setup_name] = (_prepare_sink_directory, sink.path)
+    return graph
+
+
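A standalone sketch of the part-file naming used above; the zero-padding width is derived from the partition count:

import math

count = 12
width = math.ceil(math.log10(count))  # 2 digits for 12 partitions
names = [f"part.{str(i).zfill(width)}.parquet" for i in range(count)]
assert names[0] == "part.00.parquet" and names[-1] == "part.11.parquet"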
+@generate_ir_tasks.register(StreamingSink)
+def _(
+    ir: StreamingSink, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    if ir.executor_options.sink_to_directory:
+        return _directory_sink_graph(ir, partition_info)
+    else:
+        return _file_sink_graph(ir, partition_info)
+
+
+class ParquetMetadata:
+    """
+    Parquet metadata container.
+
+    Parameters
+    ----------
+    paths
+        Parquet-dataset paths.
+    max_footer_samples
+        Maximum number of file footers to sample metadata from.
+    """
+
+    __slots__ = (
+        "column_names",
+        "max_footer_samples",
+        "mean_size_per_file",
+        "num_row_groups_per_file",
+        "paths",
+        "row_count",
+        "sample_paths",
+    )
+
+    paths: tuple[str, ...]
+    """Parquet-dataset paths."""
+    max_footer_samples: int
+    """Maximum number of file footers to sample metadata from."""
+    row_count: ColumnStat[int]
+    """Total row-count estimate."""
+    num_row_groups_per_file: tuple[int, ...]
+    """Number of row groups in each sampled file."""
+    mean_size_per_file: dict[str, ColumnStat[int]]
+    """Average column storage size in a single file."""
+    column_names: tuple[str, ...]
+    """All column names found in the dataset."""
+    sample_paths: tuple[str, ...]
+    """Sampled file paths."""
+
+    def __init__(self, paths: tuple[str, ...], max_footer_samples: int):
+        self.paths = paths
+        self.max_footer_samples = max_footer_samples
+        self.row_count = ColumnStat[int]()
+        self.num_row_groups_per_file = ()
+        self.mean_size_per_file = {}
+        self.column_names = ()
+        stride = (
+            max(1, int(len(paths) / max_footer_samples)) if max_footer_samples else 1
+        )
+        self.sample_paths = paths[: stride * max_footer_samples : stride]
+
+        if not self.sample_paths:
+            # No paths to sample from
+            return
+
+        total_file_count = len(self.paths)
+        sampled_file_count = len(self.sample_paths)
+        exact: bool = False
+        sample_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(list(self.sample_paths))
+        )
+
+        if total_file_count == sampled_file_count:
+            # We know the "exact" row_count from our sample
+            row_count = sample_metadata.num_rows()
+            exact = True
+        else:
+            # We must estimate/extrapolate the row_count from our sample
+            num_rows_per_sampled_file = int(
+                sample_metadata.num_rows() / sampled_file_count
+            )
+            row_count = num_rows_per_sampled_file * total_file_count
+
+        num_row_groups_per_sampled_file = sample_metadata.num_rowgroups_per_file()
+        rowgroup_offsets_per_file = list(
+            itertools.accumulate(num_row_groups_per_sampled_file, initial=0)
+        )
+
+        column_sizes_per_file = {
+            name: [
+                sum(uncompressed_sizes[start:end])
+                for (start, end) in itertools.pairwise(rowgroup_offsets_per_file)
+            ]
+            for name, uncompressed_sizes in sample_metadata.columnchunk_metadata().items()
+        }
+
+        self.column_names = tuple(column_sizes_per_file)
+        self.mean_size_per_file = {
+            name: ColumnStat[int](value=int(statistics.mean(sizes)))
+            for name, sizes in column_sizes_per_file.items()
+        }
+        self.num_row_groups_per_file = tuple(num_row_groups_per_sampled_file)
+        self.row_count.value = row_count
+        self.row_count.exact = exact
+
+
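A standalone sketch of the footer-sampling stride computed in ParquetMetadata.__init__, using hypothetical paths: at most max_footer_samples footers are read, spread evenly across the dataset.

paths = tuple(f"f{i}.parquet" for i in range(10))
max_footer_samples = 3

stride = max(1, int(len(paths) / max_footer_samples)) if max_footer_samples else 1
sample_paths = paths[: stride * max_footer_samples : stride]
assert stride == 3
assert sample_paths == ("f0.parquet", "f3.parquet", "f6.parquet")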
+class ParquetSourceInfo(DataSourceInfo):
+    """
+    Parquet datasource information.
+
+    Parameters
+    ----------
+    paths
+        Parquet-dataset paths.
+    max_footer_samples
+        Maximum number of file footers to sample metadata from.
+    max_row_group_samples
+        Maximum number of row-groups to sample data from.
+    stats_planning
+        Statistics planning options.
+    """
+
+    def __init__(
+        self,
+        paths: tuple[str, ...],
+        max_footer_samples: int,
+        max_row_group_samples: int,
+        stats_planning: StatsPlanningOptions,
+    ):
+        self.paths = paths
+        self.max_footer_samples = max_footer_samples
+        self.max_row_group_samples = max_row_group_samples
+        self._stats_planning = stats_planning
+        self._unique_stats_columns = set()
+        # Helper attributes
+        self._key_columns: set[str] = set()  # Used to fuse lazy row-group sampling
+        self._unique_stats: dict[str, UniqueStats] = {}
+
+    @functools.cached_property
+    def metadata(self) -> ParquetMetadata:
+        """Return Parquet metadata."""
+        return ParquetMetadata(self.paths, self.max_footer_samples)
+
+    @property
+    def row_count(self) -> ColumnStat[int]:
+        """Data source row-count estimate."""
+        return self.metadata.row_count
+
+    def _sample_row_groups(self) -> None:
+        """Estimate unique-value statistics from a row-group sample."""
+        if (
+            self.max_row_group_samples < 1
+            or not self._stats_planning.use_sampling
+            or not (sample_paths := self.metadata.sample_paths)
+        ):
+            # No sampling allowed or no row-groups to sample from
+            return
+
+        column_names = self.metadata.column_names
+        if not (
+            key_columns := [key for key in self._key_columns if key in column_names]
+        ):  # pragma: no cover; should never get here
+            # No key columns found in the file
+            raise ValueError(f"None of {self._key_columns} in {column_names}")
+
+        sampled_file_count = len(sample_paths)
+        num_row_groups_per_file = self.metadata.num_row_groups_per_file
+        if (
+            self.row_count.value is None
+            or len(num_row_groups_per_file) != sampled_file_count
+        ):
+            raise ValueError("Parquet metadata sampling failed.")  # pragma: no cover
+
+        n = 0
+        samples: defaultdict[str, list[int]] = defaultdict(list)
+        for path, num_rgs in zip(sample_paths, num_row_groups_per_file, strict=True):
+            for rg_id in range(num_rgs):
+                n += 1
+                samples[path].append(rg_id)
+                if n == self.max_row_group_samples:
+                    break
+            if n == self.max_row_group_samples:
+                break
+
+        exact = sampled_file_count == len(
+            self.paths
+        ) and self.max_row_group_samples >= sum(num_row_groups_per_file)
+
+        options = plc.io.parquet.ParquetReaderOptions.builder(
+            plc.io.SourceInfo(list(samples))
+        ).build()
+        options.set_columns(key_columns)
+        options.set_row_groups(list(samples.values()))
+        tbl_w_meta = plc.io.parquet.read_parquet(options)
+        row_group_num_rows = tbl_w_meta.tbl.num_rows()
+        for name, column in zip(
+            tbl_w_meta.column_names(), tbl_w_meta.columns, strict=True
+        ):
+            row_group_unique_count = plc.stream_compaction.distinct_count(
+                column,
+                plc.types.NullPolicy.INCLUDE,
+                plc.types.NanPolicy.NAN_IS_NULL,
+            )
+            fraction = row_group_unique_count / row_group_num_rows
+            # Assume that if every row is unique then this is a
+            # primary key; otherwise it's a foreign key and we
+            # can't use the single row-group count estimate.
+            # For example, consider a "foreign" key that has 100
+            # unique values. If we sample from a single row group,
+            # we likely obtain a unique count of 100. But we can't
+            # necessarily deduce that the unique count is
+            # 100 / num_rows_in_group * num_rows_in_file.
+            count: int | None = None
+            if exact:
+                count = row_group_unique_count
+            elif row_group_unique_count == row_group_num_rows:
+                count = self.row_count.value
+            self._unique_stats[name] = UniqueStats(
+                ColumnStat[int](value=count, exact=exact),
+                ColumnStat[float](value=fraction, exact=exact),
+            )
+
+    def _update_unique_stats(self, column: str) -> None:
+        if column not in self._unique_stats and column in self.metadata.column_names:
+            self.add_unique_stats_column(column)
+            self._sample_row_groups()
+            self._key_columns = set()
+
+    def unique_stats(self, column: str) -> UniqueStats:
+        """Return unique-value statistics for a column."""
+        self._update_unique_stats(column)
+        return self._unique_stats.get(column, UniqueStats())
+
+    def storage_size(self, column: str) -> ColumnStat[int]:
+        """Return the average column size for a single file."""
+        return self.metadata.mean_size_per_file.get(column, ColumnStat[int]())
+
+    def add_unique_stats_column(self, column: str) -> None:
+        """Add a column needing unique-value information."""
+        self._unique_stats_columns.add(column)
+        if column not in self._key_columns and column not in self._unique_stats:
+            self._key_columns.add(column)
+
+
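The unique-count decision made in _sample_row_groups can be summarized with a small standalone sketch (the numbers are hypothetical): an exhaustive sample keeps the observed distinct count, a fully unique sample is treated as a primary key and extrapolated to the full row count, and anything else leaves the count unknown while still recording the observed unique fraction.

def estimate_unique(rg_unique_count, rg_num_rows, total_row_count, exact):
    fraction = rg_unique_count / rg_num_rows
    if exact:                             # every row group was sampled
        count = rg_unique_count
    elif rg_unique_count == rg_num_rows:  # looks like a primary key: extrapolate
        count = total_row_count
    else:                                 # foreign-key-like: no reliable count
        count = None
    return count, fraction

assert estimate_unique(1000, 1000, 50_000, exact=False) == (50_000, 1.0)
assert estimate_unique(100, 1000, 50_000, exact=False) == (None, 0.1)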
+@functools.cache
+def _sample_pq_stats(
+    paths: tuple[str, ...],
+    max_footer_samples: int,
+    max_row_group_samples: int,
+    stats_planning: StatsPlanningOptions,
+) -> ParquetSourceInfo:
+    """Return Parquet datasource information."""
+    return ParquetSourceInfo(
+        paths,
+        max_footer_samples,
+        max_row_group_samples,
+        stats_planning,
+    )
+
+
+def _extract_scan_stats(
+    ir: Scan,
+    config_options: ConfigOptions,
+) -> dict[str, ColumnStats]:
+    """Extract base ColumnStats for a Scan node."""
+    if ir.typ == "parquet":
+        assert config_options.executor.name == "streaming", (
+            "Only streaming executor is supported in _extract_scan_stats"
+        )
+        table_source_info = _sample_pq_stats(
+            tuple(ir.paths),
+            config_options.parquet_options.max_footer_samples,
+            config_options.parquet_options.max_row_group_samples,
+            config_options.executor.stats_planning,
+        )
+        return {
+            name: ColumnStats(
+                name=name,
+                source_info=ColumnSourceInfo(DataSourcePair(table_source_info, name)),
+            )
+            for name in ir.schema
+        }
+
+    else:
+        return {name: ColumnStats(name=name) for name in ir.schema}
+
+
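Because _sample_pq_stats is wrapped in functools.cache, every Scan over the same argument tuple shares a single ParquetSourceInfo, so footer and row-group samples are taken once per dataset (this also requires the arguments, including StatsPlanningOptions, to be hashable). A minimal sketch of that sharing behaviour with a stand-in object:

import functools

@functools.cache
def source_info(paths: tuple[str, ...], max_footer_samples: int) -> object:
    return object()  # stand-in for constructing a ParquetSourceInfo

a = source_info(("f0.parquet", "f1.parquet"), 3)
b = source_info(("f0.parquet", "f1.parquet"), 3)
assert a is b  # cached: one shared source-info per unique argument tuple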
+class DataFrameSourceInfo(DataSourceInfo):
+    """
+    In-memory DataFrame source information.
+
+    Parameters
+    ----------
+    df
+        In-memory DataFrame source.
+    stats_planning
+        Statistics planning options.
+    """
+
+    def __init__(
+        self,
+        df: Any,
+        stats_planning: StatsPlanningOptions,
+    ):
+        self._df = df
+        self._stats_planning = stats_planning
+        self._key_columns: set[str] = set()
+        self._unique_stats_columns = set()
+        self._unique_stats: dict[str, UniqueStats] = {}
+
+    @functools.cached_property
+    def row_count(self) -> ColumnStat[int]:
+        """Data source row-count estimate."""
+        return ColumnStat[int](value=self._df.height(), exact=True)
+
+    def _update_unique_stats(self, column: str) -> None:
+        if column not in self._unique_stats and self._stats_planning.use_sampling:
+            row_count = self.row_count.value
+            try:
+                unique_count = (
+                    self._df.get_column(column).approx_n_unique() if row_count else 0
+                )
+            except pl.exceptions.InvalidOperationError:  # pragma: no cover
+                unique_count = self._df.get_column(column).n_unique()
+            unique_fraction = min((unique_count / row_count), 1.0) if row_count else 1.0
+            self._unique_stats[column] = UniqueStats(
+                ColumnStat[int](value=unique_count),
+                ColumnStat[float](value=unique_fraction),
+            )
+
+    def unique_stats(self, column: str) -> UniqueStats:
+        """Return unique-value statistics for a column."""
+        self._update_unique_stats(column)
+        return self._unique_stats.get(column, UniqueStats())
+
+
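A sketch of the unique-fraction estimate made by DataFrameSourceInfo, written against the public polars API (the class above holds polars' internal frame, whose height() is a method rather than a property):

import polars as pl

df = pl.DataFrame({"key": [1, 2, 2, 3], "val": ["a", "b", "c", "d"]})

row_count = df.height
unique_count = df.get_column("key").approx_n_unique() if row_count else 0
unique_fraction = min(unique_count / row_count, 1.0) if row_count else 1.0
# For this tiny frame the estimate should match the true values: 3 distinct
# keys out of 4 rows, i.e. a unique fraction of 0.75.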
+def _extract_dataframescan_stats(
+    ir: DataFrameScan, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+    """Extract base ColumnStats for a DataFrameScan node."""
+    assert config_options.executor.name == "streaming", (
+        "Only streaming executor is supported in _extract_dataframescan_stats"
+    )
+    table_source_info = DataFrameSourceInfo(
+        ir.df,
+        config_options.executor.stats_planning,
+    )
+    return {
+        name: ColumnStats(
+            name=name,
+            source_info=ColumnSourceInfo(DataSourcePair(table_source_info, name)),
+        )
+        for name in ir.schema
+    }
+
+
+def _clear_source_info_cache() -> None:
+    """Clear DataSourceInfo caches."""
+    # TODO: Avoid clearing the cache if we can
+    # check that the underlying data hasn't changed.
+
+    # Clear ParquetSourceInfo cache
+    _sample_pq_stats.cache_clear()