chalkpy 2.93.5__py3-none-any.whl → 2.93.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. chalk/_gen/chalk/arrow/v1/arrow_pb2.py +7 -5
  2. chalk/_gen/chalk/arrow/v1/arrow_pb2.pyi +6 -0
  3. chalk/_gen/chalk/common/v1/offline_query_pb2.py +17 -15
  4. chalk/_gen/chalk/common/v1/offline_query_pb2.pyi +25 -0
  5. chalk/_gen/chalk/common/v1/script_task_pb2.py +3 -3
  6. chalk/_gen/chalk/common/v1/script_task_pb2.pyi +2 -0
  7. chalk/_gen/chalk/dataframe/__init__.py +0 -0
  8. chalk/_gen/chalk/dataframe/v1/__init__.py +0 -0
  9. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.py +48 -0
  10. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.pyi +119 -0
  11. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.py +4 -0
  12. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.pyi +4 -0
  13. chalk/_gen/chalk/graph/v1/graph_pb2.py +150 -149
  14. chalk/_gen/chalk/graph/v1/graph_pb2.pyi +5 -0
  15. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.py +79 -0
  16. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.pyi +377 -0
  17. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.py +4 -0
  18. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.pyi +4 -0
  19. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.py +43 -7
  20. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.pyi +252 -2
  21. chalk/_gen/chalk/server/v1/benchmark_pb2.py +45 -0
  22. chalk/_gen/chalk/server/v1/benchmark_pb2.pyi +74 -0
  23. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.py +78 -0
  24. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.pyi +36 -0
  25. chalk/_gen/chalk/server/v1/builder_pb2.py +218 -210
  26. chalk/_gen/chalk/server/v1/builder_pb2.pyi +59 -1
  27. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.py +70 -0
  28. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.pyi +219 -0
  29. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.py +170 -0
  30. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.pyi +62 -0
  31. chalk/_gen/chalk/server/v1/files_pb2.py +65 -0
  32. chalk/_gen/chalk/server/v1/files_pb2.pyi +167 -0
  33. chalk/_gen/chalk/server/v1/files_pb2_grpc.py +4 -0
  34. chalk/_gen/chalk/server/v1/files_pb2_grpc.pyi +4 -0
  35. chalk/_gen/chalk/server/v1/kube_pb2.py +29 -19
  36. chalk/_gen/chalk/server/v1/kube_pb2.pyi +28 -0
  37. chalk/_gen/chalk/server/v1/kube_pb2_grpc.py +45 -0
  38. chalk/_gen/chalk/server/v1/kube_pb2_grpc.pyi +12 -0
  39. chalk/_gen/chalk/server/v1/queries_pb2.py +66 -66
  40. chalk/_gen/chalk/server/v1/queries_pb2.pyi +20 -0
  41. chalk/_gen/chalk/server/v1/sql_interface_pb2.py +75 -0
  42. chalk/_gen/chalk/server/v1/sql_interface_pb2.pyi +142 -0
  43. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.py +349 -0
  44. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.pyi +114 -0
  45. chalk/_gen/chalk/server/v1/trace_pb2.py +44 -40
  46. chalk/_gen/chalk/server/v1/trace_pb2.pyi +12 -0
  47. chalk/_gen/chalk/streaming/v1/debug_service_pb2.py +62 -0
  48. chalk/_gen/chalk/streaming/v1/debug_service_pb2.pyi +75 -0
  49. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.py +221 -0
  50. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.pyi +88 -0
  51. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.py +8 -8
  52. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.pyi +4 -1
  53. chalk/_version.py +1 -1
  54. chalk/df/LazyFramePlaceholder.py +1129 -0
  55. chalk/features/_encoding/converter.py +23 -0
  56. chalk/features/resolver.py +9 -2
  57. chalk/operators/_utils.py +10 -3
  58. chalk/parsed/to_proto.py +7 -1
  59. {chalkpy-2.93.5.dist-info → chalkpy-2.93.7.dist-info}/METADATA +1 -1
  60. {chalkpy-2.93.5.dist-info → chalkpy-2.93.7.dist-info}/RECORD +63 -32
  61. {chalkpy-2.93.5.dist-info → chalkpy-2.93.7.dist-info}/WHEEL +0 -0
  62. {chalkpy-2.93.5.dist-info → chalkpy-2.93.7.dist-info}/entry_points.txt +0 -0
  63. {chalkpy-2.93.5.dist-info → chalkpy-2.93.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1129 @@
"""Lightweight DataFrame wrapper around Chalk's execution engine.

The :class:`LazyFramePlaceholder` class constructs query plans backed by
``libchalk`` and can materialize them into Arrow tables. It offers a minimal
API similar to other DataFrame libraries while delegating the heavy lifting
to the underlying engine.
"""

from __future__ import annotations

import typing
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, TypeAlias

import pyarrow

import chalk._gen.chalk.dataframe.v1.dataframe_pb2 as dataframe_pb2
from chalk.features._encoding.converter import PrimitiveFeatureConverter
from chalk.features.underscore import (
    Underscore,
    UnderscoreAttr,
    UnderscoreCall,
    UnderscoreRoot,
    convert_value_to_proto_expr,
)

if TYPE_CHECKING:
    from chalk.features import Underscore


MaterializedTable: TypeAlias = pyarrow.RecordBatch | pyarrow.Table


@dataclass
class _LazyFrameConstructor:
    """
    A lazily-called function which will be used to construct a Chalk DataFrame.
    """

    self_dataframe: "Optional[LazyFramePlaceholder]"
    """If present, this is the value of 'self' to call the function on."""

    function_name: str
    """The name of the function to construct the DataFrame."""

    args: tuple[Any, ...]
    """The args to pass to the DataFrame function."""

    kwargs: dict[str, Any]
    """The kwargs to pass to the DataFrame function."""


class LazyFramePlaceholder:
    """
    A lazy representation of a DataFrame operation.

    Examples
    --------
    >>> import pyarrow as pa
    >>> from chalk.df import LazyFramePlaceholder
    >>> from chalk.features import _
    >>> # Create a placeholder for a named input table
    >>> df = LazyFramePlaceholder.named_table('input', pa.schema({"id": pa.int64(), "name": pa.string()}))
    >>> # Apply operations
    >>> filtered = df.filter(_.x > 1)
    """

    @staticmethod
    def _construct(
        *,
        self_dataframe: "Optional[LazyFramePlaceholder]",
        function_name: str,
        args: tuple[Any, ...] = (),
        **kwargs: Any,
    ):
        return LazyFramePlaceholder(
            _internal_constructor=_LazyFrameConstructor(
                self_dataframe=self_dataframe,
                function_name=function_name,
                args=tuple(args),
                kwargs=kwargs,
            )
        )

    def __init__(
        self,
        *,
        _internal_constructor: _LazyFrameConstructor,
    ):
        """
        An internal constructor that creates a `LazyFramePlaceholder` from its underlying operation.
        """

        super().__init__()
        self._lazy_frame_constructor = _internal_constructor

    def __repr__(self) -> str:
        return "LazyFramePlaceholder(...)"

    __str__ = __repr__

    def _to_proto(self) -> dataframe_pb2.DataFramePlan:
        """
        Convert this `LazyFramePlaceholder` into a proto plan message.
        """
        return _convert_to_dataframe_proto(self)

    @staticmethod
    def _from_proto(proto: dataframe_pb2.DataFramePlan) -> "LazyFramePlaceholder":
        """
        Parse a `LazyFramePlaceholder` from the specified proto plan.
        """
        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)

    @classmethod
    def named_table(cls, name: str, schema: pyarrow.Schema) -> LazyFramePlaceholder:
        """Create a ``DataFrame`` for a named table.

        Parameters
        ----------
        name
            Table identifier.
        schema
            Arrow schema describing the table.

        Returns
        -------
        DataFrame referencing the named table.
        """

        if not isinstance(name, str):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError(
                f"LazyFramePlaceholder.named_table expected `name` to have type 'str' but it was passed as a '{type(name)}'"
            )
        if not isinstance(schema, pyarrow.Schema):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError(
                f"LazyFramePlaceholder.named_table expected `schema` to have type 'pyarrow.Schema' but it was passed as a '{type(schema)}'"
            )

        return LazyFramePlaceholder._construct(
            function_name="named_table",
            self_dataframe=None,
            name=name,
            schema=schema,
        )

    @classmethod
    def from_arrow(cls, data: MaterializedTable):
        """Construct a DataFrame from an in-memory Arrow object.

        Parameters
        ----------
        data
            PyArrow Table or RecordBatch to convert into a DataFrame.

        Returns
        -------
        DataFrame backed by the provided Arrow data.

        Examples
        --------
        >>> import pyarrow as pa
        >>> from chalkdf import DataFrame
        >>> table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        >>> df = DataFrame.from_arrow(table)
        """

        assert isinstance(data, (pyarrow.Table, pyarrow.RecordBatch))

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="from_arrow",
            data=data,
        )

    @classmethod
    def from_dict(cls, data: dict):
        """Construct a DataFrame from a Python dictionary.

        Parameters
        ----------
        data
            Dictionary mapping column names to lists of values.

        Returns
        -------
        DataFrame backed by the provided dictionary data.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        """

        return LazyFramePlaceholder.from_arrow(pyarrow.table(data))

    @classmethod
    def scan(
        cls,
        name: str,
        input_uris: typing.Sequence[str | Path],
        *,
        schema: pyarrow.Schema | None = None,
    ) -> "LazyFramePlaceholder":
        """Scan files and return a DataFrame.

        Currently supports CSV (with headers) and Parquet file formats.

        Parameters
        ----------
        name
            Name to assign to the table being scanned.
        input_uris
            List of file paths or URIs to scan. Supports local paths and file:// URIs.
        schema
            Schema of the data. Required for CSV files, optional for Parquet.

        Returns
        -------
        DataFrame that reads data from the specified files.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> # Scan Parquet files
        >>> df = DataFrame.scan("sales_data", ["data/sales_2024.parquet"])
        >>> # Scan CSV with explicit schema
        >>> import pyarrow as pa
        >>> schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
        >>> df = DataFrame.scan("users", ["data/users.csv"], schema=schema)
        """
        # Accept filesystem paths or URIs; construct file:// URIs manually for
        # local paths to avoid percent-encoding partition tokens like '='.

        if isinstance(input_uris, str):
            raise ValueError(
                "The LazyFramePlaceholder.scan() function must be called with a list of input_uris, not a single str URI"
            )

        normalized_input_uris: list[str] = []
        for p in input_uris:
            s = p if isinstance(p, str) else str(p)
            if "://" in s:
                normalized_input_uris.append(s)
            else:
                abs_path = str(Path(s).resolve())
                if not abs_path.startswith("/"):
                    normalized_input_uris.append(Path(s).resolve().as_uri())
                else:
                    normalized_input_uris.append("file://" + abs_path)

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="scan",
            name=name,
            input_uris=normalized_input_uris,
            schema=schema,
        )
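
    # A minimal sketch of the normalization above (illustrative; the local
    # path is hypothetical): a partitioned path keeps its '=' token rather
    # than being percent-encoded by `Path.as_uri()`.
    #
    #     LazyFramePlaceholder.scan("events", ["/data/dt=2024-01-01/part-0.parquet"])
    #     # scans "file:///data/dt=2024-01-01/part-0.parquet"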

    @classmethod
    def scan_glue_iceberg(
        cls,
        glue_table_name: str,
        schema: typing.Mapping[str, pyarrow.DataType],
        *,
        batch_row_count: int = 1_000,
        aws_catalog_account_id: typing.Optional[str] = None,
        aws_catalog_region: typing.Optional[str] = None,
        aws_role_arn: typing.Optional[str] = None,
        parquet_scan_range_column: typing.Optional[str] = None,
        custom_partitions: typing.Optional[dict[str, tuple[typing.Literal["date_trunc(day)"], str]]] = None,
        partition_column: typing.Optional[str] = None,
    ) -> "LazyFramePlaceholder":
        """Load data from an AWS Glue Iceberg table.

        Parameters
        ----------
        glue_table_name
            Fully qualified ``database.table`` name.
        schema
            Mapping of column names to Arrow types.
        batch_row_count
            Number of rows per batch.
        aws_catalog_account_id
            AWS account hosting the Glue catalog.
        aws_catalog_region
            Region of the Glue catalog.
        aws_role_arn
            IAM role to assume for access.
        parquet_scan_range_column
            Column used for range-based reads.
        custom_partitions
            Additional partition definitions.
        partition_column
            Column name representing partitions.

        Returns
        -------
        DataFrame backed by the Glue table.
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="scan_glue_iceberg",
            glue_table_name=glue_table_name,
            schema=schema,
            batch_row_count=batch_row_count,
            aws_catalog_account_id=aws_catalog_account_id,
            aws_catalog_region=aws_catalog_region,
            aws_role_arn=aws_role_arn,
            # Filter pushdown is not exposed in the signature; always None here.
            filter_predicate=None,
            parquet_scan_range_column=parquet_scan_range_column,
            custom_partitions=custom_partitions,
            partition_column=partition_column,
        )
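
    # Usage sketch (illustrative; the table name, region, and role ARN below
    # are hypothetical):
    #
    #     df = LazyFramePlaceholder.scan_glue_iceberg(
    #         "analytics.events",
    #         {"id": pyarrow.int64(), "ts": pyarrow.timestamp("us")},
    #         aws_catalog_region="us-east-1",
    #         aws_role_arn="arn:aws:iam::123456789012:role/example-reader",
    #     )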

    @classmethod
    def from_sql(
        cls,
        query: str,
    ) -> LazyFramePlaceholder:
        """Create a ``DataFrame`` from the result of executing a SQL query (DuckDB dialect).

        Parameters
        ----------
        query
            SQL query string (DuckDB dialect).

        Returns
        -------
        DataFrame containing the query results.
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="from_sql",
            query=query,
        )
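
    # Usage sketch (illustrative): any table referenced in the query must be
    # known to the engine at execution time.
    #
    #     df = LazyFramePlaceholder.from_sql("SELECT id, COUNT(*) AS n FROM events GROUP BY id")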

    def with_columns(
        self,
        *columns: typing.Mapping[str, Underscore] | Underscore | tuple[str, Underscore],
    ) -> LazyFramePlaceholder:
        """Add or replace columns.

        Accepts multiple forms:
        - A mapping of column names to expressions
        - Positional tuples of (name, expression)
        - Bare positional expressions that must include ``.alias(<name>)``

        Parameters
        ----------
        *columns
            Column definitions as mappings, tuples, or aliased expressions.

        Returns
        -------
        DataFrame with the specified columns added or replaced.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> # Add a new column using a dict with _ syntax
        >>> df2 = df.with_columns({"z": _.x + _.y})
        >>> # Add a new column using alias
        >>> df3 = df.with_columns((_.x + _.y).alias("z"))
        """
        entries: list[tuple[str, Underscore]] = []
        if len(columns) == 0:
            raise ValueError("with_columns requires at least one column expression")

        for col in columns:
            if isinstance(col, (list, tuple)):
                if len(col) != 2:
                    raise ValueError(
                        f"LazyFramePlaceholder.with_columns(...) cannot be called with a tuple having {len(col)} members - expects (name, expression) pairs only."
                    )
                entries.append(col)
            elif isinstance(col, Underscore):
                attempted_alias = _extract_alias_from_underscore(col)
                if attempted_alias:
                    entries.append(attempted_alias)
                else:
                    raise ValueError(
                        f"Positional with_columns expressions must use `.alias(...)` to set the column name, got expression '{col}' without any alias specified"
                    )
            elif isinstance(col, typing.Mapping):  # pyright: ignore[reportUnnecessaryIsInstance]
                entries.extend((k, v) for k, v in col.items())  # pyright: ignore
            else:
                raise ValueError(
                    f"LazyFramePlaceholder.with_columns cannot be called with column argument `{repr(col)}`"
                )

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="with_columns",
            args=tuple(entries),
        )
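
    # The positional tuple form accepted above, for completeness (illustrative):
    #
    #     df.with_columns(("z", _.x + _.y))                   # (name, expression) pair
    #     df.with_columns({"z": _.x + _.y}, ("w", _.x * 2))   # forms can be mixed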

    def with_unique_id(self, name: str) -> LazyFramePlaceholder:
        """Add a monotonically increasing unique identifier column.

        Parameters
        ----------
        name
            Name of the new ID column.

        Returns
        -------
        DataFrame with a new column containing unique, incrementing IDs.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [10, 20, 30]})
        >>> df_with_id = df.with_unique_id("row_id")
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="with_unique_id",
            name=name,
        )

    def filter(self, expr: Underscore) -> LazyFramePlaceholder:
        """Filter rows based on a boolean expression.

        Parameters
        ----------
        expr
            Boolean expression to filter rows. Only rows where the expression
            evaluates to True are kept.

        Returns
        -------
        DataFrame containing only the rows that match the filter condition.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
        >>> filtered = df.filter(_.x > 2)
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="filter",
            expr=expr,
        )

    def slice(self, start: int, length: int | None = None) -> LazyFramePlaceholder:
        """Return a subset of rows starting at a specific position.

        Parameters
        ----------
        start
            Zero-based index where the slice begins.
        length
            Number of rows to include. If None, includes all remaining rows.

        Returns
        -------
        DataFrame containing the sliced rows.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4, 5]})
        >>> # Get rows 1-3 (indices 1, 2, 3)
        >>> sliced = df.slice(1, 3)
        """

        # Can't actually express "no limit" with velox limit/offset, but this'll do.
        return self._construct(
            self_dataframe=self,
            function_name="slice",
            start=start,
            length=length,
        )

    def col(self, column: str) -> Underscore:
        """Get a column expression from the DataFrame.

        Alias for the column() method.

        Parameters
        ----------
        column
            Name of the column to retrieve.

        Returns
        -------
        Column expression (as Underscore) that can be used in operations.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> # Use col to reference columns in expressions
        >>> df_filtered = df.filter(_.x > 1)
        """
        return self.column(column)

    def column(self, column: str) -> Underscore:
        """Get a column expression from the DataFrame.

        Parameters
        ----------
        column
            Name of the column to retrieve.

        Returns
        -------
        Column expression (as Underscore) that can be used in operations.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> df_sum = df.with_columns({"sum": _.x + _.y})
        """

        # The LazyFramePlaceholder does not currently track schema, so it cannot detect
        # errors about missing columns.
        return UnderscoreAttr(UnderscoreRoot(), column)

    def project(self, columns: typing.Mapping[str, Underscore]) -> "LazyFramePlaceholder":
        """Project to a new set of columns using expressions.

        Parameters
        ----------
        columns
            Mapping of output column names to expressions that define them.

        Returns
        -------
        DataFrame with only the specified columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> projected = df.project({"sum": _.x + _.y, "x": _.x})
        """

        return self._construct(
            self_dataframe=self,
            function_name="project",
            columns=columns,
        )

    def select(self, *columns: str, strict: bool = True) -> "LazyFramePlaceholder":
        """Select existing columns by name.

        Parameters
        ----------
        *columns
            Names of columns to select.
        strict
            If True, raise an error if any column doesn't exist. If False,
            silently ignore missing columns.

        Returns
        -------
        DataFrame with only the selected columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
        >>> selected = df.select("x", "y")
        """

        return self._construct(
            self_dataframe=self,
            function_name="select",
            args=columns,
            strict=strict,
        )

    def drop(self, *columns: str, strict: bool = True) -> LazyFramePlaceholder:
        """Drop specified columns from the DataFrame.

        Parameters
        ----------
        *columns
            Names of columns to drop.
        strict
            If True, raise an error if any column doesn't exist. If False,
            silently ignore missing columns.

        Returns
        -------
        DataFrame without the dropped columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
        >>> df_dropped = df.drop("z")
        """

        return self._construct(
            self_dataframe=self,
            function_name="drop",
            args=columns,
            strict=strict,
        )

    def explode(self, column: str) -> "LazyFramePlaceholder":
        """Explode a list or array column into multiple rows.

        Each element in the list becomes a separate row, with other column
        values duplicated.

        Parameters
        ----------
        column
            Name of the list/array column to explode.

        Returns
        -------
        DataFrame with the list column expanded into multiple rows.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"id": [1, 2], "items": [[10, 20], [30]]})
        >>> exploded = df.explode("items")
        """
        return self._construct(
            self_dataframe=self,
            function_name="explode",
            column=column,
        )

    def join(
        self,
        other: "LazyFramePlaceholder",
        on: dict[str, str] | typing.Sequence[str],
        how: str = "inner",
        right_suffix: str | None = None,
    ) -> "LazyFramePlaceholder":
        """Join this ``DataFrame`` with another.

        Parameters
        ----------
        other
            Right-hand ``DataFrame``.
        on
            Column names or mapping of left->right join keys.
        how
            Join type (e.g. ``"inner"`` or ``"left"``).
        right_suffix
            Optional suffix applied to right-hand columns when names collide.

        Returns
        -------
        Resulting ``DataFrame`` after the join.
        """

        return self._construct(
            self_dataframe=self,
            function_name="join",
            other=other,
            on=on,
            how=how,
            right_suffix=right_suffix,
        )
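
    # Usage sketch (illustrative; the frames and column names are
    # hypothetical): a mapping joins differently-named keys, while a
    # sequence joins same-named keys.
    #
    #     users.join(orders, on={"id": "user_id"}, how="left", right_suffix="_order")
    #     users.join(orders, on=["id"], how="inner")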

    def join_asof(
        self,
        other: LazyFramePlaceholder,
        on: str,
        *,
        right_on: str | None = None,
        by: list[str] | None = None,
        right_by: list[str] | None = None,
        strategy: typing.Literal["forward", "backward"] = "backward",
        right_suffix: str | None = None,
        coalesce: bool = True,
    ) -> LazyFramePlaceholder:
        """Perform an as-of join with another DataFrame.

        An as-of join is similar to a left join, but instead of matching on equality,
        it matches on the nearest key from the right DataFrame. This is commonly used
        for time-series data where you want to join with the most recent observation.

        **Important**: Both DataFrames must be sorted by the ``on`` column before calling
        this method. Use ``.order_by(on)`` to sort if needed.

        Parameters
        ----------
        other
            Right-hand DataFrame to join with.
        on
            Column name in the left DataFrame to join on (must be sorted).
        right_on
            Column name in the right DataFrame to join on. If None, uses ``on``.
        by
            Additional exact-match columns for the left DataFrame (optional).
        right_by
            Additional exact-match columns for the right DataFrame. If None, uses ``by``.
        strategy
            Join strategy: "backward" (default) matches the most recent past value,
            "forward" matches the nearest future value.
        right_suffix
            Suffix to add to overlapping column names from the right DataFrame.
        coalesce
            Whether to coalesce the join keys (default True).

        Returns
        -------
        Resulting DataFrame after the as-of join.
        """

        return self._construct(
            self_dataframe=self,
            function_name="join_asof",
            other=other,
            on=on,
            right_on=right_on,
            by=by,
            right_by=right_by,
            strategy=strategy,
            right_suffix=right_suffix,
            coalesce=coalesce,
        )
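
    # Usage sketch (illustrative; both frames are assumed already sorted by
    # their "ts" columns, as required above):
    #
    #     trades.join_asof(quotes, on="ts", by=["symbol"], strategy="backward")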

    # Window is not yet supported in LazyFramePlaceholder:
    # def window(
    #     self,
    #     by: typing.Sequence[str],
    #     order_by: typing.Sequence[str | tuple[str, str]],
    #     *expressions: WindowExpr,
    # ) -> LazyFramePlaceholder:
    #     ...

    def agg(self, by: typing.Sequence[str], *aggregations: Underscore) -> "LazyFramePlaceholder":
        """Group by columns and apply aggregation expressions.

        Parameters
        ----------
        by
            Column names to group by.
        *aggregations
            Aggregation expressions to apply to each group (e.g., sum, count, mean).

        Returns
        -------
        DataFrame with one row per group containing the aggregated values.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"group": ["A", "A", "B"], "value": [1, 2, 3]})
        >>> agg_df = df.agg(["group"], _.value.sum().alias("total"))
        """

        if isinstance(by, str):
            raise ValueError(f".agg(...) must be called with a list of group-by columns, not a single str {repr(by)}")

        return self._construct(
            self_dataframe=self,
            function_name="agg",
            args=(by, *aggregations),
        )

    def distinct_on(self, *columns: str) -> "LazyFramePlaceholder":
        """Remove duplicate rows based on specified columns.

        For rows with identical values in the specified columns, only one
        row is kept (chosen arbitrarily).

        Parameters
        ----------
        *columns
            Column names to check for duplicates.

        Returns
        -------
        DataFrame with duplicate rows removed.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 1, 2], "y": [10, 20, 30]})
        >>> unique = df.distinct_on("x")
        """

        return self._construct(
            self_dataframe=self,
            function_name="distinct_on",
            args=columns,
        )

    def order_by(self, *columns: str | tuple[str, str]) -> LazyFramePlaceholder:
        """Sort the DataFrame by one or more columns.

        Parameters
        ----------
        *columns
            Column names to sort by. Can be strings (for ascending order) or
            tuples of (column_name, direction) where direction is "asc" or "desc".

        Returns
        -------
        DataFrame sorted by the specified columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [3, 1, 2], "y": [30, 10, 20]})
        >>> # Sort by x ascending
        >>> sorted_df = df.order_by("x")
        >>> # Sort by x descending, then y ascending
        >>> sorted_df = df.order_by(("x", "desc"), "y")
        """

        return self._construct(
            self_dataframe=self,
            function_name="order_by",
            args=columns,
        )

    def write(
        self,
        target_path: str,
        target_file_name: str | None = None,
        *,
        file_format: str = "parquet",
        serde_parameters: typing.Mapping[str, str] | None = None,
        compression: str | None = None,
        ensure_files: bool = False,
        connector_id: str | None = None,
    ) -> "LazyFramePlaceholder":
        """Persist the DataFrame plan using Velox's Hive connector.

        Parameters
        ----------
        target_path
            Directory to write output files.
        target_file_name
            Optional explicit file name.
        file_format
            Output format (default ``parquet``).
        serde_parameters
            Optional SerDe options for text formats.
        compression
            Optional compression codec.
        ensure_files
            Ensure writers emit files even if no rows were produced.
        connector_id
            Optional connector id override.

        Returns
        -------
        DataFrame representing the TableWrite operator.
        """

        return self._construct(
            self_dataframe=self,
            function_name="write",
            target_path=target_path,
            target_file_name=target_file_name,
            file_format=file_format,
            serde_parameters=serde_parameters,
            compression=compression,
            ensure_files=ensure_files,
            connector_id=connector_id,
        )
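
    # Usage sketch (illustrative; the output path and codec are hypothetical):
    #
    #     df.write("/tmp/out", file_format="parquet", compression="zstd")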

    def rename(self, new_names: dict[str, str]) -> LazyFramePlaceholder:
        """Rename columns in the DataFrame.

        Parameters
        ----------
        new_names
            Dictionary mapping old column names to new column names.

        Returns
        -------
        DataFrame with renamed columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> renamed = df.rename({"x": "id", "y": "value"})
        """

        return self._construct(
            self_dataframe=self,
            function_name="rename",
            new_names=new_names,
        )

    @staticmethod
    def from_proto(
        proto: bytes | dataframe_pb2.DataFramePlan,
    ) -> "LazyFramePlaceholder":
        if isinstance(proto, bytes):
            proto_bytes = proto
            proto = dataframe_pb2.DataFramePlan()
            proto.ParseFromString(proto_bytes)
        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
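
    # Round-trip sketch (illustrative): a plan can be serialized with
    # `_to_proto()` and reconstructed from either the message or its bytes.
    #
    #     plan_bytes = df._to_proto().SerializeToString()
    #     df2 = LazyFramePlaceholder.from_proto(plan_bytes)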


def _extract_alias_from_underscore(u: Underscore) -> tuple[str, Underscore] | None:
    """
    Given an underscore expression like `_.something.alias("name")`, splits the expression
    into the alias `"name"` and the underscore expression `_.something`.

    If the expression does not have an alias, returns `None` instead.
    """
    if not isinstance(u, UnderscoreCall):
        return None
    parent = u._chalk__parent  # pyright: ignore[reportPrivateUsage]
    if not isinstance(parent, UnderscoreAttr) or parent._chalk__attr != "alias":  # pyright: ignore[reportPrivateUsage]
        return None
    if len(u._chalk__args) != 1:  # pyright: ignore[reportPrivateUsage]
        raise ValueError("alias() must be called with one argument")
    alias = u._chalk__args[0]  # pyright: ignore[reportPrivateUsage]
    if not isinstance(alias, str):
        raise ValueError("argument to alias() must be a string")
    return (
        alias,
        parent._chalk__parent,  # pyright: ignore[reportPrivateUsage]
    )
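
# A minimal sketch of the split performed above (illustrative):
#
#     _extract_alias_from_underscore((_.x + _.y).alias("z"))  # -> ("z", _.x + _.y)
#     _extract_alias_from_underscore(_.x + _.y)               # -> None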


def _convert_to_dataframe_proto(
    lazy_frame: LazyFramePlaceholder,
) -> dataframe_pb2.DataFramePlan:
    """
    Converts a `LazyFramePlaceholder` into a proto value, allowing it to be round-tripped
    or converted into a Chalk DataFrame for execution.
    """
    df_constructors: list[dataframe_pb2.DataFrameConstructor] = []

    # This map memoizes the constructor index for each `LazyFramePlaceholder` seen.
    lazy_frame_placeholder_cache: dict[LazyFramePlaceholder, dataframe_pb2.DataFrameIndex] = {}

    def _convert_dataframe(df: LazyFramePlaceholder) -> dataframe_pb2.DataFrameIndex:
        """
        Recursively converts a `LazyFramePlaceholder` into a proto message.
        If this `df` instance has been seen before, returns an index into the
        `df_constructors` list pointing to the previous construction.

        This allows plans that re-use operators to be efficiently encoded.
        """
        if df in lazy_frame_placeholder_cache:
            return lazy_frame_placeholder_cache[df]

        df_constructor = df._lazy_frame_constructor  # pyright: ignore[reportPrivateUsage]
        if df_constructor.self_dataframe is None:
            self_proto = None
        else:
            self_proto = _convert_dataframe(df_constructor.self_dataframe)

        proto_args = dataframe_pb2.PyList(
            list_items=[_convert_arg(arg_value) for arg_value in df_constructor.args],
        )
        proto_kwargs = dataframe_pb2.PyDict(
            dict_entries=[
                dataframe_pb2.PyDictEntry(
                    entry_key=_convert_arg(kwarg_name),
                    entry_value=_convert_arg(kwarg_value),
                )
                for kwarg_name, kwarg_value in df_constructor.kwargs.items()
            ],
        )

        new_constructor_index = len(df_constructors)
        df_constructors.append(
            dataframe_pb2.DataFrameConstructor(
                self_operand=self_proto,
                function_name=df_constructor.function_name,
                args=proto_args,
                kwargs=proto_kwargs,
            )
        )
        lazy_frame_placeholder_cache[df] = dataframe_pb2.DataFrameIndex(
            dataframe_op_index=new_constructor_index,
        )
        return lazy_frame_placeholder_cache[df]

    def _convert_arg(value: Any) -> dataframe_pb2.DataFrameOperand:
        if value is None:
            return dataframe_pb2.DataFrameOperand(
                value_none=dataframe_pb2.PyNone(),
            )
        # Check bool before int: bool is a subclass of int, so testing int
        # first would encode True/False as integers.
        if isinstance(value, bool):
            return dataframe_pb2.DataFrameOperand(
                value_bool=value,
            )
        if isinstance(value, int):
            return dataframe_pb2.DataFrameOperand(
                value_int=value,
            )
        if isinstance(value, str):
            return dataframe_pb2.DataFrameOperand(
                value_string=value,
            )
        if isinstance(value, (list, tuple)):
            return dataframe_pb2.DataFrameOperand(
                value_list=dataframe_pb2.PyList(
                    list_items=[_convert_arg(item) for item in value],
                )
            )
        if isinstance(value, typing.Mapping):
            return dataframe_pb2.DataFrameOperand(
                value_dict=dataframe_pb2.PyDict(
                    dict_entries=[
                        dataframe_pb2.PyDictEntry(
                            entry_key=_convert_arg(dict_key),
                            entry_value=_convert_arg(dict_value),
                        )
                        for dict_key, dict_value in value.items()
                    ]
                )
            )
        if isinstance(value, LazyFramePlaceholder):
            # Use the dataframe-specific helper function for this logic.
            return dataframe_pb2.DataFrameOperand(
                value_dataframe_index=_convert_dataframe(value),
            )
        if isinstance(value, Underscore):
            return dataframe_pb2.DataFrameOperand(
                underscore_expr=convert_value_to_proto_expr(value),
            )
        if isinstance(value, pyarrow.Schema):
            return dataframe_pb2.DataFrameOperand(
                arrow_schema=PrimitiveFeatureConverter.convert_pa_schema_to_proto_schema(value),
            )
        if isinstance(value, (pyarrow.Table, pyarrow.RecordBatch)):
            return dataframe_pb2.DataFrameOperand(
                arrow_table=PrimitiveFeatureConverter.convert_arrow_table_to_proto(value),
            )

        raise ValueError(f"LazyFramePlaceholder function operand is of unsupported type {type(value)}")

    # Converting the root frame populates `df_constructors` recursively; the
    # final entry corresponds to the root of the plan.
    _convert_arg(lazy_frame)

    return dataframe_pb2.DataFramePlan(
        constructors=df_constructors,
    )
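
# A minimal sketch of the memoization above (illustrative): a frame that is
# referenced twice is encoded as a single constructor entry, and both uses
# point at the same DataFrameIndex.
#
#     base = LazyFramePlaceholder.from_dict({"id": [1, 2]})
#     plan = base.join(base, on=["id"])._to_proto()
#     # len(plan.constructors) == 2  (base once, join once)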


def _convert_from_dataframe_proto(
    proto_plan: dataframe_pb2.DataFramePlan,
    dataframe_class: type,
) -> LazyFramePlaceholder:
    """
    Converts a proto plan back into a lazy frame.
    """
    df_values: list[LazyFramePlaceholder] = []

    def _convert_dataframe_index(df: dataframe_pb2.DataFrameIndex) -> LazyFramePlaceholder:
        if df.dataframe_op_index < 0 or df.dataframe_op_index >= len(df_values):
            raise ValueError(
                f"DataFrame proto message value is invalid - a DataFrame constructor references operator index {df.dataframe_op_index} but only {len(df_values)} intermediate dataframe value(s) have been defined so far."
            )
        return df_values[df.dataframe_op_index]

    def _convert_dataframe(df: dataframe_pb2.DataFrameConstructor) -> LazyFramePlaceholder:
        if df.HasField("self_operand"):
            self_operand = _convert_dataframe_index(df.self_operand)
        else:
            self_operand = None

        # TODO: validate that function_name is legal.
        if self_operand is None:
            method = getattr(dataframe_class, df.function_name)
        else:
            method = getattr(self_operand, df.function_name)

        args = [_convert_arg(arg) for arg in df.args.list_items]
        kwargs = {_convert_arg(entry.entry_key): _convert_arg(entry.entry_value) for entry in df.kwargs.dict_entries}

        return method(*args, **kwargs)

    def _convert_arg(value: dataframe_pb2.DataFrameOperand) -> Any:
        if value.HasField("value_string"):
            return value.value_string
        if value.HasField("value_int"):
            return value.value_int
        if value.HasField("value_bool"):
            return value.value_bool
        if value.HasField("value_none"):
            return None
        if value.HasField("value_list"):
            return [_convert_arg(item) for item in value.value_list.list_items]
        if value.HasField("value_dict"):
            return {
                _convert_arg(entry.entry_key): _convert_arg(entry.entry_value)
                for entry in value.value_dict.dict_entries
            }
        if value.HasField("value_dataframe_index"):
            return _convert_dataframe_index(value.value_dataframe_index)
        if value.HasField("arrow_schema"):
            return PrimitiveFeatureConverter.convert_proto_schema_to_pa_schema(value.arrow_schema)
        if value.HasField("arrow_table"):
            return PrimitiveFeatureConverter.convert_arrow_table_from_proto(value.arrow_table)
        if value.HasField("underscore_expr"):
            return Underscore._from_proto(value.underscore_expr)  # pyright: ignore[reportPrivateUsage]

        raise ValueError(f"DataFrame operand expression {value} does not have any value set")

    for df in proto_plan.constructors:
        df_values.append(_convert_dataframe(df))

    if len(df_values) == 0:
        raise ValueError(
            "Could not parse LazyFramePlaceholder from proto expression; no dataframe constructors were present in the provided proto message"
        )

    return df_values[-1]