chalkpy 2.93.4__py3-none-any.whl → 2.93.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. chalk/_gen/chalk/arrow/v1/arrow_pb2.py +7 -5
  2. chalk/_gen/chalk/arrow/v1/arrow_pb2.pyi +6 -0
  3. chalk/_gen/chalk/common/v1/offline_query_pb2.py +17 -15
  4. chalk/_gen/chalk/common/v1/offline_query_pb2.pyi +25 -0
  5. chalk/_gen/chalk/common/v1/script_task_pb2.py +3 -3
  6. chalk/_gen/chalk/common/v1/script_task_pb2.pyi +2 -0
  7. chalk/_gen/chalk/dataframe/__init__.py +0 -0
  8. chalk/_gen/chalk/dataframe/v1/__init__.py +0 -0
  9. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.py +48 -0
  10. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.pyi +119 -0
  11. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.py +4 -0
  12. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.pyi +4 -0
  13. chalk/_gen/chalk/graph/v1/graph_pb2.py +150 -149
  14. chalk/_gen/chalk/graph/v1/graph_pb2.pyi +5 -0
  15. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.py +79 -0
  16. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.pyi +377 -0
  17. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.py +4 -0
  18. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.pyi +4 -0
  19. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.py +43 -7
  20. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.pyi +252 -2
  21. chalk/_gen/chalk/server/v1/benchmark_pb2.py +45 -0
  22. chalk/_gen/chalk/server/v1/benchmark_pb2.pyi +74 -0
  23. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.py +78 -0
  24. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.pyi +36 -0
  25. chalk/_gen/chalk/server/v1/builder_pb2.py +218 -210
  26. chalk/_gen/chalk/server/v1/builder_pb2.pyi +59 -1
  27. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.py +70 -0
  28. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.pyi +219 -0
  29. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.py +170 -0
  30. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.pyi +62 -0
  31. chalk/_gen/chalk/server/v1/files_pb2.py +65 -0
  32. chalk/_gen/chalk/server/v1/files_pb2.pyi +167 -0
  33. chalk/_gen/chalk/server/v1/files_pb2_grpc.py +4 -0
  34. chalk/_gen/chalk/server/v1/files_pb2_grpc.pyi +4 -0
  35. chalk/_gen/chalk/server/v1/kube_pb2.py +29 -19
  36. chalk/_gen/chalk/server/v1/kube_pb2.pyi +28 -0
  37. chalk/_gen/chalk/server/v1/kube_pb2_grpc.py +45 -0
  38. chalk/_gen/chalk/server/v1/kube_pb2_grpc.pyi +12 -0
  39. chalk/_gen/chalk/server/v1/queries_pb2.py +66 -66
  40. chalk/_gen/chalk/server/v1/queries_pb2.pyi +20 -0
  41. chalk/_gen/chalk/server/v1/sql_interface_pb2.py +75 -0
  42. chalk/_gen/chalk/server/v1/sql_interface_pb2.pyi +142 -0
  43. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.py +349 -0
  44. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.pyi +114 -0
  45. chalk/_gen/chalk/server/v1/trace_pb2.py +44 -40
  46. chalk/_gen/chalk/server/v1/trace_pb2.pyi +12 -0
  47. chalk/_gen/chalk/streaming/v1/debug_service_pb2.py +62 -0
  48. chalk/_gen/chalk/streaming/v1/debug_service_pb2.pyi +75 -0
  49. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.py +221 -0
  50. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.pyi +88 -0
  51. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.py +8 -8
  52. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.pyi +4 -1
  53. chalk/_version.py +1 -1
  54. chalk/df/LazyFramePlaceholder.py +1125 -0
  55. chalk/features/_encoding/converter.py +23 -0
  56. chalk/features/feature_field.py +4 -2
  57. chalk/features/feature_set.py +0 -1
  58. chalk/features/feature_set_decorator.py +1 -34
  59. chalk/features/resolver.py +9 -2
  60. chalk/operators/_utils.py +10 -3
  61. chalk/parsed/to_proto.py +7 -1
  62. {chalkpy-2.93.4.dist-info → chalkpy-2.93.6.dist-info}/METADATA +1 -1
  63. {chalkpy-2.93.4.dist-info → chalkpy-2.93.6.dist-info}/RECORD +66 -35
  64. {chalkpy-2.93.4.dist-info → chalkpy-2.93.6.dist-info}/WHEEL +0 -0
  65. {chalkpy-2.93.4.dist-info → chalkpy-2.93.6.dist-info}/entry_points.txt +0 -0
  66. {chalkpy-2.93.4.dist-info → chalkpy-2.93.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1125 @@
+ """Lightweight DataFrame wrapper around Chalk's execution engine.
+
+ The :class:`DataFrame` class constructs query plans backed by ``libchalk`` and
+ can materialize them into Arrow tables. It offers a minimal API similar to
+ other DataFrame libraries while delegating heavy lifting to the underlying
+ engine.
+ """
+
+ from __future__ import annotations
+
+ import typing
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Optional, TypeAlias
+
+ import pyarrow
+
+ import chalk._gen.chalk.dataframe.v1.dataframe_pb2 as dataframe_pb2
+ from chalk.features._encoding.converter import PrimitiveFeatureConverter
+ from chalk.features.underscore import (
+     Underscore,
+     UnderscoreAttr,
+     UnderscoreCall,
+     UnderscoreRoot,
+     convert_value_to_proto_expr,
+ )
+
+ if TYPE_CHECKING:
+     from chalk.features import Underscore
+
+
+ MaterializedTable: TypeAlias = pyarrow.RecordBatch | pyarrow.Table
+
+
+ @dataclass
+ class _LazyFrameConstructor:
+     """
+     A lazily-called function which will be used to construct a Chalk DataFrame.
+     """
+
+     self_dataframe: "Optional[LazyFramePlaceholder]"
+     """If present, this is the value of 'self' to call the function on."""
+
+     function_name: str
+     """The name of the function to construct the DataFrame."""
+
+     args: tuple[Any, ...]
+     """The args to pass to the DataFrame function."""
+
+     kwargs: dict[str, Any]
+     """The kwargs to pass to the DataFrame function."""
+
+
+ class LazyFramePlaceholder:
+     """
+     A lazy representation of a DataFrame operation.
+
+     Examples
+     --------
+     >>> import pyarrow as pa
+     >>> from chalk.df import LazyFramePlaceholder
+     >>> from chalk.features import _
+     >>> # Create a placeholder for a named table
+     >>> df = LazyFramePlaceholder.named_table('input', pa.schema({"id": pa.int64(), "name": pa.string()}))
+     >>> # Apply operations
+     >>> filtered = df.filter(_.id > 1)
+     """
+
+     @staticmethod
+     def _construct(
+         *,
+         self_dataframe: "Optional[LazyFramePlaceholder]",
+         function_name: str,
+         args: tuple[Any, ...] = (),
+         **kwargs: Any,
+     ):
+         return LazyFramePlaceholder(
+             _internal_constructor=_LazyFrameConstructor(
+                 self_dataframe=self_dataframe,
+                 function_name=function_name,
+                 args=tuple(args),
+                 kwargs=kwargs,
+             )
+         )
+
+     def __init__(
+         self,
+         *,
+         _internal_constructor: _LazyFrameConstructor,
+     ):
+         """
+         An internal constructor that creates a `LazyFramePlaceholder` from its underlying operation.
+         """
+
+         super().__init__()
+         self._lazy_frame_constructor = _internal_constructor
+
+     def __repr__(self) -> str:
+         return "LazyFramePlaceholder(...)"
+
+     __str__ = __repr__
+
+     def _to_proto(self) -> dataframe_pb2.DataFramePlan:
+         """
+         Convert this DataFrame plan to a proto message.
+         """
+         return _convert_to_dataframe_proto(self)
+
+     @staticmethod
+     def _from_proto(proto: dataframe_pb2.DataFramePlan) -> "LazyFramePlaceholder":
+         """
+         Parse a `LazyFramePlaceholder` from the specified proto plan.
+         """
+         return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
+
+     @classmethod
+     def named_table(cls, name: str, schema: pyarrow.Schema) -> LazyFramePlaceholder:
+         """Create a ``DataFrame`` for a named table.
+
+         Parameters
+         ----------
+         name
+             Table identifier.
+         schema
+             Arrow schema describing the table.
+
+         Returns
+         -------
+         DataFrame referencing the named table.
+         """
+
+         if not isinstance(name, str):  # pyright: ignore[reportUnnecessaryIsInstance]
+             raise ValueError(
+                 f"LazyFramePlaceholder.named_table expected `name` to have type 'str' but it was passed as a '{type(name)}'"
+             )
+         if not isinstance(schema, pyarrow.Schema):  # pyright: ignore[reportUnnecessaryIsInstance]
+             raise ValueError(
+                 f"LazyFramePlaceholder.named_table expected `schema` to have type 'pyarrow.Schema' but it was passed as a '{type(schema)}'"
+             )
+
+         return LazyFramePlaceholder._construct(
+             function_name="named_table",
+             self_dataframe=None,
+             name=name,
+             schema=schema,
+         )
+
+     @classmethod
+     def from_arrow(cls, data: MaterializedTable):
+         """Construct a DataFrame from an in-memory Arrow object.
+
+         Parameters
+         ----------
+         data
+             PyArrow Table or RecordBatch to convert into a DataFrame.
+
+         Returns
+         -------
+         DataFrame backed by the provided Arrow data.
+
+         Examples
+         --------
+         >>> import pyarrow as pa
+         >>> from chalkdf import DataFrame
+         >>> table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+         >>> df = DataFrame.from_arrow(table)
+         """
+
+         assert isinstance(data, (pyarrow.Table, pyarrow.RecordBatch))
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=None,
+             function_name="from_arrow",
+             data=data,
+         )
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         """Construct a DataFrame from a Python dictionary.
+
+         Parameters
+         ----------
+         data
+             Dictionary mapping column names to lists of values.
+
+         Returns
+         -------
+         DataFrame backed by the provided dictionary data.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+         """
+
+         return LazyFramePlaceholder.from_arrow(pyarrow.table(data))
+
+     @classmethod
+     def scan(
+         cls,
+         name: str,
+         input_uris: typing.Sequence[str | Path],
+         *,
+         schema: pyarrow.Schema | None = None,
+     ) -> "LazyFramePlaceholder":
+         """Scan files and return a DataFrame.
+
+         Currently supports CSV (with headers) and Parquet file formats.
+
+         Parameters
+         ----------
+         name
+             Name to assign to the table being scanned.
+         input_uris
+             List of file paths or URIs to scan. Supports local paths and file:// URIs.
+         schema
+             Schema of the data. Required for CSV files, optional for Parquet.
+
+         Returns
+         -------
+         DataFrame that reads data from the specified files.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> # Scan Parquet files
+         >>> df = DataFrame.scan("sales_data", ["data/sales_2024.parquet"])
+         >>> # Scan CSV with explicit schema
+         >>> import pyarrow as pa
+         >>> schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
+         >>> df = DataFrame.scan("users", ["data/users.csv"], schema=schema)
+         """
+         # Accept filesystem paths or URIs; construct file:// URIs manually for
+         # local paths to avoid percent-encoding partition tokens like '='.
+
+         if isinstance(input_uris, str):
+             raise ValueError(
+                 "The LazyFramePlaceholder.scan() function must be called with a list of input_uris, not a single str URI"
+             )
+
+         normalized_input_uris: list[str] = []
+         for p in input_uris:
+             s = p if isinstance(p, str) else str(p)
+             if "://" in s:
+                 normalized_input_uris.append(s)
+             else:
+                 abs_path = str(Path(s).resolve())
+                 if not abs_path.startswith("/"):
+                     normalized_input_uris.append(Path(s).resolve().as_uri())
+                 else:
+                     normalized_input_uris.append("file://" + abs_path)
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=None,
+             function_name="scan",
+             name=name,
+             input_uris=normalized_input_uris,
+             schema=schema,
+         )
+
+     @classmethod
+     def scan_glue_iceberg(
+         cls,
+         glue_table_name: str,
+         schema: typing.Mapping[str, pyarrow.DataType],
+         *,
+         batch_row_count: int = 1_000,
+         aws_catalog_account_id: typing.Optional[str] = None,
+         aws_catalog_region: typing.Optional[str] = None,
+         aws_role_arn: typing.Optional[str] = None,
+         parquet_scan_range_column: typing.Optional[str] = None,
+         custom_partitions: typing.Optional[dict[str, tuple[typing.Literal["date_trunc(day)"], str]]] = None,
+         partition_column: typing.Optional[str] = None,
+     ) -> "LazyFramePlaceholder":
+         """Load data from an AWS Glue Iceberg table.
+
+         Parameters
+         ----------
+         glue_table_name
+             Fully qualified ``database.table`` name.
+         schema
+             Mapping of column names to Arrow types.
+         batch_row_count
+             Number of rows per batch.
+         aws_catalog_account_id
+             AWS account hosting the Glue catalog.
+         aws_catalog_region
+             Region of the Glue catalog.
+         aws_role_arn
+             IAM role to assume for access.
+         parquet_scan_range_column
+             Column used for range-based reads.
+         custom_partitions
+             Additional partition definitions.
+         partition_column
+             Column name representing partitions.
+
+         Returns
+         -------
+         DataFrame backed by the Glue table.
+         """
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=None,
+             function_name="scan_glue_iceberg",
+             glue_table_name=glue_table_name,
+             schema=schema,
+             batch_row_count=batch_row_count,
+             aws_catalog_account_id=aws_catalog_account_id,
+             aws_catalog_region=aws_catalog_region,
+             aws_role_arn=aws_role_arn,
+             filter_predicate=None,
+             parquet_scan_range_column=parquet_scan_range_column,
+             custom_partitions=custom_partitions,
+             partition_column=partition_column,
+         )
+
+     @classmethod
+     def from_sql(
+         cls,
+         query: str,
+     ) -> LazyFramePlaceholder:
+         """Create a ``DataFrame`` from the result of executing a SQL query (DuckDB dialect).
+
+         Parameters
+         ----------
+         query
+             SQL query string (DuckDB dialect).
+
+         Returns
+         -------
+         DataFrame containing the query results.
+         """
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=None,
+             function_name="from_sql",
+             query=query,
+         )
+
+     def with_columns(
+         self,
+         *columns: typing.Mapping[str, Underscore] | Underscore | tuple[str, Underscore],
+     ) -> LazyFramePlaceholder:
+         """Add or replace columns.
+
+         Accepts multiple forms:
+         - A mapping of column names to expressions
+         - Positional tuples of (name, expression)
+         - Bare positional expressions that must include ``.alias(<name>)``
+
+         Parameters
+         ----------
+         *columns
+             Column definitions as mappings, tuples, or aliased expressions.
+
+         Returns
+         -------
+         DataFrame with the specified columns added or replaced.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> # Add a new column using a dict with _ syntax
+         >>> df2 = df.with_columns({"z": _.x + _.y})
+         >>> # Add a new column using alias
+         >>> df3 = df.with_columns((_.x + _.y).alias("z"))
+         """
+         entries: list[tuple[str, Underscore]] = []
+         if len(columns) == 0:
+             raise ValueError("with_columns requires at least one column expression")
+
+         for col in columns:
+             if isinstance(col, tuple):
+                 if len(col) != 2:
+                     raise ValueError(
+                         f"LazyFramePlaceholder.with_columns(...) cannot be called with a tuple having {len(col)} members - expect (name, expression) pairs only."
+                     )
+                 entries.append(col)
+             elif isinstance(col, Underscore):
+                 attempted_alias = _extract_alias_from_underscore(col)
+                 if attempted_alias:
+                     entries.append(attempted_alias)
+                 else:
+                     raise ValueError(
+                         f"Positional with_columns expressions must use `.alias(...)` to set the column name, got expression '{col}' without any alias specified"
+                     )
+             else:
+                 entries.extend((k, v) for k, v in col.items())  # pyright: ignore
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=self,
+             function_name="with_columns",
+             args=tuple(entries),
+         )
+
+     def with_unique_id(self, name: str) -> LazyFramePlaceholder:
+         """Add a monotonically increasing unique identifier column.
+
+         Parameters
+         ----------
+         name
+             Name of the new ID column.
+
+         Returns
+         -------
+         DataFrame with a new column containing unique, incrementing IDs.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [10, 20, 30]})
+         >>> df_with_id = df.with_unique_id("row_id")
+         """
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=self,
+             function_name="with_unique_id",
+             name=name,
+         )
+
+     def filter(self, expr: Underscore) -> LazyFramePlaceholder:
+         """Filter rows based on a boolean expression.
+
+         Parameters
+         ----------
+         expr
+             Boolean expression to filter rows. Only rows where the expression
+             evaluates to True are kept.
+
+         Returns
+         -------
+         DataFrame containing only the rows that match the filter condition.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
+         >>> filtered = df.filter(_.x > 2)
+         """
+
+         return LazyFramePlaceholder._construct(
+             self_dataframe=self,
+             function_name="filter",
+             expr=expr,
+         )
+
+     def slice(self, start: int, length: int | None = None) -> LazyFramePlaceholder:
+         """Return a subset of rows starting at a specific position.
+
+         Parameters
+         ----------
+         start
+             Zero-based index where the slice begins.
+         length
+             Number of rows to include. If None, includes all remaining rows.
+
+         Returns
+         -------
+         DataFrame containing the sliced rows.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4, 5]})
+         >>> # Get rows 1-3 (indices 1, 2, 3)
+         >>> sliced = df.slice(1, 3)
+         """
+
+         # Can't actually express "no limit" with velox limit/offset, but this'll do.
+         return self._construct(
+             self_dataframe=self,
+             function_name="slice",
+             start=start,
+             length=length,
+         )
+
+     def col(self, column: str) -> Underscore:
+         """Get a column expression from the DataFrame.
+
+         Parameters
+         ----------
+         column
+             Name of the column to retrieve.
+
+         Returns
+         -------
+         Column expression (as Underscore) that can be used in operations.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> # Use col to reference columns in expressions
+         >>> df_filtered = df.filter(_.x > 1)
+         """
+         return self.column(column)
+
+     def column(self, column: str) -> Underscore:
+         """Get a column expression from the DataFrame.
+
+         The ``col()`` method is an alias for this method.
+
+         Parameters
+         ----------
+         column
+             Name of the column to retrieve.
+
+         Returns
+         -------
+         Column expression (as Underscore) that can be used in operations.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> df_sum = df.with_columns({"sum": _.x + _.y})
+         """
+
+         # The LazyFramePlaceholder does not currently track schema, so it cannot detect
+         # errors about missing columns.
+         return UnderscoreAttr(UnderscoreRoot(), column)
+
+     def project(self, columns: typing.Mapping[str, Underscore]) -> "LazyFramePlaceholder":
+         """Project to a new set of columns using expressions.
+
+         Parameters
+         ----------
+         columns
+             Mapping of output column names to expressions that define them.
+
+         Returns
+         -------
+         DataFrame with only the specified columns.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> projected = df.project({"sum": _.x + _.y, "x": _.x})
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="project",
+             columns=columns,
+         )
+
+     def select(self, *columns: str, strict: bool = True) -> "LazyFramePlaceholder":
+         """Select existing columns by name.
+
+         Parameters
+         ----------
+         *columns
+             Names of columns to select.
+         strict
+             If True, raise an error if any column doesn't exist. If False,
+             silently ignore missing columns.
+
+         Returns
+         -------
+         DataFrame with only the selected columns.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
+         >>> selected = df.select("x", "y")
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="select",
+             args=columns,
+             strict=strict,
+         )
+
+     def drop(self, *columns: str, strict: bool = True) -> LazyFramePlaceholder:
+         """Drop specified columns from the DataFrame.
+
+         Parameters
+         ----------
+         *columns
+             Names of columns to drop.
+         strict
+             If True, raise an error if any column doesn't exist. If False,
+             silently ignore missing columns.
+
+         Returns
+         -------
+         DataFrame without the dropped columns.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
+         >>> df_dropped = df.drop("z")
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="drop",
+             args=columns,
+             strict=strict,
+         )
+
+     def explode(self, column: str) -> "LazyFramePlaceholder":
+         """Explode a list or array column into multiple rows.
+
+         Each element in the list becomes a separate row, with other column
+         values duplicated.
+
+         Parameters
+         ----------
+         column
+             Name of the list/array column to explode.
+
+         Returns
+         -------
+         DataFrame with the list column expanded into multiple rows.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"id": [1, 2], "items": [[10, 20], [30]]})
+         >>> exploded = df.explode("items")
+         """
+         return self._construct(
+             self_dataframe=self,
+             function_name="explode",
+             column=column,
+         )
+
+     def join(
+         self,
+         other: "LazyFramePlaceholder",
+         on: dict[str, str] | typing.Sequence[str],
+         how: str = "inner",
+         right_suffix: str | None = None,
+     ) -> "LazyFramePlaceholder":
+         """Join this ``DataFrame`` with another.
+
+         Parameters
+         ----------
+         other
+             Right-hand ``DataFrame``.
+         on
+             Column names or mapping of left->right join keys.
+         how
+             Join type (e.g. ``"inner"`` or ``"left"``).
+         right_suffix
+             Optional suffix applied to right-hand columns when names collide.
+
+         Returns
+         -------
+         Resulting ``DataFrame`` after the join.
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="join",
+             other=other,
+             on=on,
+             how=how,
+             right_suffix=right_suffix,
+         )
+
+     def join_asof(
+         self,
+         other: LazyFramePlaceholder,
+         on: str,
+         *,
+         right_on: str | None = None,
+         by: list[str] | None = None,
+         right_by: list[str] | None = None,
+         strategy: typing.Literal["forward", "backward"] = "backward",
+         right_suffix: str | None = None,
+         coalesce: bool = True,
+     ) -> LazyFramePlaceholder:
+         """Perform an as-of join with another DataFrame.
+
+         An as-of join is similar to a left join, but instead of matching on equality,
+         it matches on the nearest key from the right DataFrame. This is commonly used
+         for time-series data where you want to join with the most recent observation.
+
+         **Important**: Both DataFrames must be sorted by the ``on`` column before calling
+         this method. Use ``.order_by(on)`` to sort if needed.
+
+         Parameters
+         ----------
+         other
+             Right-hand DataFrame to join with.
+         on
+             Column name in the left DataFrame to join on (must be sorted).
+         right_on
+             Column name in the right DataFrame to join on. If None, uses ``on``.
+         by
+             Additional exact-match columns for the left DataFrame (optional).
+         right_by
+             Additional exact-match columns for the right DataFrame. If None, uses ``by``.
+         strategy
+             Join strategy - "backward" (default) matches with the most recent past value,
+             "forward" matches with the nearest future value.
+         right_suffix
+             Suffix to add to overlapping column names from the right DataFrame.
+         coalesce
+             Whether to coalesce the join keys (default True).
+
+         Returns
+         -------
+         Resulting DataFrame after the as-of join.
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="join_asof",
+             other=other,
+             on=on,
+             right_on=right_on,
+             by=by,
+             right_by=right_by,
+             strategy=strategy,
+             right_suffix=right_suffix,
+             coalesce=coalesce,
+         )
+
+     # # Window is not yet supported in LazyFramePlaceholder:
+     # def window(
+     #     self,
+     #     by: typing.Sequence[str],
+     #     order_by: typing.Sequence[str | tuple[str, str]],
+     #     *expressions: WindowExpr,
+     # ) -> LazyFramePlaceholder:
+     #     ...
+
+     def agg(self, by: typing.Sequence[str], *aggregations: Underscore) -> "LazyFramePlaceholder":
+         """Group by columns and apply aggregation expressions.
+
+         Parameters
+         ----------
+         by
+             Column names to group by.
+         *aggregations
+             Aggregation expressions to apply to each group (e.g., sum, count, mean).
+
+         Returns
+         -------
+         DataFrame with one row per group containing the aggregated values.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> from chalk.features import _
+         >>> df = DataFrame.from_dict({"group": ["A", "A", "B"], "value": [1, 2, 3]})
+         >>> agg_df = df.agg(["group"], _.value.sum().alias("total"))
+         """
+
+         if isinstance(by, str):
+             raise ValueError(f".agg(...) must be called with a list of group-by columns, not a single str {repr(by)}")
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="agg",
+             args=(by, *aggregations),
+         )
+
+     def distinct_on(self, *columns: str) -> "LazyFramePlaceholder":
+         """Remove duplicate rows based on specified columns.
+
+         For rows with identical values in the specified columns, only one
+         row is kept (chosen arbitrarily).
+
+         Parameters
+         ----------
+         *columns
+             Column names to check for duplicates.
+
+         Returns
+         -------
+         DataFrame with duplicate rows removed.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 1, 2], "y": [10, 20, 30]})
+         >>> unique = df.distinct_on("x")
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="distinct_on",
+             args=columns,
+         )
+
+     def order_by(self, *columns: str | tuple[str, str]) -> LazyFramePlaceholder:
+         """Sort the DataFrame by one or more columns.
+
+         Parameters
+         ----------
+         *columns
+             Column names to sort by. Can be strings (for ascending order) or
+             tuples of (column_name, direction) where direction is "asc" or "desc".
+
+         Returns
+         -------
+         DataFrame sorted by the specified columns.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [3, 1, 2], "y": [30, 10, 20]})
+         >>> # Sort by x ascending
+         >>> sorted_df = df.order_by("x")
+         >>> # Sort by x descending, then y ascending
+         >>> sorted_df = df.order_by(("x", "desc"), "y")
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="order_by",
+             args=columns,
+         )
+
+     def write(
+         self,
+         target_path: str,
+         target_file_name: str | None = None,
+         *,
+         file_format: str = "parquet",
+         serde_parameters: typing.Mapping[str, str] | None = None,
+         compression: str | None = None,
+         ensure_files: bool = False,
+         connector_id: str | None = None,
+     ) -> "LazyFramePlaceholder":
+         """Persist the DataFrame plan using Velox's Hive connector.
+
+         Parameters
+         ----------
+         target_path
+             Directory to write output files.
+         target_file_name
+             Optional explicit file name.
+         file_format
+             Output format (default ``parquet``).
+         serde_parameters
+             Optional SerDe options for text formats.
+         compression
+             Optional compression codec.
+         ensure_files
+             Ensure writers emit files even if no rows were produced.
+         connector_id
+             Optional connector id override.
+
+         Returns
+         -------
+         DataFrame representing the TableWrite operator.
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="write",
+             target_path=target_path,
+             target_file_name=target_file_name,
+             file_format=file_format,
+             serde_parameters=serde_parameters,
+             compression=compression,
+             ensure_files=ensure_files,
+             connector_id=connector_id,
+         )
+
+     def rename(self, new_names: dict[str, str]) -> LazyFramePlaceholder:
+         """Rename columns in the DataFrame.
+
+         Parameters
+         ----------
+         new_names
+             Dictionary mapping old column names to new column names.
+
+         Returns
+         -------
+         DataFrame with renamed columns.
+
+         Examples
+         --------
+         >>> from chalkdf import DataFrame
+         >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> renamed = df.rename({"x": "id", "y": "value"})
+         """
+
+         return self._construct(
+             self_dataframe=self,
+             function_name="rename",
+             new_names=new_names,
+         )
+
+     @staticmethod
+     def from_proto(
+         proto: bytes | dataframe_pb2.DataFramePlan,
+     ) -> "LazyFramePlaceholder":
+         if isinstance(proto, bytes):
+             proto_bytes = proto
+             proto = dataframe_pb2.DataFramePlan()
+             proto.ParseFromString(proto_bytes)
+         return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
+
+
+ def _extract_alias_from_underscore(u: Underscore) -> tuple[str, Underscore] | None:
+     """
+     Given an underscore expression like `_.something.alias("name")`, splits the expression
+     into the alias `"name"` and the underscore expression `_.something`.
+
+     If this expression does not have an alias, returns `None` instead.
+     """
+     if not isinstance(u, UnderscoreCall):
+         return None
+     parent = u._chalk__parent  # pyright: ignore[reportPrivateUsage]
+     if not isinstance(parent, UnderscoreAttr) or parent._chalk__attr != "alias":  # pyright: ignore[reportPrivateUsage]
+         return None
+     if len(u._chalk__args) != 1:  # pyright: ignore[reportPrivateUsage]
+         raise ValueError("alias() must be called with one argument")
+     alias = u._chalk__args[0]  # pyright: ignore[reportPrivateUsage]
+     if not isinstance(alias, str):
+         raise ValueError("argument to alias() must be a string")
+     return (
+         alias,
+         parent._chalk__parent,  # pyright: ignore[reportPrivateUsage]
+     )
+
+
+ def _convert_to_dataframe_proto(
+     lazy_frame: LazyFramePlaceholder,
+ ) -> dataframe_pb2.DataFramePlan:
+     """
+     Converts a `LazyFramePlaceholder` into a proto value, allowing it to be round-tripped
+     or converted into a Chalk DataFrame for execution.
+     """
+     df_constructors: list[dataframe_pb2.DataFrameConstructor] = []
+
+     # This map will memoize the constructor for a specified `LazyFramePlaceholder`.
+     lazy_frame_placeholder_cache: dict[LazyFramePlaceholder, dataframe_pb2.DataFrameIndex] = {}
+
+     def _convert_dataframe(df: LazyFramePlaceholder) -> dataframe_pb2.DataFrameIndex:
+         """
+         Recursively converts a `LazyFramePlaceholder` into a proto message.
+         If this `df` instance has been seen before, returns an index into the `df_constructors`
+         list pointing to the previous construction.
+
+         This allows plans that re-use operators to be efficiently encoded.
+         """
+         if df in lazy_frame_placeholder_cache:
+             return lazy_frame_placeholder_cache[df]
+
+         df_constructor = df._lazy_frame_constructor  # pyright: ignore[reportPrivateUsage]
+         if df_constructor.self_dataframe is None:
+             self_proto = None
+         else:
+             self_proto = _convert_dataframe(df_constructor.self_dataframe)
+
+         proto_args = dataframe_pb2.PyList(
+             list_items=[_convert_arg(arg_value) for arg_value in df_constructor.args],
+         )
+         proto_kwargs = dataframe_pb2.PyDict(
+             dict_entries=[
+                 dataframe_pb2.PyDictEntry(
+                     entry_key=_convert_arg(kwarg_name),
+                     entry_value=_convert_arg(kwarg_value),
+                 )
+                 for kwarg_name, kwarg_value in df_constructor.kwargs.items()
+             ],
+         )
+
+         new_constructor_index = len(df_constructors)
+         df_constructors.append(
+             dataframe_pb2.DataFrameConstructor(
+                 self_operand=self_proto,
+                 function_name=df_constructor.function_name,
+                 args=proto_args,
+                 kwargs=proto_kwargs,
+             )
+         )
+         lazy_frame_placeholder_cache[df] = dataframe_pb2.DataFrameIndex(
+             dataframe_op_index=new_constructor_index,
+         )
+         return lazy_frame_placeholder_cache[df]
+
+     def _convert_arg(value: Any) -> dataframe_pb2.DataFrameOperand:
+         if value is None:
+             return dataframe_pb2.DataFrameOperand(
+                 value_none=dataframe_pb2.PyNone(),
+             )
+         # Check bool before int: bool is a subclass of int, so the int branch
+         # would otherwise encode True/False as integers.
+         if isinstance(value, bool):
+             return dataframe_pb2.DataFrameOperand(
+                 value_bool=value,
+             )
+         if isinstance(value, int):
+             return dataframe_pb2.DataFrameOperand(
+                 value_int=value,
+             )
+         if isinstance(value, str):
+             return dataframe_pb2.DataFrameOperand(
+                 value_string=value,
+             )
+         if isinstance(value, (list, tuple)):
+             return dataframe_pb2.DataFrameOperand(
+                 value_list=dataframe_pb2.PyList(
+                     list_items=[_convert_arg(item) for item in value],
+                 )
+             )
+         if isinstance(value, typing.Mapping):
+             # Loop variables are named to avoid shadowing the outer `value` argument.
+             return dataframe_pb2.DataFrameOperand(
+                 value_dict=dataframe_pb2.PyDict(
+                     dict_entries=[
+                         dataframe_pb2.PyDictEntry(
+                             entry_key=_convert_arg(entry_key),
+                             entry_value=_convert_arg(entry_value),
+                         )
+                         for entry_key, entry_value in value.items()
+                     ]
+                 )
+             )
+         if isinstance(value, LazyFramePlaceholder):
+             # Use the dataframe-specific helper function for this logic.
+             return dataframe_pb2.DataFrameOperand(
+                 value_dataframe_index=_convert_dataframe(value),
+             )
+         if isinstance(value, Underscore):
+             return dataframe_pb2.DataFrameOperand(
+                 underscore_expr=convert_value_to_proto_expr(value),
+             )
+         if isinstance(value, pyarrow.Schema):
+             return dataframe_pb2.DataFrameOperand(
+                 arrow_schema=PrimitiveFeatureConverter.convert_pa_schema_to_proto_schema(value),
+             )
+         if isinstance(value, (pyarrow.Table, pyarrow.RecordBatch)):
+             return dataframe_pb2.DataFrameOperand(
+                 arrow_table=PrimitiveFeatureConverter.convert_arrow_table_to_proto(value),
+             )
+
+         raise ValueError(f"LazyFramePlaceholder function operand is of unsupported type {type(value)}")
+
+     _convert_arg(lazy_frame)
+
+     return dataframe_pb2.DataFramePlan(
+         constructors=df_constructors,
+     )
+
+
+ def _convert_from_dataframe_proto(
+     proto_plan: dataframe_pb2.DataFramePlan,
+     dataframe_class: type,
+ ) -> LazyFramePlaceholder:
+     """
+     Converts a proto plan into a lazy frame.
+     """
+     df_values: list[LazyFramePlaceholder] = []
+
+     def _convert_dataframe_index(df: dataframe_pb2.DataFrameIndex) -> LazyFramePlaceholder:
+         if df.dataframe_op_index < 0 or df.dataframe_op_index >= len(df_values):
+             raise ValueError(
+                 f"DataFrame proto message value is invalid - a DataFrame constructor references operator index {df.dataframe_op_index} but only {len(df_values)} intermediate dataframe value(s) have been defined so far."
+             )
+         return df_values[df.dataframe_op_index]
+
+     def _convert_dataframe(df: dataframe_pb2.DataFrameConstructor) -> LazyFramePlaceholder:
+         if df.HasField("self_operand"):
+             self_operand = _convert_dataframe_index(df.self_operand)
+         else:
+             self_operand = None
+
+         # TODO: validate that function_name is legal.
+         if self_operand is None:
+             method = getattr(dataframe_class, df.function_name)
+         else:
+             method = getattr(self_operand, df.function_name)
+
+         args = [_convert_arg(arg) for arg in df.args.list_items]
+         kwargs = {_convert_arg(entry.entry_key): _convert_arg(entry.entry_value) for entry in df.kwargs.dict_entries}
+
+         return method(*args, **kwargs)
+
+     def _convert_arg(value: dataframe_pb2.DataFrameOperand) -> Any:
+         if value.HasField("value_string"):
+             return value.value_string
+         if value.HasField("value_int"):
+             return value.value_int
+         if value.HasField("value_bool"):
+             return value.value_bool
+         if value.HasField("value_none"):
+             return None
+         if value.HasField("value_list"):
+             return [_convert_arg(item) for item in value.value_list.list_items]
+         if value.HasField("value_dict"):
+             return {
+                 _convert_arg(entry.entry_key): _convert_arg(entry.entry_value)
+                 for entry in value.value_dict.dict_entries
+             }
+         if value.HasField("value_dataframe_index"):
+             return _convert_dataframe_index(value.value_dataframe_index)
+         if value.HasField("arrow_schema"):
+             return PrimitiveFeatureConverter.convert_proto_schema_to_pa_schema(value.arrow_schema)
+         if value.HasField("arrow_table"):
+             return PrimitiveFeatureConverter.convert_arrow_table_from_proto(value.arrow_table)
+         if value.HasField("underscore_expr"):
+             return Underscore._from_proto(value.underscore_expr)  # pyright: ignore[reportPrivateUsage]
+
+         raise ValueError(f"DataFrame operand expression {value} does not have any value set")
+
+     for df in proto_plan.constructors:
+         df_values.append(_convert_dataframe(df))
+
+     if len(df_values) == 0:
+         raise ValueError(
+             "Could not parse LazyFramePlaceholder from proto expression; no dataframe constructors were present in the provided proto message"
+         )
+
+     return df_values[-1]
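
Taken together, the new module lets a plan be described lazily, serialized, and reconstructed. Below is a minimal round-trip sketch (not part of the diff), assuming `LazyFramePlaceholder` is importable from `chalk.df` as its class docstring shows and that `_` supports the comparison and arithmetic operators used in the docstring examples:

    import pyarrow as pa

    from chalk.df import LazyFramePlaceholder
    from chalk.features import _

    # Build a lazy plan. Nothing executes here: each call only records a
    # _LazyFrameConstructor (self, function name, args, kwargs) in the chain.
    df = LazyFramePlaceholder.named_table(
        "input", pa.schema({"id": pa.int64(), "amount": pa.int64()})
    )
    plan = df.filter(_.amount > 100).with_columns({"doubled": _.amount * 2})

    # _to_proto() walks the constructor chain and emits a DataFramePlan whose
    # last constructor is the root of the plan; from_proto() replays the same
    # constructor calls to rebuild an equivalent placeholder, so re-serializing
    # should yield the same message.
    proto = plan._to_proto()
    restored = LazyFramePlaceholder.from_proto(proto.SerializeToString())
    assert restored._to_proto() == proto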