chalkpy 2.89.22__py3-none-any.whl → 2.95.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. chalk/__init__.py +2 -1
  2. chalk/_gen/chalk/arrow/v1/arrow_pb2.py +7 -5
  3. chalk/_gen/chalk/arrow/v1/arrow_pb2.pyi +6 -0
  4. chalk/_gen/chalk/artifacts/v1/chart_pb2.py +36 -33
  5. chalk/_gen/chalk/artifacts/v1/chart_pb2.pyi +41 -1
  6. chalk/_gen/chalk/artifacts/v1/cron_query_pb2.py +8 -7
  7. chalk/_gen/chalk/artifacts/v1/cron_query_pb2.pyi +5 -0
  8. chalk/_gen/chalk/common/v1/offline_query_pb2.py +19 -13
  9. chalk/_gen/chalk/common/v1/offline_query_pb2.pyi +37 -0
  10. chalk/_gen/chalk/common/v1/online_query_pb2.py +54 -54
  11. chalk/_gen/chalk/common/v1/online_query_pb2.pyi +13 -1
  12. chalk/_gen/chalk/common/v1/script_task_pb2.py +13 -11
  13. chalk/_gen/chalk/common/v1/script_task_pb2.pyi +19 -1
  14. chalk/_gen/chalk/dataframe/__init__.py +0 -0
  15. chalk/_gen/chalk/dataframe/v1/__init__.py +0 -0
  16. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.py +48 -0
  17. chalk/_gen/chalk/dataframe/v1/dataframe_pb2.pyi +123 -0
  18. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.py +4 -0
  19. chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.pyi +4 -0
  20. chalk/_gen/chalk/graph/v1/graph_pb2.py +150 -149
  21. chalk/_gen/chalk/graph/v1/graph_pb2.pyi +25 -0
  22. chalk/_gen/chalk/graph/v1/sources_pb2.py +94 -84
  23. chalk/_gen/chalk/graph/v1/sources_pb2.pyi +56 -0
  24. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.py +79 -0
  25. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.pyi +377 -0
  26. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.py +4 -0
  27. chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.pyi +4 -0
  28. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.py +43 -7
  29. chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.pyi +252 -2
  30. chalk/_gen/chalk/protosql/v1/sql_service_pb2.py +54 -27
  31. chalk/_gen/chalk/protosql/v1/sql_service_pb2.pyi +131 -3
  32. chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.py +45 -0
  33. chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.pyi +14 -0
  34. chalk/_gen/chalk/python/v1/types_pb2.py +14 -14
  35. chalk/_gen/chalk/python/v1/types_pb2.pyi +8 -0
  36. chalk/_gen/chalk/server/v1/benchmark_pb2.py +76 -0
  37. chalk/_gen/chalk/server/v1/benchmark_pb2.pyi +156 -0
  38. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.py +258 -0
  39. chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.pyi +84 -0
  40. chalk/_gen/chalk/server/v1/billing_pb2.py +40 -38
  41. chalk/_gen/chalk/server/v1/billing_pb2.pyi +17 -1
  42. chalk/_gen/chalk/server/v1/branches_pb2.py +45 -0
  43. chalk/_gen/chalk/server/v1/branches_pb2.pyi +80 -0
  44. chalk/_gen/chalk/server/v1/branches_pb2_grpc.pyi +36 -0
  45. chalk/_gen/chalk/server/v1/builder_pb2.py +372 -272
  46. chalk/_gen/chalk/server/v1/builder_pb2.pyi +479 -12
  47. chalk/_gen/chalk/server/v1/builder_pb2_grpc.py +360 -0
  48. chalk/_gen/chalk/server/v1/builder_pb2_grpc.pyi +96 -0
  49. chalk/_gen/chalk/server/v1/chart_pb2.py +10 -10
  50. chalk/_gen/chalk/server/v1/chart_pb2.pyi +18 -2
  51. chalk/_gen/chalk/server/v1/clickhouse_pb2.py +42 -0
  52. chalk/_gen/chalk/server/v1/clickhouse_pb2.pyi +17 -0
  53. chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.py +78 -0
  54. chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.pyi +38 -0
  55. chalk/_gen/chalk/server/v1/cloud_components_pb2.py +153 -107
  56. chalk/_gen/chalk/server/v1/cloud_components_pb2.pyi +146 -4
  57. chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.py +180 -0
  58. chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.pyi +48 -0
  59. chalk/_gen/chalk/server/v1/cloud_credentials_pb2.py +11 -3
  60. chalk/_gen/chalk/server/v1/cloud_credentials_pb2.pyi +20 -0
  61. chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.py +45 -0
  62. chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.pyi +12 -0
  63. chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.py +59 -35
  64. chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.pyi +127 -1
  65. chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.py +135 -0
  66. chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.pyi +36 -0
  67. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.py +90 -0
  68. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.pyi +264 -0
  69. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.py +170 -0
  70. chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.pyi +62 -0
  71. chalk/_gen/chalk/server/v1/datasets_pb2.py +36 -24
  72. chalk/_gen/chalk/server/v1/datasets_pb2.pyi +71 -2
  73. chalk/_gen/chalk/server/v1/datasets_pb2_grpc.py +45 -0
  74. chalk/_gen/chalk/server/v1/datasets_pb2_grpc.pyi +12 -0
  75. chalk/_gen/chalk/server/v1/deploy_pb2.py +9 -3
  76. chalk/_gen/chalk/server/v1/deploy_pb2.pyi +12 -0
  77. chalk/_gen/chalk/server/v1/deploy_pb2_grpc.py +45 -0
  78. chalk/_gen/chalk/server/v1/deploy_pb2_grpc.pyi +12 -0
  79. chalk/_gen/chalk/server/v1/deployment_pb2.py +20 -15
  80. chalk/_gen/chalk/server/v1/deployment_pb2.pyi +25 -0
  81. chalk/_gen/chalk/server/v1/environment_pb2.py +25 -15
  82. chalk/_gen/chalk/server/v1/environment_pb2.pyi +93 -1
  83. chalk/_gen/chalk/server/v1/eventbus_pb2.py +44 -0
  84. chalk/_gen/chalk/server/v1/eventbus_pb2.pyi +64 -0
  85. chalk/_gen/chalk/server/v1/eventbus_pb2_grpc.py +4 -0
  86. chalk/_gen/chalk/server/v1/eventbus_pb2_grpc.pyi +4 -0
  87. chalk/_gen/chalk/server/v1/files_pb2.py +65 -0
  88. chalk/_gen/chalk/server/v1/files_pb2.pyi +167 -0
  89. chalk/_gen/chalk/server/v1/files_pb2_grpc.py +4 -0
  90. chalk/_gen/chalk/server/v1/files_pb2_grpc.pyi +4 -0
  91. chalk/_gen/chalk/server/v1/graph_pb2.py +41 -3
  92. chalk/_gen/chalk/server/v1/graph_pb2.pyi +191 -0
  93. chalk/_gen/chalk/server/v1/graph_pb2_grpc.py +92 -0
  94. chalk/_gen/chalk/server/v1/graph_pb2_grpc.pyi +32 -0
  95. chalk/_gen/chalk/server/v1/incident_pb2.py +57 -0
  96. chalk/_gen/chalk/server/v1/incident_pb2.pyi +165 -0
  97. chalk/_gen/chalk/server/v1/incident_pb2_grpc.py +4 -0
  98. chalk/_gen/chalk/server/v1/incident_pb2_grpc.pyi +4 -0
  99. chalk/_gen/chalk/server/v1/indexing_job_pb2.py +44 -0
  100. chalk/_gen/chalk/server/v1/indexing_job_pb2.pyi +38 -0
  101. chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.py +78 -0
  102. chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.pyi +38 -0
  103. chalk/_gen/chalk/server/v1/integrations_pb2.py +11 -9
  104. chalk/_gen/chalk/server/v1/integrations_pb2.pyi +34 -2
  105. chalk/_gen/chalk/server/v1/kube_pb2.py +29 -19
  106. chalk/_gen/chalk/server/v1/kube_pb2.pyi +28 -0
  107. chalk/_gen/chalk/server/v1/kube_pb2_grpc.py +45 -0
  108. chalk/_gen/chalk/server/v1/kube_pb2_grpc.pyi +12 -0
  109. chalk/_gen/chalk/server/v1/log_pb2.py +21 -3
  110. chalk/_gen/chalk/server/v1/log_pb2.pyi +68 -0
  111. chalk/_gen/chalk/server/v1/log_pb2_grpc.py +90 -0
  112. chalk/_gen/chalk/server/v1/log_pb2_grpc.pyi +24 -0
  113. chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2.py +73 -0
  114. chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2.pyi +212 -0
  115. chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2_grpc.py +217 -0
  116. chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2_grpc.pyi +74 -0
  117. chalk/_gen/chalk/server/v1/model_registry_pb2.py +10 -10
  118. chalk/_gen/chalk/server/v1/model_registry_pb2.pyi +4 -1
  119. chalk/_gen/chalk/server/v1/monitoring_pb2.py +84 -75
  120. chalk/_gen/chalk/server/v1/monitoring_pb2.pyi +1 -0
  121. chalk/_gen/chalk/server/v1/monitoring_pb2_grpc.py +136 -0
  122. chalk/_gen/chalk/server/v1/monitoring_pb2_grpc.pyi +38 -0
  123. chalk/_gen/chalk/server/v1/offline_queries_pb2.py +32 -10
  124. chalk/_gen/chalk/server/v1/offline_queries_pb2.pyi +73 -0
  125. chalk/_gen/chalk/server/v1/offline_queries_pb2_grpc.py +90 -0
  126. chalk/_gen/chalk/server/v1/offline_queries_pb2_grpc.pyi +24 -0
  127. chalk/_gen/chalk/server/v1/plandebug_pb2.py +53 -0
  128. chalk/_gen/chalk/server/v1/plandebug_pb2.pyi +86 -0
  129. chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.py +168 -0
  130. chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.pyi +60 -0
  131. chalk/_gen/chalk/server/v1/queries_pb2.py +76 -48
  132. chalk/_gen/chalk/server/v1/queries_pb2.pyi +155 -2
  133. chalk/_gen/chalk/server/v1/queries_pb2_grpc.py +180 -0
  134. chalk/_gen/chalk/server/v1/queries_pb2_grpc.pyi +48 -0
  135. chalk/_gen/chalk/server/v1/scheduled_query_pb2.py +4 -2
  136. chalk/_gen/chalk/server/v1/scheduled_query_pb2_grpc.py +45 -0
  137. chalk/_gen/chalk/server/v1/scheduled_query_pb2_grpc.pyi +12 -0
  138. chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.py +12 -6
  139. chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.pyi +75 -2
  140. chalk/_gen/chalk/server/v1/scheduler_pb2.py +24 -12
  141. chalk/_gen/chalk/server/v1/scheduler_pb2.pyi +61 -1
  142. chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.py +90 -0
  143. chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.pyi +24 -0
  144. chalk/_gen/chalk/server/v1/script_tasks_pb2.py +26 -14
  145. chalk/_gen/chalk/server/v1/script_tasks_pb2.pyi +33 -3
  146. chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.py +90 -0
  147. chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.pyi +24 -0
  148. chalk/_gen/chalk/server/v1/sql_interface_pb2.py +75 -0
  149. chalk/_gen/chalk/server/v1/sql_interface_pb2.pyi +142 -0
  150. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.py +349 -0
  151. chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.pyi +114 -0
  152. chalk/_gen/chalk/server/v1/sql_queries_pb2.py +48 -0
  153. chalk/_gen/chalk/server/v1/sql_queries_pb2.pyi +150 -0
  154. chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.py +123 -0
  155. chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.pyi +52 -0
  156. chalk/_gen/chalk/server/v1/team_pb2.py +156 -137
  157. chalk/_gen/chalk/server/v1/team_pb2.pyi +56 -10
  158. chalk/_gen/chalk/server/v1/team_pb2_grpc.py +90 -0
  159. chalk/_gen/chalk/server/v1/team_pb2_grpc.pyi +24 -0
  160. chalk/_gen/chalk/server/v1/topic_pb2.py +5 -3
  161. chalk/_gen/chalk/server/v1/topic_pb2.pyi +10 -1
  162. chalk/_gen/chalk/server/v1/trace_pb2.py +50 -28
  163. chalk/_gen/chalk/server/v1/trace_pb2.pyi +121 -0
  164. chalk/_gen/chalk/server/v1/trace_pb2_grpc.py +135 -0
  165. chalk/_gen/chalk/server/v1/trace_pb2_grpc.pyi +42 -0
  166. chalk/_gen/chalk/server/v1/webhook_pb2.py +9 -3
  167. chalk/_gen/chalk/server/v1/webhook_pb2.pyi +18 -0
  168. chalk/_gen/chalk/server/v1/webhook_pb2_grpc.py +45 -0
  169. chalk/_gen/chalk/server/v1/webhook_pb2_grpc.pyi +12 -0
  170. chalk/_gen/chalk/streaming/v1/debug_service_pb2.py +62 -0
  171. chalk/_gen/chalk/streaming/v1/debug_service_pb2.pyi +75 -0
  172. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.py +221 -0
  173. chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.pyi +88 -0
  174. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.py +19 -7
  175. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.pyi +96 -3
  176. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.py +48 -0
  177. chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.pyi +20 -0
  178. chalk/_gen/chalk/utils/v1/field_change_pb2.py +32 -0
  179. chalk/_gen/chalk/utils/v1/field_change_pb2.pyi +42 -0
  180. chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.py +4 -0
  181. chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.pyi +4 -0
  182. chalk/_lsp/error_builder.py +11 -0
  183. chalk/_monitoring/Chart.py +1 -3
  184. chalk/_version.py +1 -1
  185. chalk/cli.py +5 -10
  186. chalk/client/client.py +178 -64
  187. chalk/client/client_async.py +154 -0
  188. chalk/client/client_async_impl.py +22 -0
  189. chalk/client/client_grpc.py +738 -112
  190. chalk/client/client_impl.py +541 -136
  191. chalk/client/dataset.py +27 -6
  192. chalk/client/models.py +99 -2
  193. chalk/client/serialization/model_serialization.py +126 -10
  194. chalk/config/project_config.py +1 -1
  195. chalk/df/LazyFramePlaceholder.py +1154 -0
  196. chalk/df/ast_parser.py +2 -10
  197. chalk/features/_class_property.py +7 -0
  198. chalk/features/_embedding/embedding.py +1 -0
  199. chalk/features/_embedding/sentence_transformer.py +1 -1
  200. chalk/features/_encoding/converter.py +83 -2
  201. chalk/features/_encoding/pyarrow.py +20 -4
  202. chalk/features/_encoding/rich.py +1 -3
  203. chalk/features/_tensor.py +1 -2
  204. chalk/features/dataframe/_filters.py +14 -5
  205. chalk/features/dataframe/_impl.py +91 -36
  206. chalk/features/dataframe/_validation.py +11 -7
  207. chalk/features/feature_field.py +40 -30
  208. chalk/features/feature_set.py +1 -2
  209. chalk/features/feature_set_decorator.py +1 -0
  210. chalk/features/feature_wrapper.py +42 -3
  211. chalk/features/hooks.py +81 -12
  212. chalk/features/inference.py +65 -10
  213. chalk/features/resolver.py +338 -56
  214. chalk/features/tag.py +1 -3
  215. chalk/features/underscore_features.py +2 -1
  216. chalk/functions/__init__.py +456 -21
  217. chalk/functions/holidays.py +1 -3
  218. chalk/gitignore/gitignore_parser.py +5 -1
  219. chalk/importer.py +186 -74
  220. chalk/ml/__init__.py +6 -2
  221. chalk/ml/model_hooks.py +368 -51
  222. chalk/ml/model_reference.py +68 -10
  223. chalk/ml/model_version.py +34 -21
  224. chalk/ml/utils.py +143 -40
  225. chalk/operators/_utils.py +14 -3
  226. chalk/parsed/_proto/export.py +22 -0
  227. chalk/parsed/duplicate_input_gql.py +4 -0
  228. chalk/parsed/expressions.py +1 -3
  229. chalk/parsed/json_conversions.py +21 -14
  230. chalk/parsed/to_proto.py +16 -4
  231. chalk/parsed/user_types_to_json.py +31 -10
  232. chalk/parsed/validation_from_registries.py +182 -0
  233. chalk/queries/named_query.py +16 -6
  234. chalk/queries/scheduled_query.py +13 -1
  235. chalk/serialization/parsed_annotation.py +25 -12
  236. chalk/sql/__init__.py +221 -0
  237. chalk/sql/_internal/integrations/athena.py +6 -1
  238. chalk/sql/_internal/integrations/bigquery.py +22 -2
  239. chalk/sql/_internal/integrations/databricks.py +61 -18
  240. chalk/sql/_internal/integrations/mssql.py +281 -0
  241. chalk/sql/_internal/integrations/postgres.py +11 -3
  242. chalk/sql/_internal/integrations/redshift.py +4 -0
  243. chalk/sql/_internal/integrations/snowflake.py +11 -2
  244. chalk/sql/_internal/integrations/util.py +2 -1
  245. chalk/sql/_internal/sql_file_resolver.py +55 -10
  246. chalk/sql/_internal/sql_source.py +36 -2
  247. chalk/streams/__init__.py +1 -3
  248. chalk/streams/_kafka_source.py +5 -1
  249. chalk/streams/_windows.py +16 -4
  250. chalk/streams/types.py +1 -2
  251. chalk/utils/__init__.py +1 -3
  252. chalk/utils/_otel_version.py +13 -0
  253. chalk/utils/async_helpers.py +14 -5
  254. chalk/utils/df_utils.py +2 -2
  255. chalk/utils/duration.py +1 -3
  256. chalk/utils/job_log_display.py +538 -0
  257. chalk/utils/missing_dependency.py +5 -4
  258. chalk/utils/notebook.py +255 -2
  259. chalk/utils/pl_helpers.py +190 -37
  260. chalk/utils/pydanticutil/pydantic_compat.py +1 -2
  261. chalk/utils/storage_client.py +246 -0
  262. chalk/utils/threading.py +1 -3
  263. chalk/utils/tracing.py +194 -86
  264. {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/METADATA +53 -21
  265. {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/RECORD +268 -198
  266. {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/WHEEL +0 -0
  267. {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/entry_points.txt +0 -0
  268. {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/top_level.txt +0 -0
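The largest single addition is chalk/df/LazyFramePlaceholder.py (entry 195, +1154 lines), whose full diff is reproduced below. As orientation, here is a minimal sketch of how the new lazy API composes, pieced together from the docstring examples in the diff; the `chalk.df` import path and the underscore (`_`) expression syntax are taken from the class docstring and may differ in your installation:

    from chalk.df import LazyFramePlaceholder
    from chalk.features import _

    # Build a lazy plan from in-memory data; nothing executes yet.
    df = LazyFramePlaceholder.from_dict({"group": ["A", "A", "B"], "value": [1, 2, 3]})

    # Each call records a constructor and returns a new placeholder node.
    plan = (
        df.filter(_.value > 1)
        .with_columns({"doubled": _.value * 2})
        .agg(["group"], _.doubled.sum().alias("total"))
    )

    # The plan serializes to a DataFramePlan proto for execution elsewhere.
    proto = plan._to_proto()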
chalk/df/LazyFramePlaceholder.py
@@ -0,0 +1,1154 @@
+"""Lightweight DataFrame wrapper around Chalk's execution engine.
+
+The :class:`DataFrame` class constructs query plans backed by ``libchalk`` and
+can materialize them into Arrow tables. It offers a minimal API similar to
+other DataFrame libraries while delegating heavy lifting to the underlying
+engine.
+"""
+
+from __future__ import annotations
+
+import typing
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional, TypeAlias
+
+import pyarrow
+
+import chalk._gen.chalk.dataframe.v1.dataframe_pb2 as dataframe_pb2
+import chalk._gen.chalk.expression.v1.expression_pb2 as expression_pb2
+from chalk.features._encoding.converter import PrimitiveFeatureConverter
+from chalk.features.underscore import (
+    Underscore,
+    UnderscoreAttr,
+    UnderscoreCall,
+    UnderscoreRoot,
+    convert_value_to_proto_expr,
+)
+
+if TYPE_CHECKING:
+    from chalk.features import Underscore
+
+
+MaterializedTable: TypeAlias = pyarrow.RecordBatch | pyarrow.Table
+
+
+@dataclass
+class _LazyFrameConstructor:
+    """
+    A lazily-called function which will be used to construct a Chalk DataFrame.
+    """
+
+    self_dataframe: "Optional[LazyFramePlaceholder]"
+    """If present, this is the value of 'self' to call the function on."""
+
+    function_name: str
+    """The name of the function to construct the DataFrame."""
+
+    args: tuple[Any, ...]
+    """The args to pass to the DataFrame function."""
+
+    kwargs: dict[str, Any]
+    """The kwargs to pass to the DataFrame function."""
+
+
+class LazyFramePlaceholder:
+    """
+    A lazy representation of a DataFrame operation.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> from chalk.df import LazyFramePlaceholder
+    >>> from chalk.features import _
+    >>> # Create a frame for a named table
+    >>> df = LazyFramePlaceholder.named_table('input', pa.schema({"id": pa.int64(), "name": pa.string()}))
+    >>> # Apply operations
+    >>> filtered = df.filter(_.x > 1)
+    """
+
+    @staticmethod
+    def _construct(
+        *,
+        self_dataframe: "Optional[LazyFramePlaceholder]",
+        function_name: str,
+        args: tuple[Any, ...] = (),
+        **kwargs: Any,
+    ):
+        return LazyFramePlaceholder(
+            _internal_constructor=_LazyFrameConstructor(
+                self_dataframe=self_dataframe,
+                function_name=function_name,
+                args=tuple(args),
+                kwargs=kwargs,
+            )
+        )
+
+    def __init__(
+        self,
+        *,
+        _internal_constructor: _LazyFrameConstructor,
+    ):
+        """
+        An internal constructor that creates a `LazyFramePlaceholder` from its underlying operation.
+        """
+
+        super().__init__()
+        self._lazy_frame_constructor = _internal_constructor
+
+    def __repr__(self) -> str:
+        return "LazyFramePlaceholder(...)"
+
+    __str__ = __repr__
+
+    def _to_proto(self) -> dataframe_pb2.DataFramePlan:
+        """
+        Convert this dataframe into a proto plan.
+        """
+        return _convert_to_dataframe_proto(self)
+
+    @staticmethod
+    def _from_proto(proto: dataframe_pb2.DataFramePlan) -> "LazyFramePlaceholder":
+        """
+        Parse a `LazyFramePlaceholder` from the specified proto plan.
+        """
+        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
+
+    @classmethod
+    def named_table(cls, name: str, schema: pyarrow.Schema) -> LazyFramePlaceholder:
+        """Create a ``DataFrame`` for a named table.
+
+        Parameters
+        ----------
+        name
+            Table identifier.
+        schema
+            Arrow schema describing the table.
+
+        Returns
+        -------
+        DataFrame referencing the named table.
+        """
+
+        if not isinstance(name, str):  # pyright: ignore[reportUnnecessaryIsInstance]
+            raise ValueError(
+                f"LazyFramePlaceholder.named_table expected `name` to have type 'str' but it was passed as a '{type(name)}'"
+            )
+        if not isinstance(schema, pyarrow.Schema):  # pyright: ignore[reportUnnecessaryIsInstance]
+            raise ValueError(
+                f"LazyFramePlaceholder.named_table expected `schema` to have type 'pyarrow.Schema' but it was passed as a '{type(schema)}'"
+            )
+
+        return LazyFramePlaceholder._construct(
+            function_name="named_table",
+            self_dataframe=None,
+            name=name,
+            schema=schema,
+        )
+
+    @classmethod
+    def from_arrow(cls, data: MaterializedTable):
+        """Construct a DataFrame from an in-memory Arrow object.
+
+        Parameters
+        ----------
+        data
+            PyArrow Table or RecordBatch to convert into a DataFrame.
+
+        Returns
+        -------
+        DataFrame backed by the provided Arrow data.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> from chalkdf import DataFrame
+        >>> table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+        >>> df = DataFrame.from_arrow(table)
+        """
+
+        assert isinstance(data, (pyarrow.Table, pyarrow.RecordBatch))
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=None,
+            function_name="from_arrow",
+            data=data,
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        """Construct a DataFrame from a Python dictionary.
+
+        Parameters
+        ----------
+        data
+            Dictionary mapping column names to lists of values.
+
+        Returns
+        -------
+        DataFrame backed by the provided dictionary data.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+        """
+
+        return LazyFramePlaceholder.from_arrow(pyarrow.table(data))
+
+    @classmethod
+    def scan(
+        cls,
+        input_uris: typing.Sequence[str | Path],
+        *,
+        name: typing.Optional[str] = None,
+        schema: pyarrow.Schema | None = None,
+    ) -> "LazyFramePlaceholder":
+        """Scan files and return a DataFrame.
+
+        Currently supports CSV (with headers) and Parquet file formats.
+
+        Parameters
+        ----------
+        input_uris
+            List of file paths or URIs to scan. Supports local paths and file:// URIs.
+        name
+            Optional name to assign to the table being scanned.
+        schema
+            Schema of the data. Required for CSV files, optional for Parquet.
+
+        Returns
+        -------
+        DataFrame that reads data from the specified files.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> # Scan Parquet files
+        >>> df = DataFrame.scan(["data/sales_2024.parquet"], name="sales_data")
+        >>> # Scan CSV with explicit schema
+        >>> import pyarrow as pa
+        >>> schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
+        >>> df = DataFrame.scan(["data/users.csv"], name="users", schema=schema)
+        """
+        # Accept filesystem paths or URIs; construct file:// URIs manually for
+        # local paths to avoid percent-encoding partition tokens like '='.
+
+        if isinstance(input_uris, str):
+            raise ValueError(
+                "The LazyFramePlaceholder.scan() function must be called with a list of input_uris, not a single str URI"
+            )
+
+        if name is None:
+            name = str(uuid.uuid4())
+
+        normalized_input_uris: list[str] = []
+        for p in input_uris:
+            s = p if isinstance(p, str) else str(p)
+            if "://" in s:
+                normalized_input_uris.append(s)
+            else:
+                abs_path = str(Path(s).resolve())
+                if not abs_path.startswith("/"):
+                    normalized_input_uris.append(Path(s).resolve().as_uri())
+                else:
+                    normalized_input_uris.append("file://" + abs_path)
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=None,
+            function_name="scan",
+            name=name,
+            input_uris=normalized_input_uris,
+            schema=schema,
+        )
+
+    @classmethod
+    def scan_glue_iceberg(
+        cls,
+        glue_table_name: str,
+        schema: typing.Mapping[str, pyarrow.DataType],
+        *,
+        batch_row_count: int = 1_000,
+        aws_catalog_account_id: typing.Optional[str] = None,
+        aws_catalog_region: typing.Optional[str] = None,
+        aws_role_arn: typing.Optional[str] = None,
+        parquet_scan_range_column: typing.Optional[str] = None,
+        custom_partitions: typing.Optional[dict[str, tuple[typing.Literal["date_trunc(day)"], str]]] = None,
+        partition_column: typing.Optional[str] = None,
+    ) -> "LazyFramePlaceholder":
+        """Load data from an AWS Glue Iceberg table.
+
+        Parameters
+        ----------
+        glue_table_name
+            Fully qualified ``database.table`` name.
+        schema
+            Mapping of column names to Arrow types.
+        batch_row_count
+            Number of rows per batch.
+        aws_catalog_account_id
+            AWS account hosting the Glue catalog.
+        aws_catalog_region
+            Region of the Glue catalog.
+        aws_role_arn
+            IAM role to assume for access.
+        parquet_scan_range_column
+            Column used for range-based reads.
+        custom_partitions
+            Additional partition definitions.
+        partition_column
+            Column name representing partitions.
+
+        Returns
+        -------
+        DataFrame backed by the Glue table.
+        """
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=None,
+            function_name="scan_glue_iceberg",
+            glue_table_name=glue_table_name,
+            schema=schema,
+            batch_row_count=batch_row_count,
+            aws_catalog_account_id=aws_catalog_account_id,
+            aws_catalog_region=aws_catalog_region,
+            aws_role_arn=aws_role_arn,
+            filter_predicate=None,
+            parquet_scan_range_column=parquet_scan_range_column,
+            custom_partitions=custom_partitions,
+            partition_column=partition_column,
+        )
+
+    @classmethod
+    def from_sql(
+        cls,
+        query: str,
+    ) -> LazyFramePlaceholder:
+        """Create a ``DataFrame`` from the result of executing a SQL query (DuckDB dialect).
+
+        Parameters
+        ----------
+        query
+            SQL query string (DuckDB dialect).
+
+        Returns
+        -------
+        DataFrame containing the query results.
+        """
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=None,
+            function_name="from_sql",
+            query=query,
+        )
+
+    def with_columns(
+        self,
+        *columns: typing.Mapping[str, Underscore] | Underscore | tuple[str, Underscore],
+    ) -> LazyFramePlaceholder:
+        """Add or replace columns.
+
+        Accepts multiple forms:
+        - A mapping of column names to expressions
+        - Positional tuples of (name, expression)
+        - Bare positional expressions that must include ``.alias(<name>)``
+
+        Parameters
+        ----------
+        *columns
+            Column definitions as mappings, tuples, or aliased expressions.
+
+        Returns
+        -------
+        DataFrame with the specified columns added or replaced.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+        >>> # Add a new column using a dict with _ syntax
+        >>> df2 = df.with_columns({"z": _.x + _.y})
+        >>> # Add a new column using alias
+        >>> df3 = df.with_columns((_.x + _.y).alias("z"))
+        """
+        entries: list[tuple[str, Underscore]] = []
+        if len(columns) == 0:
+            raise ValueError("with_columns requires at least one column expression")
+
+        for col in columns:
+            if isinstance(col, (list, tuple)):
+                if len(col) != 2:
+                    raise ValueError(
+                        f"LazyFramePlaceholder.with_columns(...) cannot be called with a tuple having {len(col)} members - expect (name, expression) pairs only."
+                    )
+                entries.append(col)
+            elif isinstance(col, Underscore):
+                attempted_alias = _extract_alias_from_underscore(col)
+                if attempted_alias:
+                    entries.append(attempted_alias)
+                else:
+                    raise ValueError(
+                        f"Positional with_columns expressions must use `.alias(...)` to set the column name, got expression '{col}' without any alias specified"
+                    )
+            elif isinstance(col, typing.Mapping):  # pyright: ignore[reportUnnecessaryIsInstance]
+                entries.extend((k, v) for k, v in col.items())  # pyright: ignore
+            else:
+                raise ValueError(
+                    f"LazyFramePlaceholder.with_columns cannot be called with column argument `{repr(col)}`"
+                )
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=self,
+            function_name="with_columns",
+            args=tuple(entries),
+        )
+
+    def with_unique_id(self, name: str) -> LazyFramePlaceholder:
+        """Add a monotonically increasing unique identifier column.
+
+        Parameters
+        ----------
+        name
+            Name of the new ID column.
+
+        Returns
+        -------
+        DataFrame with a new column containing unique, incrementing IDs.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [10, 20, 30]})
+        >>> df_with_id = df.with_unique_id("row_id")
+        """
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=self,
+            function_name="with_unique_id",
+            name=name,
+        )
+
+    def filter(self, expr: Underscore) -> LazyFramePlaceholder:
+        """Filter rows based on a boolean expression.
+
+        Parameters
+        ----------
+        expr
+            Boolean expression to filter rows. Only rows where the expression
+            evaluates to True are kept.
+
+        Returns
+        -------
+        DataFrame containing only the rows that match the filter condition.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
+        >>> filtered = df.filter(_.x > 2)
+        """
+
+        return LazyFramePlaceholder._construct(
+            self_dataframe=self,
+            function_name="filter",
+            expr=expr,
+        )
+
+    def slice(self, start: int, length: int | None = None) -> LazyFramePlaceholder:
+        """Return a subset of rows starting at a specific position.
+
+        Parameters
+        ----------
+        start
+            Zero-based index where the slice begins.
+        length
+            Number of rows to include. If None, includes all remaining rows.
+
+        Returns
+        -------
+        DataFrame containing the sliced rows.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4, 5]})
+        >>> # Get rows 1-3 (indices 1, 2, 3)
+        >>> sliced = df.slice(1, 3)
+        """
+
+        # Can't actually express "no limit" with velox limit/offset, but this'll do.
+        return self._construct(
+            self_dataframe=self,
+            function_name="slice",
+            start=start,
+            length=length,
+        )
+
+    def col(self, column: str) -> Underscore:
+        """Get a column expression from the DataFrame.
+
+        Parameters
+        ----------
+        column
+            Name of the column to retrieve.
+
+        Returns
+        -------
+        Column expression (as Underscore) that can be used in operations.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+        >>> # Use col to reference columns in expressions
+        >>> df_filtered = df.filter(_.x > 1)
+        """
+        return self.column(column)
+
+    def column(self, column: str) -> Underscore:
+        """Get a column expression from the DataFrame.
+
+        ``col()`` is a shorthand alias for this method.
+
+        Parameters
+        ----------
+        column
+            Name of the column to retrieve.
+
+        Returns
+        -------
+        Column expression (as Underscore) that can be used in operations.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+        >>> df_sum = df.with_columns({"sum": _.x + _.y})
+        """
+
+        # The LazyFramePlaceholder does not currently track schema, so it cannot detect
+        # errors about missing columns.
+        return UnderscoreAttr(UnderscoreRoot(), column)
+
+    def project(self, columns: typing.Mapping[str, Underscore]) -> "LazyFramePlaceholder":
+        """Project to a new set of columns using expressions.
+
+        Parameters
+        ----------
+        columns
+            Mapping of output column names to expressions that define them.
+
+        Returns
+        -------
+        DataFrame with only the specified columns.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+        >>> projected = df.project({"sum": _.x + _.y, "x": _.x})
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="project",
+            columns=columns,
+        )
+
+    def select(self, *columns: str, strict: bool = True) -> "LazyFramePlaceholder":
+        """Select existing columns by name.
+
+        Parameters
+        ----------
+        *columns
+            Names of columns to select.
+        strict
+            If True, raise an error if any column doesn't exist. If False,
+            silently ignore missing columns.
+
+        Returns
+        -------
+        DataFrame with only the selected columns.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
+        >>> selected = df.select("x", "y")
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="select",
+            args=columns,
+            strict=strict,
+        )
+
+    def drop(self, *columns: str, strict: bool = True) -> LazyFramePlaceholder:
+        """Drop specified columns from the DataFrame.
+
+        Parameters
+        ----------
+        *columns
+            Names of columns to drop.
+        strict
+            If True, raise an error if any column doesn't exist. If False,
+            silently ignore missing columns.
+
+        Returns
+        -------
+        DataFrame without the dropped columns.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
+        >>> df_dropped = df.drop("z")
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="drop",
+            args=columns,
+            strict=strict,
+        )
+
+    def explode(self, column: str) -> "LazyFramePlaceholder":
+        """Explode a list or array column into multiple rows.
+
+        Each element in the list becomes a separate row, with other column
+        values duplicated.
+
+        Parameters
+        ----------
+        column
+            Name of the list/array column to explode.
+
+        Returns
+        -------
+        DataFrame with the list column expanded into multiple rows.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"id": [1, 2], "items": [[10, 20], [30]]})
+        >>> exploded = df.explode("items")
+        """
+        return self._construct(
+            self_dataframe=self,
+            function_name="explode",
+            column=column,
+        )
+
+    def join(
+        self,
+        other: "LazyFramePlaceholder",
+        on: dict[str, str] | typing.Sequence[str],
+        how: str = "inner",
+        right_suffix: str | None = None,
+    ) -> "LazyFramePlaceholder":
+        """Join this ``DataFrame`` with another.
+
+        Parameters
+        ----------
+        other
+            Right-hand ``DataFrame``.
+        on
+            Column names or mapping of left->right join keys.
+        how
+            Join type (e.g. ``"inner"`` or ``"left"``).
+        right_suffix
+            Optional suffix applied to right-hand columns when names collide.
+
+        Returns
+        -------
+        Resulting ``DataFrame`` after the join.
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="join",
+            other=other,
+            on=on,
+            how=how,
+            right_suffix=right_suffix,
+        )
+
+    def join_asof(
+        self,
+        other: LazyFramePlaceholder,
+        on: str,
+        *,
+        right_on: str | None = None,
+        by: list[str] | None = None,
+        right_by: list[str] | None = None,
+        strategy: typing.Literal["forward", "backward"] = "backward",
+        right_suffix: str | None = None,
+        coalesce: bool = True,
+    ) -> LazyFramePlaceholder:
+        """Perform an as-of join with another DataFrame.
+
+        An as-of join is similar to a left join, but instead of matching on equality,
+        it matches on the nearest key from the right DataFrame. This is commonly used
+        for time-series data where you want to join with the most recent observation.
+
+        **Important**: Both DataFrames must be sorted by the ``on`` column before calling
+        this method. Use ``.order_by(on)`` to sort if needed.
+
+        Parameters
+        ----------
+        other
+            Right-hand DataFrame to join with.
+        on
+            Column name in the left DataFrame to join on (must be sorted).
+        right_on
+            Column name in the right DataFrame to join on. If None, uses ``on``.
+        by
+            Additional exact-match columns for the left DataFrame (optional).
+        right_by
+            Additional exact-match columns for the right DataFrame. If None, uses ``by``.
+        strategy
+            Join strategy: "backward" (default) matches with the most recent past value;
+            "forward" matches with the nearest future value.
+        right_suffix
+            Suffix to add to overlapping column names from the right DataFrame.
+        coalesce
+            Whether to coalesce the join keys (default True).
+
+        Returns
+        -------
+        Resulting DataFrame after the as-of join.
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="join_asof",
+            other=other,
+            on=on,
+            right_on=right_on,
+            by=by,
+            right_by=right_by,
+            strategy=strategy,
+            right_suffix=right_suffix,
+            coalesce=coalesce,
+        )
+
+    # # Window is not yet supported in LazyFramePlaceholder:
+    # def window(
+    #     self,
+    #     by: typing.Sequence[str],
+    #     order_by: typing.Sequence[str | tuple[str, str]],
+    #     *expressions: WindowExpr,
+    # ) -> LazyFramePlaceholder:
+    #     ...
+    def agg(self, by: typing.Sequence[str], *aggregations: Underscore) -> "LazyFramePlaceholder":
+        """Group by columns and apply aggregation expressions.
+
+        Parameters
+        ----------
+        by
+            Column names to group by.
+        *aggregations
+            Aggregation expressions to apply to each group (e.g., sum, count, mean).
+
+        Returns
+        -------
+        DataFrame with one row per group containing the aggregated values.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> from chalk.features import _
+        >>> df = DataFrame.from_dict({"group": ["A", "A", "B"], "value": [1, 2, 3]})
+        >>> agg_df = df.agg(["group"], _.value.sum().alias("total"))
+        """
+
+        if isinstance(by, str):
+            raise ValueError(f".agg(...) must be called with a list of group-by columns, not a single str {repr(by)}")
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="agg",
+            args=(by, *aggregations),
+        )
+
+    def distinct_on(self, *columns: str) -> "LazyFramePlaceholder":
+        """Remove duplicate rows based on specified columns.
+
+        For rows with identical values in the specified columns, only one
+        row is kept (chosen arbitrarily).
+
+        Parameters
+        ----------
+        *columns
+            Column names to check for duplicates.
+
+        Returns
+        -------
+        DataFrame with duplicate rows removed.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 1, 2], "y": [10, 20, 30]})
+        >>> unique = df.distinct_on("x")
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="distinct_on",
+            args=columns,
+        )
+
+    def order_by(self, *columns: str | tuple[str, str]) -> LazyFramePlaceholder:
+        """Sort the DataFrame by one or more columns.
+
+        Parameters
+        ----------
+        *columns
+            Column names to sort by. Can be strings (for ascending order) or
+            tuples of (column_name, direction) where direction is "asc" or "desc".
+
+        Returns
+        -------
+        DataFrame sorted by the specified columns.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [3, 1, 2], "y": [30, 10, 20]})
+        >>> # Sort by x ascending
+        >>> sorted_df = df.order_by("x")
+        >>> # Sort by x descending, then y ascending
+        >>> sorted_df = df.order_by(("x", "desc"), "y")
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="order_by",
+            args=columns,
+        )
+
+    def write(
+        self,
+        target_path: str,
+        target_file_name: str | None = None,
+        *,
+        file_format: str = "parquet",
+        serde_parameters: typing.Mapping[str, str] | None = None,
+        compression: str | None = None,
+        ensure_files: bool = False,
+        connector_id: str | None = None,
+    ) -> "LazyFramePlaceholder":
+        """Persist the DataFrame plan using Velox's Hive connector.
+
+        Parameters
+        ----------
+        target_path
+            Directory to write output files.
+        target_file_name
+            Optional explicit file name.
+        file_format
+            Output format (default ``parquet``).
+        serde_parameters
+            Optional SerDe options for text formats.
+        compression
+            Optional compression codec.
+        ensure_files
+            Ensure writers emit files even if no rows were produced.
+        connector_id
+            Optional connector id override.
+
+        Returns
+        -------
+        DataFrame representing the TableWrite operator.
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="write",
+            target_path=target_path,
+            target_file_name=target_file_name,
+            file_format=file_format,
+            serde_parameters=serde_parameters,
+            compression=compression,
+            ensure_files=ensure_files,
+            connector_id=connector_id,
+        )
+
+    def rename(self, new_names: dict[str, str]) -> LazyFramePlaceholder:
+        """Rename columns in the DataFrame.
+
+        Parameters
+        ----------
+        new_names
+            Dictionary mapping old column names to new column names.
+
+        Returns
+        -------
+        DataFrame with renamed columns.
+
+        Examples
+        --------
+        >>> from chalkdf import DataFrame
+        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+        >>> renamed = df.rename({"x": "id", "y": "value"})
+        """
+
+        return self._construct(
+            self_dataframe=self,
+            function_name="rename",
+            new_names=new_names,
+        )
+
+    @staticmethod
+    def from_proto(
+        proto: bytes | dataframe_pb2.DataFramePlan,
+    ) -> "LazyFramePlaceholder":
+        if isinstance(proto, bytes):
+            proto_bytes = proto
+            proto = dataframe_pb2.DataFramePlan()
+            proto.ParseFromString(proto_bytes)
+        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
+
+
+def _extract_alias_from_underscore(u: Underscore) -> tuple[str, Underscore] | None:
+    """
+    Given an underscore expression like `_.something.alias("name")`, splits the expression
+    into the alias `"name"` and the underscore expression `_.something`.
+
+    If this expression does not have an alias, returns `None` instead.
+    """
+    if not isinstance(u, UnderscoreCall):
+        return None
+    parent = u._chalk__parent  # pyright: ignore[reportPrivateUsage]
+    if not isinstance(parent, UnderscoreAttr) or parent._chalk__attr != "alias":  # pyright: ignore[reportPrivateUsage]
+        return None
+    if len(u._chalk__args) != 1:  # pyright: ignore[reportPrivateUsage]
+        raise ValueError("alias() must be called with one argument")
+    alias = u._chalk__args[0]  # pyright: ignore[reportPrivateUsage]
+    if not isinstance(alias, str):
+        raise ValueError("argument to alias() must be a string")
+    return (
+        alias,
+        parent._chalk__parent,  # pyright: ignore[reportPrivateUsage]
+    )
+
+
+def _convert_to_dataframe_proto(
+    lazy_frame: LazyFramePlaceholder,
+) -> dataframe_pb2.DataFramePlan:
+    """
+    Converts a `LazyFramePlaceholder` into a proto value, allowing it to be round-tripped
+    or converted into a Chalk DataFrame for execution.
+    """
+    df_constructors: list[dataframe_pb2.DataFrameConstructor] = []
+
+    # This map will memoize the constructor for a specified `LazyFramePlaceholder`.
+    lazy_frame_placeholder_cache: dict[LazyFramePlaceholder, dataframe_pb2.DataFrameIndex] = {}
+
+    def _convert_dataframe(df: LazyFramePlaceholder) -> dataframe_pb2.DataFrameIndex:
+        """
+        Recursively converts a `LazyFramePlaceholder` into a proto message.
+        If this `df` instance has been seen before, returns an index into the `df_constructors`
+        list pointing to the previous construction.
+
+        This allows plans that re-use operators to be efficiently encoded.
+        """
+        if df in lazy_frame_placeholder_cache:
+            return lazy_frame_placeholder_cache[df]
+
+        df_constructor = df._lazy_frame_constructor  # pyright: ignore[reportPrivateUsage]
+        if df_constructor.self_dataframe is None:
+            self_proto = None
+        else:
+            self_proto = _convert_dataframe(df_constructor.self_dataframe)
+
+        proto_args = dataframe_pb2.PyList(
+            list_items=[_convert_arg(arg_value) for arg_value in df_constructor.args],
+        )
+        proto_kwargs = dataframe_pb2.PyDict(
+            dict_entries=[
+                dataframe_pb2.PyDictEntry(
+                    entry_key=_convert_arg(kwarg_name),
+                    entry_value=_convert_arg(kwarg_value),
+                )
+                for kwarg_name, kwarg_value in df_constructor.kwargs.items()
+            ],
+        )
+
+        new_constructor_index = len(df_constructors)
+        df_constructors.append(
+            dataframe_pb2.DataFrameConstructor(
+                self_operand=self_proto,
+                function_name=df_constructor.function_name,
+                args=proto_args,
+                kwargs=proto_kwargs,
+            )
+        )
+        lazy_frame_placeholder_cache[df] = dataframe_pb2.DataFrameIndex(
+            dataframe_op_index=new_constructor_index,
+        )
+        return lazy_frame_placeholder_cache[df]
+
+    def _convert_arg(value: Any) -> dataframe_pb2.DataFrameOperand:
+        if value is None:
+            return dataframe_pb2.DataFrameOperand(
+                value_none=dataframe_pb2.PyNone(),
+            )
+        # Check bool before int: `bool` is a subclass of `int`, so testing int
+        # first would encode True/False as integers.
+        if isinstance(value, bool):
+            return dataframe_pb2.DataFrameOperand(
+                value_bool=value,
+            )
+        if isinstance(value, int):
+            return dataframe_pb2.DataFrameOperand(
+                value_int=value,
+            )
+        if isinstance(value, str):
+            return dataframe_pb2.DataFrameOperand(
+                value_string=value,
+            )
+        if isinstance(value, (list, tuple)):
+            return dataframe_pb2.DataFrameOperand(
+                value_list=dataframe_pb2.PyList(
+                    list_items=[_convert_arg(item) for item in value],
+                )
+            )
+        if isinstance(value, typing.Mapping):
+            return dataframe_pb2.DataFrameOperand(
+                value_dict=dataframe_pb2.PyDict(
+                    dict_entries=[
+                        dataframe_pb2.PyDictEntry(
+                            entry_key=_convert_arg(entry_key),
+                            entry_value=_convert_arg(entry_value),
+                        )
+                        for entry_key, entry_value in value.items()
+                    ]
+                )
+            )
+        if isinstance(value, LazyFramePlaceholder):
+            # Use the dataframe-specific helper function for this logic.
+            return dataframe_pb2.DataFrameOperand(
+                value_dataframe_index=_convert_dataframe(value),
+            )
+        if isinstance(value, Underscore):
+            return dataframe_pb2.DataFrameOperand(
+                underscore_expr=convert_value_to_proto_expr(value),
+            )
+        if isinstance(value, pyarrow.Schema):
+            return dataframe_pb2.DataFrameOperand(
+                arrow_schema=PrimitiveFeatureConverter.convert_pa_schema_to_proto_schema(value),
+            )
+        if isinstance(value, (pyarrow.Table, pyarrow.RecordBatch)):
+            return dataframe_pb2.DataFrameOperand(
+                arrow_table=PrimitiveFeatureConverter.convert_arrow_table_to_proto(value),
+            )
+
+        # If libchalk.chalktable is available in the current environment, then we might encounter
+        # a libchalk.chalktable.Expr value which needs to be proto-serialized.
+        LibchalkExpr = None
+        try:
+            from libchalk.chalktable import Expr as LibchalkExpr  # pyright: ignore
+        except ImportError:
+            pass
+        if LibchalkExpr and isinstance(value, LibchalkExpr):
+            value_expr_encoded = value.to_proto_bytes()
+            return dataframe_pb2.DataFrameOperand(
+                libchalk_expr=expression_pb2.LogicalExprNode.FromString(value_expr_encoded),
+            )
+
+        raise ValueError(f"LazyFramePlaceholder function operand is of unsupported type {type(value)}")
+
+    _convert_arg(lazy_frame)
+
+    return dataframe_pb2.DataFramePlan(
+        constructors=df_constructors,
+    )
+
+
+def _convert_from_dataframe_proto(
+    proto_plan: dataframe_pb2.DataFramePlan,
+    dataframe_class: type,
+) -> LazyFramePlaceholder:
+    """
+    Converts a proto plan back into a lazy frame.
+    """
+    df_values: list[LazyFramePlaceholder] = []
+
+    def _convert_dataframe_index(df: dataframe_pb2.DataFrameIndex) -> LazyFramePlaceholder:
+        if df.dataframe_op_index < 0 or df.dataframe_op_index >= len(df_values):
+            raise ValueError(
+                f"DataFrame proto message value is invalid - a DataFrame constructor references operator index {df.dataframe_op_index} but only {len(df_values)} intermediate dataframe value(s) have been defined so far."
+            )
+        return df_values[df.dataframe_op_index]
+
+    def _convert_dataframe(df: dataframe_pb2.DataFrameConstructor) -> LazyFramePlaceholder:
+        if df.HasField("self_operand"):
+            self_operand = _convert_dataframe_index(df.self_operand)
+        else:
+            self_operand = None
+
+        # TODO: validate that function_name is legal.
+        if self_operand is None:
+            method = getattr(dataframe_class, df.function_name)
+        else:
+            method = getattr(self_operand, df.function_name)
+
+        args = [_convert_arg(arg) for arg in df.args.list_items]
+        kwargs = {_convert_arg(entry.entry_key): _convert_arg(entry.entry_value) for entry in df.kwargs.dict_entries}
+
+        return method(*args, **kwargs)
+
+    def _convert_arg(value: dataframe_pb2.DataFrameOperand) -> Any:
+        if value.HasField("value_string"):
+            return value.value_string
+        if value.HasField("value_int"):
+            return value.value_int
+        if value.HasField("value_bool"):
+            return value.value_bool
+        if value.HasField("value_none"):
+            return None
+        if value.HasField("value_list"):
+            return [_convert_arg(item) for item in value.value_list.list_items]
+        if value.HasField("value_dict"):
+            return {
+                _convert_arg(entry.entry_key): _convert_arg(entry.entry_value)
+                for entry in value.value_dict.dict_entries
+            }
+        if value.HasField("value_dataframe_index"):
+            return _convert_dataframe_index(value.value_dataframe_index)
+        if value.HasField("arrow_schema"):
+            return PrimitiveFeatureConverter.convert_proto_schema_to_pa_schema(value.arrow_schema)
+        if value.HasField("arrow_table"):
+            return PrimitiveFeatureConverter.convert_arrow_table_from_proto(value.arrow_table)
+        if value.HasField("underscore_expr"):
+            return Underscore._from_proto(value.underscore_expr)  # pyright: ignore[reportPrivateUsage]
+        if value.HasField("libchalk_expr"):
+            # In order to decode `libchalk_expr` values, `libchalk` must be available as a module.
+            try:
+                from libchalk.chalktable import Expr as LibchalkExpr  # pyright: ignore
+            except ImportError:
+                raise ValueError(
+                    "A dataframe parameter was encoded holding a libchalk.chalktable.Expr value, but the `libchalk` module is not available in the current environment. To decode this dataframe expression, import libchalk."
+                )
+            return LibchalkExpr.from_proto_bytes(value.libchalk_expr.SerializeToString())
+
+        raise ValueError(f"DataFrame operand expression {value} does not have any value set")
+
+    for df in proto_plan.constructors:
+        df_values.append(_convert_dataframe(df))
+
+    if len(df_values) == 0:
+        raise ValueError(
+            "Could not parse LazyFramePlaceholder from proto expression; no dataframe constructors were present in the provided proto message"
+        )
+
+    return df_values[-1]
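
Since `from_proto` replays each recorded constructor call against the dataframe class, a serialized plan can be rebuilt in another process. A small round-trip sketch, assuming only the functions defined in the diff above (the `chalk.df` import path is taken from the class docstring):

    import pyarrow as pa

    from chalk.df import LazyFramePlaceholder
    from chalk.features import _

    schema = pa.schema({"id": pa.int64(), "name": pa.string()})
    plan = LazyFramePlaceholder.named_table("input", schema).filter(_.id > 0)

    # DataFramePlan is a protobuf message, so it serializes to bytes, and
    # from_proto accepts either the message or its encoded bytes.
    payload = plan._to_proto().SerializeToString()
    restored = LazyFramePlaceholder.from_proto(payload)
    assert isinstance(restored, LazyFramePlaceholder)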