chalkpy 2.90.1__py3-none-any.whl → 2.95.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chalk/__init__.py +2 -1
- chalk/_gen/chalk/arrow/v1/arrow_pb2.py +7 -5
- chalk/_gen/chalk/arrow/v1/arrow_pb2.pyi +6 -0
- chalk/_gen/chalk/artifacts/v1/chart_pb2.py +16 -16
- chalk/_gen/chalk/artifacts/v1/chart_pb2.pyi +4 -0
- chalk/_gen/chalk/artifacts/v1/cron_query_pb2.py +8 -7
- chalk/_gen/chalk/artifacts/v1/cron_query_pb2.pyi +5 -0
- chalk/_gen/chalk/common/v1/offline_query_pb2.py +17 -15
- chalk/_gen/chalk/common/v1/offline_query_pb2.pyi +25 -0
- chalk/_gen/chalk/common/v1/script_task_pb2.py +3 -3
- chalk/_gen/chalk/common/v1/script_task_pb2.pyi +2 -0
- chalk/_gen/chalk/dataframe/__init__.py +0 -0
- chalk/_gen/chalk/dataframe/v1/__init__.py +0 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2.py +48 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2.pyi +123 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.py +4 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/graph/v1/graph_pb2.py +150 -149
- chalk/_gen/chalk/graph/v1/graph_pb2.pyi +25 -0
- chalk/_gen/chalk/graph/v1/sources_pb2.py +94 -84
- chalk/_gen/chalk/graph/v1/sources_pb2.pyi +56 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.py +79 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.pyi +377 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.py +4 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.py +43 -7
- chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.pyi +252 -2
- chalk/_gen/chalk/protosql/v1/sql_service_pb2.py +54 -27
- chalk/_gen/chalk/protosql/v1/sql_service_pb2.pyi +131 -3
- chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.py +45 -0
- chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.pyi +14 -0
- chalk/_gen/chalk/python/v1/types_pb2.py +14 -14
- chalk/_gen/chalk/python/v1/types_pb2.pyi +8 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2.py +76 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2.pyi +156 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.py +258 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.pyi +84 -0
- chalk/_gen/chalk/server/v1/billing_pb2.py +40 -38
- chalk/_gen/chalk/server/v1/billing_pb2.pyi +17 -1
- chalk/_gen/chalk/server/v1/branches_pb2.py +45 -0
- chalk/_gen/chalk/server/v1/branches_pb2.pyi +80 -0
- chalk/_gen/chalk/server/v1/branches_pb2_grpc.pyi +36 -0
- chalk/_gen/chalk/server/v1/builder_pb2.py +358 -288
- chalk/_gen/chalk/server/v1/builder_pb2.pyi +360 -10
- chalk/_gen/chalk/server/v1/builder_pb2_grpc.py +225 -0
- chalk/_gen/chalk/server/v1/builder_pb2_grpc.pyi +60 -0
- chalk/_gen/chalk/server/v1/chart_pb2.py +10 -10
- chalk/_gen/chalk/server/v1/chart_pb2.pyi +18 -2
- chalk/_gen/chalk/server/v1/clickhouse_pb2.py +42 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2.pyi +17 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.py +78 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.pyi +38 -0
- chalk/_gen/chalk/server/v1/cloud_components_pb2.py +141 -119
- chalk/_gen/chalk/server/v1/cloud_components_pb2.pyi +106 -4
- chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2.py +11 -3
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2.pyi +20 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.py +52 -38
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.pyi +62 -1
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.py +90 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.pyi +264 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.py +170 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.pyi +62 -0
- chalk/_gen/chalk/server/v1/deploy_pb2.py +9 -3
- chalk/_gen/chalk/server/v1/deploy_pb2.pyi +12 -0
- chalk/_gen/chalk/server/v1/deploy_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/deploy_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/deployment_pb2.py +6 -6
- chalk/_gen/chalk/server/v1/deployment_pb2.pyi +20 -0
- chalk/_gen/chalk/server/v1/environment_pb2.py +14 -12
- chalk/_gen/chalk/server/v1/environment_pb2.pyi +19 -0
- chalk/_gen/chalk/server/v1/eventbus_pb2.py +4 -2
- chalk/_gen/chalk/server/v1/files_pb2.py +65 -0
- chalk/_gen/chalk/server/v1/files_pb2.pyi +167 -0
- chalk/_gen/chalk/server/v1/files_pb2_grpc.py +4 -0
- chalk/_gen/chalk/server/v1/files_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/server/v1/graph_pb2.py +38 -26
- chalk/_gen/chalk/server/v1/graph_pb2.pyi +58 -0
- chalk/_gen/chalk/server/v1/graph_pb2_grpc.py +47 -0
- chalk/_gen/chalk/server/v1/graph_pb2_grpc.pyi +18 -0
- chalk/_gen/chalk/server/v1/incident_pb2.py +23 -21
- chalk/_gen/chalk/server/v1/incident_pb2.pyi +15 -1
- chalk/_gen/chalk/server/v1/indexing_job_pb2.py +44 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2.pyi +38 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.py +78 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.pyi +38 -0
- chalk/_gen/chalk/server/v1/integrations_pb2.py +11 -9
- chalk/_gen/chalk/server/v1/integrations_pb2.pyi +34 -2
- chalk/_gen/chalk/server/v1/kube_pb2.py +29 -19
- chalk/_gen/chalk/server/v1/kube_pb2.pyi +28 -0
- chalk/_gen/chalk/server/v1/kube_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/kube_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/log_pb2.py +21 -3
- chalk/_gen/chalk/server/v1/log_pb2.pyi +68 -0
- chalk/_gen/chalk/server/v1/log_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/log_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/model_registry_pb2.py +10 -10
- chalk/_gen/chalk/server/v1/model_registry_pb2.pyi +4 -1
- chalk/_gen/chalk/server/v1/plandebug_pb2.py +53 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2.pyi +86 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.py +168 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.pyi +60 -0
- chalk/_gen/chalk/server/v1/queries_pb2.py +66 -66
- chalk/_gen/chalk/server/v1/queries_pb2.pyi +32 -2
- chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.py +12 -12
- chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.pyi +16 -3
- chalk/_gen/chalk/server/v1/scheduler_pb2.py +24 -12
- chalk/_gen/chalk/server/v1/scheduler_pb2.pyi +61 -1
- chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/script_tasks_pb2.py +15 -3
- chalk/_gen/chalk/server/v1/script_tasks_pb2.pyi +22 -0
- chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2.py +75 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2.pyi +142 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.py +349 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.pyi +114 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2.py +48 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2.pyi +150 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.py +123 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.pyi +52 -0
- chalk/_gen/chalk/server/v1/team_pb2.py +154 -141
- chalk/_gen/chalk/server/v1/team_pb2.pyi +30 -2
- chalk/_gen/chalk/server/v1/team_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/team_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/topic_pb2.py +5 -3
- chalk/_gen/chalk/server/v1/topic_pb2.pyi +10 -1
- chalk/_gen/chalk/server/v1/trace_pb2.py +44 -40
- chalk/_gen/chalk/server/v1/trace_pb2.pyi +20 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2.py +62 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2.pyi +75 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.py +221 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.pyi +88 -0
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.py +16 -10
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.pyi +52 -1
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.py +48 -0
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.pyi +20 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2.py +32 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2.pyi +42 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.py +4 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.pyi +4 -0
- chalk/_lsp/error_builder.py +11 -0
- chalk/_version.py +1 -1
- chalk/client/client.py +128 -43
- chalk/client/client_async.py +149 -0
- chalk/client/client_async_impl.py +22 -0
- chalk/client/client_grpc.py +539 -104
- chalk/client/client_impl.py +449 -122
- chalk/client/dataset.py +7 -1
- chalk/client/models.py +98 -0
- chalk/client/serialization/model_serialization.py +92 -9
- chalk/df/LazyFramePlaceholder.py +1154 -0
- chalk/features/_class_property.py +7 -0
- chalk/features/_embedding/embedding.py +1 -0
- chalk/features/_encoding/converter.py +83 -2
- chalk/features/feature_field.py +40 -30
- chalk/features/feature_set_decorator.py +1 -0
- chalk/features/feature_wrapper.py +42 -3
- chalk/features/hooks.py +81 -10
- chalk/features/inference.py +33 -31
- chalk/features/resolver.py +224 -24
- chalk/functions/__init__.py +65 -3
- chalk/gitignore/gitignore_parser.py +5 -1
- chalk/importer.py +142 -68
- chalk/ml/__init__.py +2 -0
- chalk/ml/model_hooks.py +194 -26
- chalk/ml/model_reference.py +56 -8
- chalk/ml/model_version.py +24 -15
- chalk/ml/utils.py +20 -17
- chalk/operators/_utils.py +10 -3
- chalk/parsed/_proto/export.py +22 -0
- chalk/parsed/duplicate_input_gql.py +3 -0
- chalk/parsed/json_conversions.py +20 -14
- chalk/parsed/to_proto.py +16 -4
- chalk/parsed/user_types_to_json.py +31 -10
- chalk/parsed/validation_from_registries.py +182 -0
- chalk/queries/named_query.py +16 -6
- chalk/queries/scheduled_query.py +9 -1
- chalk/serialization/parsed_annotation.py +24 -11
- chalk/sql/__init__.py +18 -0
- chalk/sql/_internal/integrations/databricks.py +55 -17
- chalk/sql/_internal/integrations/mssql.py +127 -62
- chalk/sql/_internal/integrations/redshift.py +4 -0
- chalk/sql/_internal/sql_file_resolver.py +53 -9
- chalk/sql/_internal/sql_source.py +35 -2
- chalk/streams/_kafka_source.py +5 -1
- chalk/streams/_windows.py +15 -2
- chalk/utils/_otel_version.py +13 -0
- chalk/utils/async_helpers.py +2 -2
- chalk/utils/missing_dependency.py +5 -4
- chalk/utils/tracing.py +185 -95
- {chalkpy-2.90.1.dist-info → chalkpy-2.95.3.dist-info}/METADATA +4 -6
- {chalkpy-2.90.1.dist-info → chalkpy-2.95.3.dist-info}/RECORD +202 -146
- {chalkpy-2.90.1.dist-info → chalkpy-2.95.3.dist-info}/WHEEL +0 -0
- {chalkpy-2.90.1.dist-info → chalkpy-2.95.3.dist-info}/entry_points.txt +0 -0
- {chalkpy-2.90.1.dist-info → chalkpy-2.95.3.dist-info}/top_level.txt +0 -0

chalk/df/LazyFramePlaceholder.py (new file)
@@ -0,0 +1,1154 @@

"""Lightweight DataFrame wrapper around Chalk's execution engine.

The :class:`DataFrame` class constructs query plans backed by ``libchalk`` and
can materialize them into Arrow tables. It offers a minimal API similar to
other DataFrame libraries while delegating heavy lifting to the underlying
engine.
"""

from __future__ import annotations

import typing
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, TypeAlias

import pyarrow

import chalk._gen.chalk.dataframe.v1.dataframe_pb2 as dataframe_pb2
import chalk._gen.chalk.expression.v1.expression_pb2 as expression_pb2
from chalk.features._encoding.converter import PrimitiveFeatureConverter
from chalk.features.underscore import (
    Underscore,
    UnderscoreAttr,
    UnderscoreCall,
    UnderscoreRoot,
    convert_value_to_proto_expr,
)

if TYPE_CHECKING:
    from chalk.features import Underscore


MaterializedTable: TypeAlias = pyarrow.RecordBatch | pyarrow.Table


@dataclass
class _LazyFrameConstructor:
    """
    A lazily-called function which will be used to construct a Chalk DataFrame.
    """

    self_dataframe: "Optional[LazyFramePlaceholder]"
    """If present, this is the value of 'self' to call the function on."""

    function_name: str
    """The name of the function to construct the DataFrame."""

    args: tuple[Any, ...]
    """The args to pass to the DataFrame function."""

    kwargs: dict[str, Any]
    """The kwargs to pass to the DataFrame function."""


class LazyFramePlaceholder:
    """
    A lazy representation of a DataFrame operation.

    Examples
    --------
    >>> import pyarrow as pa
    >>> from chalk.df import LazyFramePlaceholder
    >>> from chalk.features import _
    >>> # Create a placeholder for a named input table
    >>> df = LazyFramePlaceholder.named_table('input', pa.schema({"id": pa.int64(), "name": pa.string()}))
    >>> # Apply operations
    >>> filtered = df.filter(_.id > 1)
    """

    @staticmethod
    def _construct(
        *,
        self_dataframe: "Optional[LazyFramePlaceholder]",
        function_name: str,
        args: tuple[Any, ...] = (),
        **kwargs: Any,
    ):
        return LazyFramePlaceholder(
            _internal_constructor=_LazyFrameConstructor(
                self_dataframe=self_dataframe,
                function_name=function_name,
                args=tuple(args),
                kwargs=kwargs,
            )
        )

    def __init__(
        self,
        *,
        _internal_constructor: _LazyFrameConstructor,
    ):
        """
        An internal constructor that creates a `LazyFramePlaceholder` from its underlying operation.
        """

        super().__init__()
        self._lazy_frame_constructor = _internal_constructor

    def __repr__(self) -> str:
        return "LazyFramePlaceholder(...)"

    __str__ = __repr__

    def _to_proto(self) -> dataframe_pb2.DataFramePlan:
        """
        Convert this dataframe plan to a proto message.
        """
        return _convert_to_dataframe_proto(self)

    @staticmethod
    def _from_proto(proto: dataframe_pb2.DataFramePlan) -> "LazyFramePlaceholder":
        """
        Parse a `LazyFramePlaceholder` from the specified proto plan.
        """
        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)

    @classmethod
    def named_table(cls, name: str, schema: pyarrow.Schema) -> LazyFramePlaceholder:
        """Create a ``DataFrame`` for a named table.

        Parameters
        ----------
        name
            Table identifier.
        schema
            Arrow schema describing the table.

        Returns
        -------
        DataFrame referencing the named table.
        """

        if not isinstance(name, str):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError(
                f"LazyFramePlaceholder.named_table expected `name` to have type 'str' but it was passed as a '{type(name)}'"
            )
        if not isinstance(schema, pyarrow.Schema):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError(
                f"LazyFramePlaceholder.named_table expected `schema` to have type 'pyarrow.Schema' but it was passed as a '{type(schema)}'"
            )

        return LazyFramePlaceholder._construct(
            function_name="named_table",
            self_dataframe=None,
            name=name,
            schema=schema,
        )
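
    # A minimal sketch of building a plan against a named table; the table
    # name, schema, and column references below are illustrative assumptions:
    #
    #     schema = pyarrow.schema({"id": pyarrow.int64(), "amount": pyarrow.float64()})
    #     orders = LazyFramePlaceholder.named_table("orders", schema)
    #     plan = orders.filter(_.amount > 100).select("id")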

    @classmethod
    def from_arrow(cls, data: MaterializedTable):
        """Construct a DataFrame from an in-memory Arrow object.

        Parameters
        ----------
        data
            PyArrow Table or RecordBatch to convert into a DataFrame.

        Returns
        -------
        DataFrame backed by the provided Arrow data.

        Examples
        --------
        >>> import pyarrow as pa
        >>> from chalkdf import DataFrame
        >>> table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        >>> df = DataFrame.from_arrow(table)
        """

        assert isinstance(data, (pyarrow.Table, pyarrow.RecordBatch))

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="from_arrow",
            data=data,
        )

    @classmethod
    def from_dict(cls, data: dict):
        """Construct a DataFrame from a Python dictionary.

        Parameters
        ----------
        data
            Dictionary mapping column names to lists of values.

        Returns
        -------
        DataFrame backed by the provided dictionary data.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        """

        return LazyFramePlaceholder.from_arrow(pyarrow.table(data))

    @classmethod
    def scan(
        cls,
        input_uris: typing.Sequence[str | Path],
        *,
        name: typing.Optional[str] = None,
        schema: pyarrow.Schema | None = None,
    ) -> "LazyFramePlaceholder":
        """Scan files and return a DataFrame.

        Currently supports CSV (with headers) and Parquet file formats.

        Parameters
        ----------
        input_uris
            List of file paths or URIs to scan. Supports local paths and file:// URIs.
        name
            Optional name to assign to the table being scanned.
        schema
            Schema of the data. Required for CSV files, optional for Parquet.

        Returns
        -------
        DataFrame that reads data from the specified files.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> # Scan Parquet files
        >>> df = DataFrame.scan(["data/sales_2024.parquet"], name="sales_data")
        >>> # Scan CSV with explicit schema
        >>> import pyarrow as pa
        >>> schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
        >>> df = DataFrame.scan(["data/users.csv"], name="users", schema=schema)
        """
        # Accept filesystem paths or URIs; construct file:// URIs manually for
        # local paths to avoid percent-encoding partition tokens like '='.

        if isinstance(input_uris, str):
            raise ValueError(
                "The LazyFramePlaceholder.scan() function must be called with a list of input_uris, not a single str URI"
            )

        if name is None:
            name = str(uuid.uuid4())

        normalized_input_uris: list[str] = []
        for p in input_uris:
            s = p if isinstance(p, str) else str(p)
            if "://" in s:
                normalized_input_uris.append(s)
            else:
                abs_path = str(Path(s).resolve())
                if not abs_path.startswith("/"):
                    normalized_input_uris.append(Path(s).resolve().as_uri())
                else:
                    normalized_input_uris.append("file://" + abs_path)

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="scan",
            name=name,
            input_uris=normalized_input_uris,
            schema=schema,
        )

    @classmethod
    def scan_glue_iceberg(
        cls,
        glue_table_name: str,
        schema: typing.Mapping[str, pyarrow.DataType],
        *,
        batch_row_count: int = 1_000,
        aws_catalog_account_id: typing.Optional[str] = None,
        aws_catalog_region: typing.Optional[str] = None,
        aws_role_arn: typing.Optional[str] = None,
        parquet_scan_range_column: typing.Optional[str] = None,
        custom_partitions: typing.Optional[dict[str, tuple[typing.Literal["date_trunc(day)"], str]]] = None,
        partition_column: typing.Optional[str] = None,
    ) -> "LazyFramePlaceholder":
        """Load data from an AWS Glue Iceberg table.

        Parameters
        ----------
        glue_table_name
            Fully qualified ``database.table`` name.
        schema
            Mapping of column names to Arrow types.
        batch_row_count
            Number of rows per batch.
        aws_catalog_account_id
            AWS account hosting the Glue catalog.
        aws_catalog_region
            Region of the Glue catalog.
        aws_role_arn
            IAM role to assume for access.
        parquet_scan_range_column
            Column used for range-based reads.
        custom_partitions
            Additional partition definitions.
        partition_column
            Column name representing partitions.

        Returns
        -------
        DataFrame backed by the Glue table.
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="scan_glue_iceberg",
            glue_table_name=glue_table_name,
            schema=schema,
            batch_row_count=batch_row_count,
            aws_catalog_account_id=aws_catalog_account_id,
            aws_catalog_region=aws_catalog_region,
            aws_role_arn=aws_role_arn,
            filter_predicate=None,
            parquet_scan_range_column=parquet_scan_range_column,
            custom_partitions=custom_partitions,
            partition_column=partition_column,
        )
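
    # A minimal sketch, assuming a hypothetical Glue catalog table
    # `analytics.events` and an IAM role with read access to it:
    #
    #     events = LazyFramePlaceholder.scan_glue_iceberg(
    #         "analytics.events",
    #         {"user_id": pyarrow.int64(), "ts": pyarrow.timestamp("us")},
    #         aws_catalog_region="us-east-1",
    #         aws_role_arn="arn:aws:iam::123456789012:role/glue-reader",
    #     )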

    @classmethod
    def from_sql(
        cls,
        query: str,
    ) -> LazyFramePlaceholder:
        """Create a ``DataFrame`` from the result of executing a SQL query (DuckDB dialect).

        Parameters
        ----------
        query
            SQL query string (DuckDB dialect).

        Returns
        -------
        DataFrame containing the query results.
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=None,
            function_name="from_sql",
            query=query,
        )
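
    # A minimal sketch; the table referenced inside the query is an assumed
    # input that must exist when the plan executes:
    #
    #     totals = LazyFramePlaceholder.from_sql(
    #         "SELECT user_id, SUM(amount) AS total FROM orders GROUP BY 1"
    #     )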

    def with_columns(
        self,
        *columns: typing.Mapping[str, Underscore] | Underscore | tuple[str, Underscore],
    ) -> LazyFramePlaceholder:
        """Add or replace columns.

        Accepts multiple forms:
        - A mapping of column names to expressions
        - Positional tuples of (name, expression)
        - Bare positional expressions that must include ``.alias(<name>)``

        Parameters
        ----------
        *columns
            Column definitions as mappings, tuples, or aliased expressions.

        Returns
        -------
        DataFrame with the specified columns added or replaced.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> # Add a new column using a dict with _ syntax
        >>> df2 = df.with_columns({"z": _.x + _.y})
        >>> # Add a new column using alias
        >>> df3 = df.with_columns((_.x + _.y).alias("z"))
        """
        entries: list[tuple[str, Underscore]] = []
        if len(columns) == 0:
            raise ValueError("with_columns requires at least one column expression")

        for col in columns:
            if isinstance(col, (list, tuple)):
                if len(col) != 2:
                    raise ValueError(
                        f"LazyFramePlaceholder.with_columns(...) cannot be called with a tuple having {len(col)} members - expect (name, expression) pairs only."
                    )
                entries.append(col)
            elif isinstance(col, Underscore):
                attempted_alias = _extract_alias_from_underscore(col)
                if attempted_alias:
                    entries.append(attempted_alias)
                else:
                    raise ValueError(
                        f"Positional with_columns expressions must use `.alias(...)` to set the column name, got expression '{col}' without any alias specified"
                    )
            elif isinstance(col, typing.Mapping):  # pyright: ignore[reportUnnecessaryIsInstance]
                entries.extend((k, v) for k, v in col.items())  # pyright: ignore
            else:
                raise ValueError(
                    f"LazyFramePlaceholder.with_columns cannot be called with column argument `{repr(col)}`"
                )

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="with_columns",
            args=tuple(entries),
        )

    def with_unique_id(self, name: str) -> LazyFramePlaceholder:
        """Add a monotonically increasing unique identifier column.

        Parameters
        ----------
        name
            Name of the new ID column.

        Returns
        -------
        DataFrame with a new column containing unique, incrementing IDs.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [10, 20, 30]})
        >>> df_with_id = df.with_unique_id("row_id")
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="with_unique_id",
            name=name,
        )

    def filter(self, expr: Underscore) -> LazyFramePlaceholder:
        """Filter rows based on a boolean expression.

        Parameters
        ----------
        expr
            Boolean expression to filter rows. Only rows where the expression
            evaluates to True are kept.

        Returns
        -------
        DataFrame containing only the rows that match the filter condition.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
        >>> filtered = df.filter(_.x > 2)
        """

        return LazyFramePlaceholder._construct(
            self_dataframe=self,
            function_name="filter",
            expr=expr,
        )

    def slice(self, start: int, length: int | None = None) -> LazyFramePlaceholder:
        """Return a subset of rows starting at a specific position.

        Parameters
        ----------
        start
            Zero-based index where the slice begins.
        length
            Number of rows to include. If None, includes all remaining rows.

        Returns
        -------
        DataFrame containing the sliced rows.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3, 4, 5]})
        >>> # Get rows 1-3 (indices 1, 2, 3)
        >>> sliced = df.slice(1, 3)
        """

        # Can't actually express "no limit" with Velox limit/offset, but this'll do.
        return self._construct(
            self_dataframe=self,
            function_name="slice",
            start=start,
            length=length,
        )

    def col(self, column: str) -> Underscore:
        """Get a column expression from the DataFrame.

        Parameters
        ----------
        column
            Name of the column to retrieve.

        Returns
        -------
        Column expression (as Underscore) that can be used in operations.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> # Use col to reference columns in expressions
        >>> df_filtered = df.filter(_.x > 1)
        """
        return self.column(column)

    def column(self, column: str) -> Underscore:
        """Get a column expression from the DataFrame.

        Alias for the col() method.

        Parameters
        ----------
        column
            Name of the column to retrieve.

        Returns
        -------
        Column expression (as Underscore) that can be used in operations.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> df_sum = df.with_columns({"sum": _.x + _.y})
        """

        # The LazyFramePlaceholder does not currently track schema, so it cannot detect
        # errors about missing columns.
        return UnderscoreAttr(UnderscoreRoot(), column)

    def project(self, columns: typing.Mapping[str, Underscore]) -> "LazyFramePlaceholder":
        """Project to a new set of columns using expressions.

        Parameters
        ----------
        columns
            Mapping of output column names to expressions that define them.

        Returns
        -------
        DataFrame with only the specified columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> projected = df.project({"sum": _.x + _.y, "x": _.x})
        """

        return self._construct(
            self_dataframe=self,
            function_name="project",
            columns=columns,
        )

    def select(self, *columns: str, strict: bool = True) -> "LazyFramePlaceholder":
        """Select existing columns by name.

        Parameters
        ----------
        *columns
            Names of columns to select.
        strict
            If True, raise an error if any column doesn't exist. If False,
            silently ignore missing columns.

        Returns
        -------
        DataFrame with only the selected columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
        >>> selected = df.select("x", "y")
        """

        return self._construct(
            self_dataframe=self,
            function_name="select",
            args=columns,
            strict=strict,
        )

    def drop(self, *columns: str, strict: bool = True) -> LazyFramePlaceholder:
        """Drop specified columns from the DataFrame.

        Parameters
        ----------
        *columns
            Names of columns to drop.
        strict
            If True, raise an error if any column doesn't exist. If False,
            silently ignore missing columns.

        Returns
        -------
        DataFrame without the dropped columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]})
        >>> df_dropped = df.drop("z")
        """

        return self._construct(
            self_dataframe=self,
            function_name="drop",
            args=columns,
            strict=strict,
        )

    def explode(self, column: str) -> "LazyFramePlaceholder":
        """Explode a list or array column into multiple rows.

        Each element in the list becomes a separate row, with other column
        values duplicated.

        Parameters
        ----------
        column
            Name of the list/array column to explode.

        Returns
        -------
        DataFrame with the list column expanded into multiple rows.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"id": [1, 2], "items": [[10, 20], [30]]})
        >>> exploded = df.explode("items")
        """
        return self._construct(
            self_dataframe=self,
            function_name="explode",
            column=column,
        )

    def join(
        self,
        other: "LazyFramePlaceholder",
        on: dict[str, str] | typing.Sequence[str],
        how: str = "inner",
        right_suffix: str | None = None,
    ) -> "LazyFramePlaceholder":
        """Join this ``DataFrame`` with another.

        Parameters
        ----------
        other
            Right-hand ``DataFrame``.
        on
            Column names or mapping of left->right join keys.
        how
            Join type (e.g. ``"inner"`` or ``"left"``).
        right_suffix
            Optional suffix applied to right-hand columns when names collide.

        Returns
        -------
        Resulting ``DataFrame`` after the join.
        """

        return self._construct(
            self_dataframe=self,
            function_name="join",
            other=other,
            on=on,
            how=how,
            right_suffix=right_suffix,
        )
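
    # A minimal sketch of joining on differently named keys; the column names
    # below are assumptions for illustration:
    #
    #     users = LazyFramePlaceholder.from_dict({"id": [1, 2], "name": ["a", "b"]})
    #     orders = LazyFramePlaceholder.from_dict({"user_id": [1, 1, 2], "amount": [5, 7, 9]})
    #     joined = orders.join(users, on={"user_id": "id"}, how="left", right_suffix="_user")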

    def join_asof(
        self,
        other: LazyFramePlaceholder,
        on: str,
        *,
        right_on: str | None = None,
        by: list[str] | None = None,
        right_by: list[str] | None = None,
        strategy: typing.Literal["forward", "backward"] = "backward",
        right_suffix: str | None = None,
        coalesce: bool = True,
    ) -> LazyFramePlaceholder:
        """Perform an as-of join with another DataFrame.

        An as-of join is similar to a left join, but instead of matching on equality,
        it matches on the nearest key from the right DataFrame. This is commonly used
        for time-series data where you want to join with the most recent observation.

        **Important**: Both DataFrames must be sorted by the ``on`` column before calling
        this method. Use ``.order_by(on)`` to sort if needed.

        Parameters
        ----------
        other
            Right-hand DataFrame to join with.
        on
            Column name in the left DataFrame to join on (must be sorted).
        right_on
            Column name in the right DataFrame to join on. If None, uses ``on``.
        by
            Additional exact-match columns for left DataFrame (optional).
        right_by
            Additional exact-match columns for right DataFrame. If None, uses ``by``.
        strategy
            Join strategy - "backward" (default) matches with the most recent past value,
            "forward" matches with the nearest future value. Can also pass AsOfJoinStrategy enum.
        right_suffix
            Suffix to add to overlapping column names from the right DataFrame.
        coalesce
            Whether to coalesce the join keys (default True).

        Returns
        -------
        Resulting DataFrame after the as-of join.
        """
        # Convert string strategy to enum if needed

        return self._construct(
            self_dataframe=self,
            function_name="join_asof",
            other=other,
            on=on,
            right_on=right_on,
            by=by,
            right_by=right_by,
            strategy=strategy,
            right_suffix=right_suffix,
            coalesce=coalesce,
        )

    # # Window is not yet supported in LazyFramePlaceholder:
    # def window(
    #     self,
    #     by: typing.Sequence[str],
    #     order_by: typing.Sequence[str | tuple[str, str]],
    #     *expressions: WindowExpr,
    # ) -> LazyFramePlaceholder:
    #     ...
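
    # A minimal as-of join sketch, assuming both frames are already sorted on
    # their "ts" columns as the docstring requires (names illustrative):
    #
    #     trades = trades.order_by("ts")
    #     quotes = quotes.order_by("ts")
    #     priced = trades.join_asof(quotes, on="ts", by=["symbol"], strategy="backward")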

    def agg(self, by: typing.Sequence[str], *aggregations: Underscore) -> "LazyFramePlaceholder":
        """Group by columns and apply aggregation expressions.

        Parameters
        ----------
        by
            Column names to group by.
        *aggregations
            Aggregation expressions to apply to each group (e.g., sum, count, mean).

        Returns
        -------
        DataFrame with one row per group containing the aggregated values.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> from chalk.features import _
        >>> df = DataFrame.from_dict({"group": ["A", "A", "B"], "value": [1, 2, 3]})
        >>> agg_df = df.agg(["group"], _.value.sum().alias("total"))
        """

        if isinstance(by, str):
            raise ValueError(f".agg(...) must be called with a list of group-by columns, not a single str {repr(by)}")

        return self._construct(
            self_dataframe=self,
            function_name="agg",
            args=(by, *aggregations),
        )

    def distinct_on(self, *columns: str) -> "LazyFramePlaceholder":
        """Remove duplicate rows based on specified columns.

        For rows with identical values in the specified columns, only one
        row is kept (chosen arbitrarily).

        Parameters
        ----------
        *columns
            Column names to check for duplicates.

        Returns
        -------
        DataFrame with duplicate rows removed.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 1, 2], "y": [10, 20, 30]})
        >>> unique = df.distinct_on("x")
        """

        return self._construct(
            self_dataframe=self,
            function_name="distinct_on",
            args=columns,
        )

    def order_by(self, *columns: str | tuple[str, str]) -> LazyFramePlaceholder:
        """Sort the DataFrame by one or more columns.

        Parameters
        ----------
        *columns
            Column names to sort by. Can be strings (for ascending order) or
            tuples of (column_name, direction) where direction is "asc" or "desc".

        Returns
        -------
        DataFrame sorted by the specified columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [3, 1, 2], "y": [30, 10, 20]})
        >>> # Sort by x ascending
        >>> sorted_df = df.order_by("x")
        >>> # Sort by x descending, then y ascending
        >>> sorted_df = df.order_by(("x", "desc"), "y")
        """

        return self._construct(
            self_dataframe=self,
            function_name="order_by",
            args=columns,
        )

    def write(
        self,
        target_path: str,
        target_file_name: str | None = None,
        *,
        file_format: str = "parquet",
        serde_parameters: typing.Mapping[str, str] | None = None,
        compression: str | None = None,
        ensure_files: bool = False,
        connector_id: str | None = None,
    ) -> "LazyFramePlaceholder":
        """Persist the DataFrame plan using Velox's Hive connector.

        Parameters
        ----------
        target_path
            Directory to write output files.
        target_file_name
            Optional explicit file name.
        file_format
            Output format (default ``parquet``).
        serde_parameters
            Optional SerDe options for text formats.
        compression
            Optional compression codec.
        ensure_files
            Ensure writers emit files even if no rows were produced.
        connector_id
            Optional connector id override.

        Returns
        -------
        DataFrame representing the TableWrite operator.
        """

        return self._construct(
            self_dataframe=self,
            function_name="write",
            target_path=target_path,
            target_file_name=target_file_name,
            file_format=file_format,
            serde_parameters=serde_parameters,
            compression=compression,
            ensure_files=ensure_files,
            connector_id=connector_id,
        )
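
    # A minimal write sketch; the output directory is an assumed local path,
    # and the node it builds is a lazy TableWrite operator, not an eager write:
    #
    #     summary = df.agg(["group"], _.value.sum().alias("total"))
    #     out = summary.write("/tmp/chalk-out", file_format="parquet")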

    def rename(self, new_names: dict[str, str]) -> LazyFramePlaceholder:
        """Rename columns in the DataFrame.

        Parameters
        ----------
        new_names
            Dictionary mapping old column names to new column names.

        Returns
        -------
        DataFrame with renamed columns.

        Examples
        --------
        >>> from chalkdf import DataFrame
        >>> df = DataFrame.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        >>> renamed = df.rename({"x": "id", "y": "value"})
        """

        return self._construct(
            self_dataframe=self,
            function_name="rename",
            new_names=new_names,
        )

    @staticmethod
    def from_proto(
        proto: bytes | dataframe_pb2.DataFramePlan,
    ) -> "LazyFramePlaceholder":
        if isinstance(proto, bytes):
            proto_bytes = proto
            proto = dataframe_pb2.DataFramePlan()
            proto.ParseFromString(proto_bytes)
        return _convert_from_dataframe_proto(proto, dataframe_class=LazyFramePlaceholder)
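
# A minimal round-trip sketch, assuming the plan below: serializing via the
# private `_to_proto` and replaying with `from_proto` rebuilds an equivalent
# placeholder plan.
#
#     plan = LazyFramePlaceholder.from_dict({"x": [1, 2]}).filter(_.x > 1)
#     encoded = plan._to_proto().SerializeToString()
#     decoded = LazyFramePlaceholder.from_proto(encoded)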


def _extract_alias_from_underscore(u: Underscore) -> tuple[str, Underscore] | None:
    """
    Given an underscore expression like `_.something.alias("name")`, splits the expression
    into the alias `"name"` and the underscore expression `_.something`.

    If this expression does not have an alias, returns `None` instead.
    """
    if not isinstance(u, UnderscoreCall):
        return None
    parent = u._chalk__parent  # pyright: ignore[reportPrivateUsage]
    if not isinstance(parent, UnderscoreAttr) or parent._chalk__attr != "alias":  # pyright: ignore[reportPrivateUsage]
        return None
    if len(u._chalk__args) != 1:  # pyright: ignore[reportPrivateUsage]
        raise ValueError("alias() must be called with one argument")
    alias = u._chalk__args[0]  # pyright: ignore[reportPrivateUsage]
    if not isinstance(alias, str):
        raise ValueError("argument to alias() must be a string")
    return (
        alias,
        parent._chalk__parent,  # pyright: ignore[reportPrivateUsage]
    )


def _convert_to_dataframe_proto(
    lazy_frame: LazyFramePlaceholder,
) -> dataframe_pb2.DataFramePlan:
    """
    Converts a `LazyFramePlaceholder` into a proto value, allowing it to be round-tripped
    or converted into a Chalk DataFrame for execution.
    """
    df_constructors: list[dataframe_pb2.DataFrameConstructor] = []

    # This map will memoize the constructor for a specified `LazyFramePlaceholder`.
    lazy_frame_placeholder_cache: dict[LazyFramePlaceholder, dataframe_pb2.DataFrameIndex] = {}

    def _convert_dataframe(df: LazyFramePlaceholder) -> dataframe_pb2.DataFrameIndex:
        """
        Recursively converts a `LazyFramePlaceholder` into a proto message.
        If this `df` instance has been seen before, returns an index into the `df_constructors`
        list pointing to the previous construction.

        This allows plans that re-use operators to be efficiently encoded.
        """
        if df in lazy_frame_placeholder_cache:
            return lazy_frame_placeholder_cache[df]

        df_constructor = df._lazy_frame_constructor  # pyright: ignore[reportPrivateUsage]
        if df_constructor.self_dataframe is None:
            self_proto = None
        else:
            self_proto = _convert_dataframe(df_constructor.self_dataframe)

        proto_args = dataframe_pb2.PyList(
            list_items=[_convert_arg(arg_value) for arg_value in df_constructor.args],
        )
        proto_kwargs = dataframe_pb2.PyDict(
            dict_entries=[
                dataframe_pb2.PyDictEntry(
                    entry_key=_convert_arg(kwarg_name),
                    entry_value=_convert_arg(kwarg_value),
                )
                for kwarg_name, kwarg_value in df_constructor.kwargs.items()
            ],
        )

        new_constructor_index = len(df_constructors)
        df_constructors.append(
            dataframe_pb2.DataFrameConstructor(
                self_operand=self_proto,
                function_name=df_constructor.function_name,
                args=proto_args,
                kwargs=proto_kwargs,
            )
        )
        lazy_frame_placeholder_cache[df] = dataframe_pb2.DataFrameIndex(
            dataframe_op_index=new_constructor_index,
        )
        return lazy_frame_placeholder_cache[df]

    def _convert_arg(value: Any) -> dataframe_pb2.DataFrameOperand:
        if value is None:
            return dataframe_pb2.DataFrameOperand(
                value_none=dataframe_pb2.PyNone(),
            )
        # Check bool before int: bool is a subclass of int, so testing int
        # first would encode True/False as integers.
        if isinstance(value, bool):
            return dataframe_pb2.DataFrameOperand(
                value_bool=value,
            )
        if isinstance(value, int):
            return dataframe_pb2.DataFrameOperand(
                value_int=value,
            )
        if isinstance(value, str):
            return dataframe_pb2.DataFrameOperand(
                value_string=value,
            )
        if isinstance(value, (list, tuple)):
            return dataframe_pb2.DataFrameOperand(
                value_list=dataframe_pb2.PyList(
                    list_items=[_convert_arg(item) for item in value],
                )
            )
        if isinstance(value, typing.Mapping):
            return dataframe_pb2.DataFrameOperand(
                value_dict=dataframe_pb2.PyDict(
                    dict_entries=[
                        dataframe_pb2.PyDictEntry(
                            entry_key=_convert_arg(entry_key),
                            entry_value=_convert_arg(entry_value),
                        )
                        for entry_key, entry_value in value.items()
                    ]
                )
            )
        if isinstance(value, LazyFramePlaceholder):
            # Use the dataframe-specific helper function for this logic.
            return dataframe_pb2.DataFrameOperand(
                value_dataframe_index=_convert_dataframe(value),
            )
        if isinstance(value, Underscore):
            return dataframe_pb2.DataFrameOperand(
                underscore_expr=convert_value_to_proto_expr(value),
            )
        if isinstance(value, pyarrow.Schema):
            return dataframe_pb2.DataFrameOperand(
                arrow_schema=PrimitiveFeatureConverter.convert_pa_schema_to_proto_schema(value),
            )
        if isinstance(value, (pyarrow.Table, pyarrow.RecordBatch)):
            return dataframe_pb2.DataFrameOperand(
                arrow_table=PrimitiveFeatureConverter.convert_arrow_table_to_proto(value),
            )

        # If libchalk.chalktable is available in the current environment, then we might encounter
        # a libchalk.chalktable.Expr value which needs to be proto-serialized.
        LibchalkExpr = None
        try:
            from libchalk.chalktable import Expr as LibchalkExpr  # pyright: ignore
        except ImportError:
            pass
        if LibchalkExpr and isinstance(value, LibchalkExpr):
            value_expr_encoded = value.to_proto_bytes()
            return dataframe_pb2.DataFrameOperand(
                libchalk_expr=expression_pb2.LogicalExprNode.FromString(value_expr_encoded),
            )

        raise ValueError(f"LazyFramePlaceholder function operand is of unsupported type {type(value)}")

    _convert_arg(lazy_frame)

    return dataframe_pb2.DataFramePlan(
        constructors=df_constructors,
    )
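
# A small sketch of the memoization above: a frame that feeds two branches of
# a plan is encoded once and referenced by index afterwards (names illustrative):
#
#     base = LazyFramePlaceholder.from_dict({"x": [1, 2, 3]})
#     narrowed = base.filter(_.x > 1)
#     joined = narrowed.join(base, on=["x"])
#     plan = joined._to_proto()  # `base` contributes a single constructor entry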


def _convert_from_dataframe_proto(
    proto_plan: dataframe_pb2.DataFramePlan,
    dataframe_class: type,
) -> LazyFramePlaceholder:
    """
    Converts a proto into a lazy frame.
    """
    df_values: list[LazyFramePlaceholder] = []

    def _convert_dataframe_index(df: dataframe_pb2.DataFrameIndex) -> LazyFramePlaceholder:
        if df.dataframe_op_index < 0 or df.dataframe_op_index >= len(df_values):
            raise ValueError(
                f"DataFrame proto message value is invalid - a DataFrame constructor references operator index {df.dataframe_op_index} but only {len(df_values)} intermediate dataframe value(s) have been defined so far."
            )
        return df_values[df.dataframe_op_index]

    def _convert_dataframe(df: dataframe_pb2.DataFrameConstructor) -> LazyFramePlaceholder:
        if df.HasField("self_operand"):
            self_operand = _convert_dataframe_index(df.self_operand)
        else:
            self_operand = None

        # TODO: validate that function_name is legal.
        if self_operand is None:
            method = getattr(dataframe_class, df.function_name)
        else:
            method = getattr(self_operand, df.function_name)

        args = [_convert_arg(arg) for arg in df.args.list_items]
        kwargs = {_convert_arg(entry.entry_key): _convert_arg(entry.entry_value) for entry in df.kwargs.dict_entries}

        return method(*args, **kwargs)

    def _convert_arg(value: dataframe_pb2.DataFrameOperand) -> Any:
        if value.HasField("value_string"):
            return value.value_string
        if value.HasField("value_int"):
            return value.value_int
        if value.HasField("value_bool"):
            return value.value_bool
        if value.HasField("value_none"):
            return None
        if value.HasField("value_list"):
            return [_convert_arg(item) for item in value.value_list.list_items]
        if value.HasField("value_dict"):
            return {
                _convert_arg(entry.entry_key): _convert_arg(entry.entry_value)
                for entry in value.value_dict.dict_entries
            }
        if value.HasField("value_dataframe_index"):
            return _convert_dataframe_index(value.value_dataframe_index)
        if value.HasField("arrow_schema"):
            return PrimitiveFeatureConverter.convert_proto_schema_to_pa_schema(value.arrow_schema)
        if value.HasField("arrow_table"):
            return PrimitiveFeatureConverter.convert_arrow_table_from_proto(value.arrow_table)
        if value.HasField("underscore_expr"):
            return Underscore._from_proto(value.underscore_expr)  # pyright: ignore[reportPrivateUsage]
        if value.HasField("libchalk_expr"):
            # In order to decode `libchalk_expr` values, `libchalk` must be available as a module.
            try:
                from libchalk.chalktable import Expr as LibchalkExpr  # pyright: ignore
            except ImportError:
                raise ValueError(
                    "A dataframe parameter was encoded holding a libchalk.chalktable.Expr value, but the `libchalk` module is not available in the current environment. To decode this dataframe expression, import libchalk."
                )
            return LibchalkExpr.from_proto_bytes(value.libchalk_expr.SerializeToString())

        raise ValueError(f"DataFrame operand expression {value} does not have any value set")

    for df in proto_plan.constructors:
        df_values.append(_convert_dataframe(df))

    if len(df_values) == 0:
        raise ValueError(
            "Could not parse LazyFramePlaceholder from proto expression; no dataframe constructors were present in the provided proto message"
        )

    return df_values[-1]