flyteplugins-polars 2.0.0b51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyteplugins/polars/__init__.py +0 -0
- flyteplugins/polars/df_transformer.py +240 -0
- flyteplugins_polars-2.0.0b51.dist-info/METADATA +25 -0
- flyteplugins_polars-2.0.0b51.dist-info/RECORD +7 -0
- flyteplugins_polars-2.0.0b51.dist-info/WHEEL +5 -0
- flyteplugins_polars-2.0.0b51.dist-info/entry_points.txt +2 -0
- flyteplugins_polars-2.0.0b51.dist-info/top_level.txt +1 -0
flyteplugins/polars/__init__.py
File without changes
flyteplugins/polars/df_transformer.py
@@ -0,0 +1,240 @@
```python
import functools
import os
import typing
from pathlib import Path

import flyte.storage as storage
from flyte._logging import logger
from flyte._utils import lazy_module
from flyte.io._dataframe.dataframe import PARQUET, DataFrame
from flyte.io.extend import (
    DataFrameDecoder,
    DataFrameEncoder,
    DataFrameTransformerEngine,
)
from flyteidl2.core import literals_pb2, types_pb2

if typing.TYPE_CHECKING:
    import polars as pl
else:
    pl = lazy_module("polars")


def get_polars_storage_options(protocol: typing.Optional[str], anonymous: bool = False) -> typing.Dict[str, str]:
    """
    Get storage options in a format compatible with Polars.

    Polars requires storage_options to be a flat dict with string keys and values,
    unlike fsspec which accepts nested dicts and complex objects.
    """
    from flyte._initialize import get_storage
    from flyte.errors import InitializationError

    if not protocol:
        return {}

    try:
        storage_config = get_storage()
    except InitializationError:
        storage_config = None

    match protocol:
        case "s3":
            from flyte.storage import S3

            if storage_config and isinstance(storage_config, S3):
                s3_config = storage_config
            else:
                s3_config = S3.auto()

            opts: typing.Dict[str, str] = {}
            if s3_config.access_key_id:
                opts["aws_access_key_id"] = s3_config.access_key_id
            if s3_config.secret_access_key:
                opts["aws_secret_access_key"] = s3_config.secret_access_key
            if s3_config.region:
                opts["aws_region"] = s3_config.region
            if s3_config.endpoint:
                opts["aws_endpoint_url"] = s3_config.endpoint
            if anonymous:
                opts["aws_skip_signature"] = "true"
            return opts

        case "gs":
            # GCS typically uses application default credentials
            # Polars supports this automatically
            return {}

        case "abfs" | "abfss":
            from flyte.storage import ABFS

            if storage_config and isinstance(storage_config, ABFS):
                abfs_config = storage_config
            else:
                abfs_config = ABFS.auto()

            opts = {}
            if abfs_config.account_name:
                opts["azure_storage_account_name"] = abfs_config.account_name
            if abfs_config.account_key:
                opts["azure_storage_account_key"] = abfs_config.account_key
            if abfs_config.tenant_id:
                opts["azure_storage_tenant_id"] = abfs_config.tenant_id
            if abfs_config.client_id:
                opts["azure_storage_client_id"] = abfs_config.client_id
            if abfs_config.client_secret:
                opts["azure_storage_client_secret"] = abfs_config.client_secret
            return opts

        case _:
            return {}


class PolarsToParquetEncodingHandler(DataFrameEncoder):
    def __init__(self):
        super().__init__(pl.DataFrame, None, PARQUET)

    async def encode(
        self,
        dataframe: DataFrame,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        if not dataframe.uri:
            from flyte._context import internal_ctx

            ctx = internal_ctx()
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, dataframe.uri)

        if not storage.is_remote(uri):
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = os.path.join(uri, f"{0:05}.parquet")
        df = typing.cast(pl.DataFrame, dataframe.val)

        # Polars requires flat string key-value storage options
        filesystem = storage.get_underlying_filesystem(path=path)
        storage_options = get_polars_storage_options(protocol=filesystem.protocol)
        df.write_parquet(path, storage_options=storage_options or None)

        structured_dataset_type.format = PARQUET
        return literals_pb2.StructuredDataset(
            uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
        )


class ParquetToPolarsDecodingHandler(DataFrameDecoder):
    def __init__(self):
        super().__init__(pl.DataFrame, None, PARQUET)

    async def decode(
        self,
        flyte_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pl.DataFrame":
        uri = flyte_value.uri
        columns = None
        if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
            columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]

        parquet_path = os.path.join(uri, f"{0:05}.parquet")
        filesystem = storage.get_underlying_filesystem(path=parquet_path)
        storage_options = get_polars_storage_options(protocol=filesystem.protocol)
        try:
            return pl.read_parquet(parquet_path, columns=columns, storage_options=storage_options or None)
        except Exception as exc:
            if exc.__class__.__name__ == "NoCredentialsError":
                logger.debug("S3 source detected, attempting anonymous S3 access")
                storage_options = get_polars_storage_options(protocol=filesystem.protocol, anonymous=True)
                return pl.read_parquet(parquet_path, columns=columns, storage_options=storage_options or None)
            else:
                raise


class PolarsLazyFrameToParquetEncodingHandler(DataFrameEncoder):
    def __init__(self):
        super().__init__(pl.LazyFrame, None, PARQUET)

    async def encode(
        self,
        dataframe: DataFrame,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        if not dataframe.uri:
            from flyte._context import internal_ctx

            ctx = internal_ctx()
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, dataframe.uri)

        if not storage.is_remote(uri):
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = f"{os.path.join(uri, f'{0:05}')}.parquet"
        lazy_df = typing.cast(pl.LazyFrame, dataframe.val)

        # Use sink_parquet for efficient lazy writing
        filesystem = storage.get_underlying_filesystem(path=uri)
        path = path + filesystem.sep
        storage_options = get_polars_storage_options(protocol=filesystem.protocol)

        # TODO: support partitioning, which will entail user-defined behavior
        lazy_df.sink_parquet(path, storage_options=storage_options or None)

        structured_dataset_type.format = PARQUET
        return literals_pb2.StructuredDataset(
            uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
        )


class ParquetToPolarsLazyFrameDecodingHandler(DataFrameDecoder):
    def __init__(self):
        super().__init__(pl.LazyFrame, None, PARQUET)

    async def decode(
        self,
        flyte_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pl.LazyFrame":
        uri = flyte_value.uri
        columns = None
        if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
            columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]

        parquet_path = os.path.join(uri, f"{0:05}.parquet")

        filesystem = storage.get_underlying_filesystem(path=parquet_path)
        storage_options = get_polars_storage_options(protocol=filesystem.protocol)
        try:
            # TODO: support partitioning, which will entail user-defined behavior
            lf = pl.scan_parquet(parquet_path, storage_options=storage_options or None)
            if columns:
                lf = lf.select(*columns)
            return lf
        except Exception as exc:
            if exc.__class__.__name__ == "NoCredentialsError":
                logger.debug("S3 source detected, attempting anonymous S3 access")
                storage_options = get_polars_storage_options(protocol=filesystem.protocol, anonymous=True)
                lf = pl.scan_parquet(parquet_path, storage_options=storage_options or None)
                if columns:
                    lf = lf.select(*columns)
                return lf
            else:
                raise


@functools.lru_cache(maxsize=None)
def register_polars_df_transformers():
    """Register Polars DataFrame encoders and decoders with the DataFrameTransformerEngine.

    This function is called automatically via the flyte.plugins.types entry point
    when flyte.init() is called with load_plugin_type_transformers=True (the default).
    """
    DataFrameTransformerEngine.register(PolarsToParquetEncodingHandler(), default_format_for_type=True)
    DataFrameTransformerEngine.register(ParquetToPolarsDecodingHandler(), default_format_for_type=True)
    DataFrameTransformerEngine.register(PolarsLazyFrameToParquetEncodingHandler(), default_format_for_type=True)
    DataFrameTransformerEngine.register(ParquetToPolarsLazyFrameDecodingHandler(), default_format_for_type=True)


# Also register at module import time for backwards compatibility
register_polars_df_transformers()
```
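As the `get_polars_storage_options` docstring notes, Polars wants a flat dict of string keys and values rather than fsspec's nested config objects. A minimal sketch of the shape this helper produces for S3 and how it feeds a plain Polars call; the bucket, credentials, and region below are hypothetical placeholders, not values taken from this package:

```python
import polars as pl

# Hypothetical result of get_polars_storage_options(protocol="s3") when the
# resolved S3 config carries explicit credentials and a region:
storage_options = {
    "aws_access_key_id": "AKIAEXAMPLE",         # from s3_config.access_key_id
    "aws_secret_access_key": "secret-example",  # from s3_config.secret_access_key
    "aws_region": "us-east-1",                  # from s3_config.region
}

# Polars consumes exactly this flat string-to-string dict.
df = pl.read_parquet("s3://example-bucket/data/00000.parquet", storage_options=storage_options)
```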
flyteplugins_polars-2.0.0b51.dist-info/METADATA
@@ -0,0 +1,25 @@
Metadata-Version: 2.4
Name: flyteplugins-polars
Version: 2.0.0b51
Summary: polars plugin for flyte
Author-email: Flyte Contributors <admin@flyte.org>
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: polars
Requires-Dist: flyte

# Polars Plugin

This plugin provides native support for **Polars DataFrames and LazyFrames** in Flyte, enabling efficient data processing with Polars' high-performance DataFrame library.

The plugin supports:
- `polars.DataFrame` - Eager evaluation DataFrames
- `polars.LazyFrame` - Lazy evaluation DataFrames for optimized query execution

Both types can be serialized to and deserialized from Parquet format, making them ideal for large-scale data processing workflows.

To install the plugin, run the following command:

```bash
pip install --pre flyteplugins-polars
```
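Once installed, the package's entry point registers the handlers when `flyte.init()` runs (per the `register_polars_df_transformers` docstring above), so Polars frames can be passed between tasks directly. A short usage sketch, assuming the flyte 2.x `TaskEnvironment` API; the environment name and task bodies are illustrative only:

```python
import flyte
import polars as pl

# Hypothetical environment name; any TaskEnvironment works the same way.
env = flyte.TaskEnvironment(name="polars-demo")

@env.task
async def produce() -> pl.DataFrame:
    # The plugin's encoder writes this frame out as Parquet behind the scenes.
    return pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

@env.task
async def consume(df: pl.DataFrame) -> int:
    # The plugin's decoder reads the Parquet back into a polars.DataFrame.
    return int(df["a"].sum())
```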
flyteplugins_polars-2.0.0b51.dist-info/RECORD
@@ -0,0 +1,7 @@
flyteplugins/polars/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
flyteplugins/polars/df_transformer.py,sha256=UMCFIN-zUJQNV-4K4q-V8Xju5g2u3_oDkgufzQHAs9Y,9440
flyteplugins_polars-2.0.0b51.dist-info/METADATA,sha256=k1cWC0XvpHycDFrdYaCpaAVlrMboTbeKH0RxVbp6WoQ,833
flyteplugins_polars-2.0.0b51.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
flyteplugins_polars-2.0.0b51.dist-info/entry_points.txt,sha256=rPsduCtuVABnyAsWYbHTDacaXWyfihxs4SO-4bDV6HY,98
flyteplugins_polars-2.0.0b51.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
flyteplugins_polars-2.0.0b51.dist-info/RECORD,,
flyteplugins_polars-2.0.0b51.dist-info/top_level.txt
@@ -0,0 +1 @@
flyteplugins