flyteplugins_polars-2.0.0b51-py3-none-any.whl

flyteplugins/polars/df_transformer.py
@@ -0,0 +1,240 @@
+ import functools
+ import os
+ import typing
+ from pathlib import Path
+
+ import flyte.storage as storage
+ from flyte._logging import logger
+ from flyte._utils import lazy_module
+ from flyte.io._dataframe.dataframe import PARQUET, DataFrame
+ from flyte.io.extend import (
+     DataFrameDecoder,
+     DataFrameEncoder,
+     DataFrameTransformerEngine,
+ )
+ from flyteidl2.core import literals_pb2, types_pb2
+
+ if typing.TYPE_CHECKING:
+     import polars as pl
+ else:
+     pl = lazy_module("polars")
+
+
+ def get_polars_storage_options(protocol: typing.Optional[str], anonymous: bool = False) -> typing.Dict[str, str]:
+     """
+     Get storage options in a format compatible with Polars.
+
+     Polars requires storage_options to be a flat dict with string keys and values,
+     unlike fsspec which accepts nested dicts and complex objects.
+     """
+     from flyte._initialize import get_storage
+     from flyte.errors import InitializationError
+
+     if not protocol:
+         return {}
+
+     try:
+         storage_config = get_storage()
+     except InitializationError:
+         storage_config = None
+
+     match protocol:
+         case "s3":
+             from flyte.storage import S3
+
+             if storage_config and isinstance(storage_config, S3):
+                 s3_config = storage_config
+             else:
+                 s3_config = S3.auto()
+
+             opts: typing.Dict[str, str] = {}
+             if s3_config.access_key_id:
+                 opts["aws_access_key_id"] = s3_config.access_key_id
+             if s3_config.secret_access_key:
+                 opts["aws_secret_access_key"] = s3_config.secret_access_key
+             if s3_config.region:
+                 opts["aws_region"] = s3_config.region
+             if s3_config.endpoint:
+                 opts["aws_endpoint_url"] = s3_config.endpoint
+             if anonymous:
+                 opts["aws_skip_signature"] = "true"
+             return opts
+
+         case "gs":
+             # GCS typically uses application default credentials
+             # Polars supports this automatically
+             return {}
+
+         case "abfs" | "abfss":
+             from flyte.storage import ABFS
+
+             if storage_config and isinstance(storage_config, ABFS):
+                 abfs_config = storage_config
+             else:
+                 abfs_config = ABFS.auto()
+
+             opts = {}
+             if abfs_config.account_name:
+                 opts["azure_storage_account_name"] = abfs_config.account_name
+             if abfs_config.account_key:
+                 opts["azure_storage_account_key"] = abfs_config.account_key
+             if abfs_config.tenant_id:
+                 opts["azure_storage_tenant_id"] = abfs_config.tenant_id
+             if abfs_config.client_id:
+                 opts["azure_storage_client_id"] = abfs_config.client_id
+             if abfs_config.client_secret:
+                 opts["azure_storage_client_secret"] = abfs_config.client_secret
+             return opts
+
+         case _:
+             return {}
+
+
+ class PolarsToParquetEncodingHandler(DataFrameEncoder):
+     def __init__(self):
+         super().__init__(pl.DataFrame, None, PARQUET)
+
+     async def encode(
+         self,
+         dataframe: DataFrame,
+         structured_dataset_type: types_pb2.StructuredDatasetType,
+     ) -> literals_pb2.StructuredDataset:
+         if not dataframe.uri:
+             from flyte._context import internal_ctx
+
+             ctx = internal_ctx()
+             uri = str(ctx.raw_data.get_random_remote_path())
+         else:
+             uri = typing.cast(str, dataframe.uri)
+
+         if not storage.is_remote(uri):
+             Path(uri).mkdir(parents=True, exist_ok=True)
+         path = os.path.join(uri, f"{0:05}.parquet")
+         df = typing.cast(pl.DataFrame, dataframe.val)
+
+         # Polars requires flat string key-value storage options
+         filesystem = storage.get_underlying_filesystem(path=path)
+         storage_options = get_polars_storage_options(protocol=filesystem.protocol)
+         df.write_parquet(path, storage_options=storage_options or None)
+
+         structured_dataset_type.format = PARQUET
+         return literals_pb2.StructuredDataset(
+             uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
+         )
+
+
+ class ParquetToPolarsDecodingHandler(DataFrameDecoder):
+     def __init__(self):
+         super().__init__(pl.DataFrame, None, PARQUET)
+
+     async def decode(
+         self,
+         flyte_value: literals_pb2.StructuredDataset,
+         current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+     ) -> "pl.DataFrame":
+         uri = flyte_value.uri
+         columns = None
+         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+
+         parquet_path = os.path.join(uri, f"{0:05}.parquet")
+         filesystem = storage.get_underlying_filesystem(path=parquet_path)
+         storage_options = get_polars_storage_options(protocol=filesystem.protocol)
+         try:
+             return pl.read_parquet(parquet_path, columns=columns, storage_options=storage_options or None)
+         except Exception as exc:
+             if exc.__class__.__name__ == "NoCredentialsError":
+                 logger.debug("S3 source detected, attempting anonymous S3 access")
+                 storage_options = get_polars_storage_options(protocol=filesystem.protocol, anonymous=True)
+                 return pl.read_parquet(parquet_path, columns=columns, storage_options=storage_options or None)
+             else:
+                 raise
+
+
+ class PolarsLazyFrameToParquetEncodingHandler(DataFrameEncoder):
+     def __init__(self):
+         super().__init__(pl.LazyFrame, None, PARQUET)
+
+     async def encode(
+         self,
+         dataframe: DataFrame,
+         structured_dataset_type: types_pb2.StructuredDatasetType,
+     ) -> literals_pb2.StructuredDataset:
+         if not dataframe.uri:
+             from flyte._context import internal_ctx
+
+             ctx = internal_ctx()
+             uri = str(ctx.raw_data.get_random_remote_path())
+         else:
+             uri = typing.cast(str, dataframe.uri)
+
+         if not storage.is_remote(uri):
+             Path(uri).mkdir(parents=True, exist_ok=True)
+         path = f"{os.path.join(uri, f'{0:05}')}.parquet"
+         lazy_df = typing.cast(pl.LazyFrame, dataframe.val)
+
+         # Use sink_parquet for efficient lazy writing
+         filesystem = storage.get_underlying_filesystem(path=uri)
+         path = path + filesystem.sep
+         storage_options = get_polars_storage_options(protocol=filesystem.protocol)
+
+         # TODO: support partitioning, which will entail user-defined behavior
+         lazy_df.sink_parquet(path, storage_options=storage_options or None)
+
+         structured_dataset_type.format = PARQUET
+         return literals_pb2.StructuredDataset(
+             uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
+         )
+
+
+ class ParquetToPolarsLazyFrameDecodingHandler(DataFrameDecoder):
+     def __init__(self):
+         super().__init__(pl.LazyFrame, None, PARQUET)
+
+     async def decode(
+         self,
+         flyte_value: literals_pb2.StructuredDataset,
+         current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+     ) -> "pl.LazyFrame":
+         uri = flyte_value.uri
+         columns = None
+         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+
+         parquet_path = os.path.join(uri, f"{0:05}.parquet")
+
+         filesystem = storage.get_underlying_filesystem(path=parquet_path)
+         storage_options = get_polars_storage_options(protocol=filesystem.protocol)
+         try:
+             # TODO: support partitioning, which will entail user-defined behavior
+             lf = pl.scan_parquet(parquet_path, storage_options=storage_options or None)
+             if columns:
+                 lf = lf.select(*columns)
+             return lf
+         except Exception as exc:
+             if exc.__class__.__name__ == "NoCredentialsError":
+                 logger.debug("S3 source detected, attempting anonymous S3 access")
+                 storage_options = get_polars_storage_options(protocol=filesystem.protocol, anonymous=True)
+                 lf = pl.scan_parquet(parquet_path, storage_options=storage_options or None)
+                 if columns:
+                     lf = lf.select(*columns)
+                 return lf
+             else:
+                 raise
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_polars_df_transformers():
+     """Register Polars DataFrame encoders and decoders with the DataFrameTransformerEngine.
+
+     This function is called automatically via the flyte.plugins.types entry point
+     when flyte.init() is called with load_plugin_type_transformers=True (the default).
+     """
+     DataFrameTransformerEngine.register(PolarsToParquetEncodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(ParquetToPolarsDecodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(PolarsLazyFrameToParquetEncodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(ParquetToPolarsLazyFrameDecodingHandler(), default_format_for_type=True)
+
+
+ # Also register at module import time for backwards compatibility
+ register_polars_df_transformers()

flyteplugins_polars-2.0.0b51.dist-info/METADATA
@@ -0,0 +1,25 @@
+ Metadata-Version: 2.4
+ Name: flyteplugins-polars
+ Version: 2.0.0b51
+ Summary: polars plugin for flyte
+ Author-email: Flyte Contributors <admin@flyte.org>
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: polars
+ Requires-Dist: flyte
+
+ # Polars Plugin
+
+ This plugin provides native support for **Polars DataFrames and LazyFrames** in Flyte, enabling tasks to exchange Polars' high-performance frames directly.
+
+ The plugin supports:
+ - `polars.DataFrame` - eagerly evaluated DataFrames
+ - `polars.LazyFrame` - lazily evaluated DataFrames whose query plans are optimized before execution
+
+ Both types can be serialized to and deserialized from the Parquet format, making them well suited to large-scale data processing workflows.
+
+ To install the plugin, run the following command:
+
+ ```bash
+ pip install --pre flyteplugins-polars
+ ```
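
The README stops at installation. Once installed, the encoders and decoders register automatically through the entry point, so task signatures can use Polars types directly. Below is a minimal usage sketch; the `flyte.TaskEnvironment`/`@env.task` pattern is assumed from the Flyte 2.x SDK and the environment name is illustrative, none of it shipped in this package:

```python
import flyte
import polars as pl

env = flyte.TaskEnvironment(name="polars-demo")  # name is illustrative


@env.task
async def make_frame() -> pl.DataFrame:
    # The plugin's encoder writes the returned frame to Parquet.
    return pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})


@env.task
async def total(df: pl.DataFrame) -> int:
    # The plugin's decoder materializes the Parquet data back into a polars.DataFrame.
    return int(df["a"].sum())
```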

flyteplugins_polars-2.0.0b51.dist-info/RECORD
@@ -0,0 +1,7 @@
+ flyteplugins/polars/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ flyteplugins/polars/df_transformer.py,sha256=UMCFIN-zUJQNV-4K4q-V8Xju5g2u3_oDkgufzQHAs9Y,9440
+ flyteplugins_polars-2.0.0b51.dist-info/METADATA,sha256=k1cWC0XvpHycDFrdYaCpaAVlrMboTbeKH0RxVbp6WoQ,833
+ flyteplugins_polars-2.0.0b51.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ flyteplugins_polars-2.0.0b51.dist-info/entry_points.txt,sha256=rPsduCtuVABnyAsWYbHTDacaXWyfihxs4SO-4bDV6HY,98
+ flyteplugins_polars-2.0.0b51.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
+ flyteplugins_polars-2.0.0b51.dist-info/RECORD,,

flyteplugins_polars-2.0.0b51.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+

flyteplugins_polars-2.0.0b51.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
+ [flyte.plugins.types]
+ polars = flyteplugins.polars.df_transformer:register_polars_df_transformers

flyteplugins_polars-2.0.0b51.dist-info/top_level.txt
@@ -0,0 +1 @@
+ flyteplugins