flyte 0.2.0b33__py3-none-any.whl → 0.2.0b34__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
Potentially problematic release: this version of flyte might be problematic.
- flyte/_code_bundle/_utils.py +2 -2
- flyte/_image.py +0 -2
- flyte/_internal/imagebuild/remote_builder.py +3 -2
- flyte/_task.py +30 -8
- flyte/_version.py +2 -2
- flyte/cli/_common.py +25 -0
- flyte/cli/_create.py +11 -0
- flyte/cli/main.py +11 -0
- flyte/errors.py +9 -0
- flyte/io/__init__.py +12 -12
- flyte/io/{_structured_dataset → _dataframe}/__init__.py +30 -30
- flyte/io/{_structured_dataset → _dataframe}/basic_dfs.py +25 -26
- flyte/io/{_structured_dataset/structured_dataset.py → _dataframe/dataframe.py} +131 -132
- flyte/types/_type_engine.py +2 -2
- {flyte-0.2.0b33.dist-info → flyte-0.2.0b34.dist-info}/METADATA +1 -1
- {flyte-0.2.0b33.dist-info → flyte-0.2.0b34.dist-info}/RECORD +20 -20
- {flyte-0.2.0b33.data → flyte-0.2.0b34.data}/scripts/runtime.py +0 -0
- {flyte-0.2.0b33.dist-info → flyte-0.2.0b34.dist-info}/WHEEL +0 -0
- {flyte-0.2.0b33.dist-info → flyte-0.2.0b34.dist-info}/entry_points.txt +0 -0
- {flyte-0.2.0b33.dist-info → flyte-0.2.0b34.dist-info}/top_level.txt +0 -0
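The headline change in this release is a rename of the structured-dataset IO API: the flyte/io/_structured_dataset package becomes flyte/io/_dataframe, and the user-facing StructuredDataset class becomes DataFrame. A minimal migration sketch for user code follows; it assumes DataFrame is re-exported from flyte.io (the flyte/io/__init__.py hunk is not shown in full here, so treat the exact import path as an assumption):

    # Hypothetical before/after sketch for user code affected by the rename.
    # Assumes DataFrame is re-exported from flyte.io.

    # b33 and earlier (old names):
    # from flyte.io import StructuredDataset
    # sd = StructuredDataset(uri="s3://bucket/data.parquet", file_format="parquet")

    # b34 (new names):
    from flyte.io import DataFrame

    df = DataFrame(uri="s3://bucket/data.parquet", file_format="parquet")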
flyte/io/{_structured_dataset/structured_dataset.py → _dataframe/dataframe.py}

@@ -35,23 +35,23 @@ else:
     pd = lazy_module("pandas")
     pa = lazy_module("pyarrow")
 
-T = typing.TypeVar("T") # StructuredDataset type or a dataframe type
+T = typing.TypeVar("T") # DataFrame type or a dataframe type
 DF = typing.TypeVar("DF") # Dataframe type
 
-# For specifying the storage formats of StructuredDatasets. It's just a string, nothing fancy.
-StructuredDatasetFormat: TypeAlias = str
+# For specifying the storage formats of DataFrames. It's just a string, nothing fancy.
+DataFrameFormat: TypeAlias = str
 
 # Storage formats
-PARQUET: StructuredDatasetFormat = "parquet"
-CSV: StructuredDatasetFormat = "csv"
-GENERIC_FORMAT: StructuredDatasetFormat = ""
+PARQUET: DataFrameFormat = "parquet"
+CSV: DataFrameFormat = "csv"
+GENERIC_FORMAT: DataFrameFormat = ""
 GENERIC_PROTOCOL: str = "generic protocol"
 
 
 @dataclass
-class StructuredDataset(SerializableType, DataClassJSONMixin):
+class DataFrame(SerializableType, DataClassJSONMixin):
     """
-    This is the user facing StructuredDataset class. Please don't confuse it with the literals.StructuredDataset
+    This is the user facing DataFrame class. Please don't confuse it with the literals.StructuredDataset
     class (that is just a model, a Python class representation of the protobuf).
     """
 
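The first hunk also shows that DataFrameFormat is a plain str alias, so the PARQUET/CSV constants and raw strings are interchangeable. A short sketch, under the same import-path assumption as above:

    from flyte.io import DataFrame, PARQUET  # assumed re-exports

    # file_format is just a string; per this hunk, PARQUET == "parquet"
    df = DataFrame(uri="s3://bucket/data.parquet", file_format=PARQUET)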
@@ -62,9 +62,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
     def _serialize(self) -> Dict[str, Optional[str]]:
         # dataclass case
         lt = TypeEngine.to_literal_type(type(self))
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         lv = loop_manager.run_sync(engine.to_literal, self, type(self), lt)
-        sd = StructuredDataset(uri=lv.scalar.structured_dataset.uri)
+        sd = DataFrame(uri=lv.scalar.structured_dataset.uri)
         sd.file_format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
         return {
             "uri": sd.uri,
@@ -72,14 +72,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         }
 
     @classmethod
-    def _deserialize(cls, value) -> "StructuredDataset":
+    def _deserialize(cls, value) -> "DataFrame":
         uri = value.get("uri", None)
         file_format = value.get("file_format", None)
 
         if uri is None:
-            raise ValueError("StructuredDataset's uri and file format should not be None")
+            raise ValueError("DataFrame's uri and file format should not be None")
 
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         return loop_manager.run_sync(
             engine.to_python_value,
             literals_pb2.Literal(
@@ -96,9 +96,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         )
 
     @model_serializer
-    def serialize_structured_dataset(self) -> Dict[str, Optional[str]]:
+    def serialize_dataframe(self) -> Dict[str, Optional[str]]:
         lt = TypeEngine.to_literal_type(type(self))
-        sde = StructuredDatasetTransformerEngine()
+        sde = DataFrameTransformerEngine()
         lv = loop_manager.run_sync(sde.to_literal, self, type(self), lt)
         return {
             "uri": lv.scalar.structured_dataset.uri,
@@ -106,11 +106,11 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         }
 
     @model_validator(mode="after")
-    def deserialize_structured_dataset(self, info) -> StructuredDataset:
+    def deserialize_dataframe(self, info) -> DataFrame:
         if info.context is None or info.context.get("deserialize") is not True:
             return self
 
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         return loop_manager.run_sync(
             engine.to_python_value,
             literals_pb2.Literal(
@@ -136,12 +136,12 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
 
     def __init__(
         self,
-        dataframe: typing.Optional[typing.Any] = None,
+        val: typing.Optional[typing.Any] = None,
         uri: typing.Optional[str] = None,
         metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = None,
         **kwargs,
     ):
-        self._dataframe = dataframe
+        self._val = val
         # Make these fields public, so that the dataclass transformer can set a value for it
         # https://github.com/flyteorg/flytekit/blob/bcc8541bd6227b532f8462563fe8aac902242b21/flytekit/core/type_engine.py#L298
         self.uri = uri
@@ -156,8 +156,8 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         self._already_uploaded = False
 
     @property
-    def dataframe(self) -> Optional[DF]:
-        return self._dataframe
+    def val(self) -> Optional[DF]:
+        return self._val
 
     @property
     def metadata(self) -> Optional[literals_pb2.StructuredDatasetMetadata]:
@@ -168,18 +168,18 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         return self._literal_sd
 
     def open(self, dataframe_type: Type[DF]):
-        from flyte.io._structured_dataset import lazy_import_structured_dataset_handler
-
         """
         Load the handler if needed. For the use case like:
         @task
-        def t1(sd: StructuredDataset):
+        def t1(df: DataFrame):
             import pandas as pd
-            sd.open(pd.DataFrame).all()
+            df.open(pd.DataFrame).all()
 
-        pandas is imported inside the task, so
+        pandas is imported inside the task, so panda handler won't be loaded during deserialization in type engine.
         """
-        lazy_import_structured_dataset_handler()
+        from flyte.io._dataframe import lazy_import_dataframe_handler
+
+        lazy_import_dataframe_handler()
         self._dataframe_type = dataframe_type
         return self
 
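The open() docstring above describes the lazy-handler pattern: pandas is imported inside the task body and the handler is loaded on first use rather than during deserialization in the type engine. Expanded into a full, hypothetical task, assuming a flyte.task decorator and the flyte.io re-export; whether all() must be awaited in an async task body is not visible in this hunk:

    import flyte  # assumed top-level package exposing the task decorator
    from flyte.io import DataFrame  # assumed re-export

    @flyte.task
    def row_count(df: DataFrame) -> int:
        # pandas imported inside the task, as in the docstring, so the
        # pandas handler is loaded lazily by open()
        import pandas as pd

        pdf = df.open(pd.DataFrame).all()
        return len(pdf)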
@@ -187,22 +187,22 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         if self._dataframe_type is None:
             raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")
 
-        if self.uri is not None and self.dataframe is None:
-            expected = TypeEngine.to_literal_type(StructuredDataset)
+        if self.uri is not None and self.val is None:
+            expected = TypeEngine.to_literal_type(DataFrame)
             await self._set_literal(expected)
 
         return await flyte_dataset_transformer.open_as(self.literal, self._dataframe_type, self.metadata)
 
     async def _set_literal(self, expected: types_pb2.LiteralType) -> None:
         """
-        Explicitly set the StructuredDataset Literal to handle the following cases:
+        Explicitly set the DataFrame Literal to handle the following cases:
 
-        1. Read a dataframe from a StructuredDataset with an uri, for example:
+        1. Read the content from a DataFrame with an uri, for example:
 
         @task
-        def return_sd() -> StructuredDataset:
-            sd = StructuredDataset(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
-            df = sd.open(pd.DataFrame).all()
+        def return_df() -> DataFrame:
+            df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
+            df = df.open(pd.DataFrame).all()
             return df
 
         For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5954.
@@ -212,14 +212,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
 
         For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
         """
-        to_literal = await flyte_dataset_transformer.to_literal(self, StructuredDataset, expected)
+        to_literal = await flyte_dataset_transformer.to_literal(self, DataFrame, expected)
         self._literal_sd = to_literal.scalar.structured_dataset
         if self.metadata is None:
             self._metadata = self._literal_sd.metadata
 
     async def set_literal(self, expected: types_pb2.LiteralType) -> None:
         """
-        A public wrapper method to set the StructuredDataset Literal.
+        A public wrapper method to set the DataFrame Literal.
 
         This method provides external access to the internal _set_literal method.
         """
@@ -256,7 +256,7 @@ def extract_cols_and_format(
     Helper function, just used to iterate through Annotations and extract out the following information:
     - base type, if not Annotated, it will just be the type that was passed in.
     - column information, as a collections.OrderedDict,
-    - the storage format, as a ``StructuredDatasetFormat`` (str),
+    - the storage format, as a ``DataFrameFormat`` (str),
     - pa.lib.Schema
 
     If more than one of any type of thing is found, an error will be raised.
@@ -286,7 +286,7 @@ def extract_cols_and_format(
             d = collections.OrderedDict()
             d.update(aa)
             ordered_dict_cols = d
-        elif isinstance(aa, StructuredDatasetFormat):
+        elif isinstance(aa, DataFrameFormat):
             if fmt != "":
                 raise ValueError(f"A format was already specified {fmt}, cannot use {aa}")
             fmt = aa
@@ -305,7 +305,7 @@ def extract_cols_and_format(
     return t, ordered_dict_cols, fmt, pa_schema
 
 
-class StructuredDatasetEncoder(ABC, Generic[T]):
+class DataFrameEncoder(ABC, Generic[T]):
     def __init__(
         self,
         python_type: Type[T],
@@ -314,10 +314,10 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
     ):
         """
         Extend this abstract class, implement the encode function, and register your concrete class with the
-        StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+        DataFrameTransformerEngine class in order for the core flytekit type engine to handle
         dataframe libraries. This is the encoding interface, meaning it is used when there is a Python value that the
         flytekit type engine is trying to convert into a Flyte Literal. For the other way, see
-        the StructuredDatasetEncoder
+        the DataFrameEncoder
 
         :param python_type: The dataframe class in question that you want to register this encoder with
         :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -347,7 +347,7 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
     @abstractmethod
     async def encode(
         self,
-        structured_dataset: StructuredDataset,
+        dataframe: DataFrame,
         structured_dataset_type: types_pb2.StructuredDatasetType,
     ) -> literals_pb2.StructuredDataset:
         """
@@ -357,20 +357,20 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
         the
         # TODO: Do we need to add a flag to indicate if it was wrapped by the transformer or by the user?
 
-        :param structured_dataset: This is a StructuredDataset wrapper object. See more info above.
-        :param structured_dataset_type: This the StructuredDatasetType, as found in the LiteralType of the interface
+        :param dataframe: This is a DataFrame wrapper object. See more info above.
+        :param structured_dataset_type: This the DataFrameType, as found in the LiteralType of the interface
         of the task that invoked this encoding call. It is passed along to encoders so that authors of encoders
-        can include it in the returned literals.StructuredDataset. See the IDL for more information on why this
+        can include it in the returned literals.DataFrame. See the IDL for more information on why this
         literal in particular carries the type information along with it. If the encoder doesn't supply it, it will
         also be filled in after the encoder runs by the transformer engine.
-        :return: This function should return a StructuredDataset literal object. Do not confuse this with the
-        StructuredDataset wrapper class used as input to this function - that is the user facing Python class.
-        This function needs to return the IDL StructuredDataset.
+        :return: This function should return a DataFrame literal object. Do not confuse this with the
+        DataFrame wrapper class used as input to this function - that is the user facing Python class.
+        This function needs to return the IDL DataFrame.
         """
         raise NotImplementedError
 
 
-class StructuredDatasetDecoder(ABC, Generic[DF]):
+class DataFrameDecoder(ABC, Generic[DF]):
     def __init__(
         self,
         python_type: Type[DF],
@@ -380,9 +380,9 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
     ):
         """
         Extend this abstract class, implement the decode function, and register your concrete class with the
-        StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+        DataFrameTransformerEngine class in order for the core flytekit type engine to handle
         dataframe libraries. This is the decoder interface, meaning it is used when there is a Flyte Literal value,
-        and we have to get a Python value out of it. For the other way, see the StructuredDatasetEncoder
+        and we have to get a Python value out of it. For the other way, see the DataFrameEncoder
 
         :param python_type: The dataframe class in question that you want to register this decoder with
         :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -419,8 +419,8 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
         This is code that will be called by the dataset transformer engine to ultimately translate from a Flyte Literal
         value into a Python instance.
 
-        :param flyte_value: This will be a Flyte IDL StructuredDataset Literal - do not confuse this with the
-        StructuredDataset class defined also in this module.
+        :param flyte_value: This will be a Flyte IDL DataFrame Literal - do not confuse this with the
+        DataFrame class defined also in this module.
         :param current_task_metadata: Metadata object containing the type (and columns if any) for the currently
         executing task. This type may have more or less information than the type information bundled
         inside the incoming flyte_value.
@@ -459,19 +459,19 @@ def get_supported_types():
 class DuplicateHandlerError(ValueError): ...
 
 
-class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
+class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
     """
     Think of this transformer as a higher-level meta transformer that is used for all the dataframe types.
     If you are bringing a custom data frame type, or any data frame type, to flytekit, instead of
     registering with the main type engine, you should register with this transformer instead.
     """
 
-    ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetEncoder]]]] = {}
-    DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetDecoder]]]] = {}
+    ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameEncoder]]]] = {}
+    DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameDecoder]]]] = {}
     DEFAULT_PROTOCOLS: ClassVar[Dict[Type, str]] = {}
     DEFAULT_FORMATS: ClassVar[Dict[Type, str]] = {}
 
-    Handlers = Union[StructuredDatasetEncoder, StructuredDatasetDecoder]
+    Handlers = Union[DataFrameEncoder, DataFrameDecoder]
     Renderers: ClassVar[Dict[Type, Renderable]] = {}
 
     @classmethod
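The registries above are keyed as dataframe type → protocol → format → handler. A hedged sketch of what plugging in a custom dataframe type might look like, built from the renamed classes in this file; the import paths, the constructor keyword names (modeled on flytekit's encoder), and the storage details are illustrative assumptions, not APIs confirmed by this diff:

    from flyteidl.core import literals_pb2, types_pb2  # assumed proto import path

    # Names below are defined in the renamed module shown in this diff; the
    # import path and re-exports are assumptions.
    from flyte.io._dataframe.dataframe import (
        DataFrame,
        DataFrameEncoder,
        DataFrameTransformerEngine,
        PARQUET,
    )

    class MyDF:
        """Stand-in for a third-party dataframe type."""

    class MyDFEncoder(DataFrameEncoder):
        def __init__(self):
            # keyword names are assumptions modeled on flytekit's encoder
            super().__init__(python_type=MyDF, protocol="s3", supported_format=PARQUET)

        async def encode(
            self,
            dataframe: DataFrame,
            structured_dataset_type: types_pb2.StructuredDatasetType,
        ) -> literals_pb2.StructuredDataset:
            ...  # write dataframe.val out to dataframe.uri, return the IDL literal

    # register() and default_for_type appear in the hunks below
    DataFrameTransformerEngine.register(MyDFEncoder(), default_for_type=True)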
@@ -527,17 +527,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
 
     @classmethod
     def get_encoder(cls, df_type: Type, protocol: str, format: str):
-        return cls._finder(StructuredDatasetTransformerEngine.ENCODERS, df_type, protocol, format)
+        return cls._finder(DataFrameTransformerEngine.ENCODERS, df_type, protocol, format)
 
     @classmethod
-    def get_decoder(cls, df_type: Type, protocol: str, format: str) -> StructuredDatasetDecoder:
-        return cls._finder(StructuredDatasetTransformerEngine.DECODERS, df_type, protocol, format)
+    def get_decoder(cls, df_type: Type, protocol: str, format: str) -> DataFrameDecoder:
+        return cls._finder(DataFrameTransformerEngine.DECODERS, df_type, protocol, format)
 
     @classmethod
     def _handler_finder(cls, h: Handlers, protocol: str) -> Dict[str, Handlers]:
-        if isinstance(h, StructuredDatasetEncoder):
+        if isinstance(h, DataFrameEncoder):
             top_level = cls.ENCODERS
-        elif isinstance(h, StructuredDatasetDecoder):
+        elif isinstance(h, DataFrameDecoder):
             top_level = cls.DECODERS  # type: ignore
         else:
             raise TypeError(f"We don't support this type of handler {h}")
@@ -548,7 +548,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         return top_level[h.python_type][protocol]  # type: ignore
 
     def __init__(self):
-        super().__init__("StructuredDataset Transformer", StructuredDataset)
+        super().__init__("DataFrame Transformer", DataFrame)
         self._type_assertions_enabled = False
 
     @classmethod
@@ -568,7 +568,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         Call this with any Encoder or Decoder to register it with the flytekit type system. If your handler does not
         specify a protocol (e.g. s3, gs, etc.) field, then
 
-        :param h: The StructuredDatasetEncoder or StructuredDatasetDecoder you wish to register with this transformer.
+        :param h: The DataFrameEncoder or DataFrameDecoder you wish to register with this transformer.
         :param default_for_type: If set, when a user returns from a task an instance of the dataframe the handler
         handles, e.g. ``return pd.DataFrame(...)``, not wrapped around the ``StructuredDataset`` object, we will
         use this handler's protocol and format as the default, effectively saying that this handler will be called.
@@ -582,7 +582,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         :param default_storage_for_type: Same as above but only for the storage format. Error if already set,
         unless override is specified.
         """
-        if not (isinstance(h, StructuredDatasetEncoder) or isinstance(h, StructuredDatasetDecoder)):
+        if not (isinstance(h, DataFrameEncoder) or isinstance(h, DataFrameDecoder)):
             raise TypeError(f"We don't support this type of handler {h}")
 
         if h.protocol is None:
@@ -648,27 +648,27 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         # Register with the type engine as well
         # The semantics as of now are such that it doesn't matter which order these transformers are loaded in, as
         # long as the older Pandas/FlyteSchema transformer do not also specify the override
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         TypeEngine.register_additional_type(engine, h.python_type, override=True)
 
-    def assert_type(self, t: Type[StructuredDataset], v: typing.Any):
+    def assert_type(self, t: Type[DataFrame], v: typing.Any):
         return
 
     async def to_literal(
         self,
-        python_val: Union[StructuredDataset, typing.Any],
-        python_type: Union[Type[StructuredDataset], Type],
+        python_val: Union[DataFrame, typing.Any],
+        python_type: Union[Type[DataFrame], Type],
         expected: types_pb2.LiteralType,
     ) -> literals_pb2.Literal:
         # Make a copy in case we need to hand off to encoders, since we can't be sure of mutations.
         python_type, *attrs = extract_cols_and_format(python_type)
         sdt = types_pb2.StructuredDatasetType(format=self.DEFAULT_FORMATS.get(python_type, GENERIC_FORMAT))
 
-        if issubclass(python_type, StructuredDataset) and not isinstance(python_val, StructuredDataset):
+        if issubclass(python_type, DataFrame) and not isinstance(python_val, DataFrame):
             # Catch a common mistake
             raise TypeTransformerFailedError(
-                f"Expected a StructuredDataset instance, but got {type(python_val)} instead."
-                f" Did you forget to wrap your dataframe in a StructuredDataset instance?"
+                f"Expected a DataFrame instance, but got {type(python_val)} instead."
+                f" Did you forget to wrap your dataframe in a DataFrame instance?"
             )
 
         if expected and expected.structured_dataset_type:
@@ -679,35 +679,34 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                 external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
             )
 
-        # If the type signature has the StructuredDataset class, it will, or at least should, also be a
-        # StructuredDataset instance.
-        if isinstance(python_val, StructuredDataset):
+        # If the type signature has the DataFrame class, it will, or at least should, also be a
+        # DataFrame instance.
+        if isinstance(python_val, DataFrame):
             # There are three cases that we need to take care of here.
 
-            # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
-            # then return the original literals.StructuredDataset without invoking any encoder
+            # 1. A task returns a DataFrame that was just a passthrough input. If this happens
+            # then return the original literals.DataFrame without invoking any encoder
             #
             # Ex.
-            # def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
+            # def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
             #     return dataset
             if python_val._literal_sd is not None:
                 if python_val._already_uploaded:
                     return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
-                if python_val.dataframe is not None:
+                if python_val.val is not None:
                     raise ValueError(
-                        f"Shouldn't have specified both literal {python_val._literal_sd}"
-                        f" and dataframe {python_val.dataframe}"
+                        f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.val}"
                     )
                 return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
 
-            # 2. A task returns a python StructuredDataset with an uri.
-            # Note: this case is also what happens we start a local execution of a task with a python StructuredDataset.
-            # It gets converted into a literal first, then back into a python StructuredDataset.
+            # 2. A task returns a python DataFrame with an uri.
+            # Note: this case is also what happens we start a local execution of a task with a python DataFrame.
+            # It gets converted into a literal first, then back into a python DataFrame.
             #
             # Ex.
-            # def t2(uri: str) -> Annotated[StructuredDataset, my_cols]
-            #     return StructuredDataset(uri=uri)
-            if python_val.dataframe is None:
+            # def t2(uri: str) -> Annotated[DataFrame, my_cols]
+            #     return DataFrame(uri=uri)
+            if python_val.val is None:
                 uri = python_val.uri
                 file_format = python_val.file_format
 
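Taken together, the branches in this hunk mean a task can return a DataFrame in three shapes: a passthrough input (case 1), a uri-only reference (case 2), or a wrapper around an in-memory dataframe that still needs encoding (case 3). A sketch of case 3, under the same assumed flyte.task decorator and flyte.io re-export as earlier:

    import flyte  # assumed, as above
    import pandas as pd
    from flyte.io import DataFrame  # assumed re-export

    @flyte.task
    def make_df() -> DataFrame:
        pdf = pd.DataFrame({"a": [1, 2, 3]})
        # Case 3: to_literal() sees val is set, picks an encoder keyed on
        # type(val), protocol, and format, then invokes it.
        return DataFrame(val=pdf)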
@@ -718,19 +717,20 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                     uri = await storage.put(uri)
 
                 # Check the user-specified file_format
-                # When users specify file_format for a StructuredDataset, the file_format should be retained
+                # When users specify file_format for a DataFrame, the file_format should be retained
                 # conditionally. For details, please refer to https://github.com/flyteorg/flyte/issues/6096.
                 # Following illustrates why we can't always copy the user-specified file_format over:
                 #
                 # @task
-                # def modify_format(sd: Annotated[StructuredDataset, {}, "task-format"]) -> StructuredDataset:
-                #     return sd
+                # def modify_format(df: Annotated[DataFrame, {}, "task-format"]) -> DataFrame:
+                #     return df
                 #
-                # sd = StructuredDataset(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
-                # sd2 = modify_format(sd=sd)
+                # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
+                # df2 = modify_format(df=df)
                 #
-                # In this case, we expect the sd2.file_format to be task-format (as shown in Annotated),
-                # If we directly copy the user-specified file_format over, the type hint information will be missing.
+                # In this case, we expect the df2.file_format to be task-format (as shown in Annotated),
+                # not user-format. If we directly copy the user-specified file_format over,
+                # the type hint information will be missing.
                 if sdt.format == GENERIC_FORMAT and file_format != GENERIC_FORMAT:
                     sdt.format = file_format
 
@@ -740,9 +740,9 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                 )
                 return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))
 
-            # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
+            # 3. This is the third and probably most common case. The python DataFrame object wraps a dataframe
             # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
-            df_type = type(python_val.dataframe)
+            df_type = type(python_val.val)
             protocol = self._protocol_from_type_or_prefix(df_type, python_val.uri)
 
             return await self.encode(
@@ -760,7 +760,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             structured_dataset_type=expected.structured_dataset_type if expected else None
         )
 
-        sd = StructuredDataset(dataframe=python_val, metadata=meta)
+        sd = DataFrame(val=python_val, metadata=meta)
         return await self.encode(sd, python_type, protocol, fmt, sdt)
 
     def _protocol_from_type_or_prefix(self, df_type: Type, uri: Optional[str] = None) -> str:
@@ -782,13 +782,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
 
     async def encode(
         self,
-        sd: StructuredDataset,
+        sd: DataFrame,
         df_type: Type,
         protocol: str,
         format: str,
         structured_literal_type: types_pb2.StructuredDatasetType,
     ) -> literals_pb2.Literal:
-        handler: StructuredDatasetEncoder
+        handler: DataFrameEncoder
         handler = self.get_encoder(df_type, protocol, format)
 
         sd_model = await handler.encode(sd, structured_literal_type)
@@ -813,17 +813,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             sd._already_uploaded = True
         return lit
 
-    # pr: han-ru: can this be removed if we make StructuredDataset a pydantic model?
-    def dict_to_structured_dataset(
-        self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
+    # pr: han-ru: can this be removed if we make DataFrame a pydantic model?
+    def dict_to_dataframe(
+        self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | DataFrame
+    ) -> T | DataFrame:
         uri = dict_obj.get("uri", None)
         file_format = dict_obj.get("file_format", None)
 
         if uri is None:
-            raise ValueError("StructuredDataset's uri and file format should not be None")
+            raise ValueError("DataFrame's uri and file format should not be None")
 
-        # Instead of using python native StructuredDataset, we need to build a literals.StructuredDataset
+        # Instead of using python native DataFrame, we need to build a literals.StructuredDataset
         # The reason is that _literal_sd of python sd is accessed when task output LiteralMap is
         # converted back to flyteidl. Hence, _literal_sd must have to_flyte_idl method
         # See https://github.com/flyteorg/flytekit/blob/f938661ff8413219d1bea77f6914a58c302d5c6c/flytekit/bin/entrypoint.py#L326
@@ -833,15 +833,15 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         sd_literal = literals_pb2.StructuredDataset(uri=uri, metadata=metad)
 
         return asyncio.run(
-            StructuredDatasetTransformerEngine().to_python_value(
+            DataFrameTransformerEngine().to_python_value(
                 literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_literal)),
                 expected_python_type,
             )
         )
 
     def from_binary_idl(
-        self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
+        self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | DataFrame
+    ) -> T | DataFrame:
         """
         If the input is from flytekit, the Life Cycle will be as follows:
@@ -869,13 +869,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         """
         if binary_idl_object.tag == MESSAGEPACK:
             python_val = msgpack.loads(binary_idl_object.value)
-            return self.dict_to_structured_dataset(dict_obj=python_val, expected_python_type=expected_python_type)
+            return self.dict_to_dataframe(dict_obj=python_val, expected_python_type=expected_python_type)
         else:
             raise TypeTransformerFailedError(f"Unsupported binary format: `{binary_idl_object.tag}`")
 
     async def to_python_value(
-        self, lv: literals_pb2.Literal, expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
+        self, lv: literals_pb2.Literal, expected_python_type: Type[T] | DataFrame
+    ) -> T | DataFrame:
         """
         The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
         the start of a task execution, is the column subsetting behavior. For example, if you have,
@@ -913,7 +913,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         # Detect annotations and extract out all the relevant information that the user might supply
         expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)
 
-        # Start handling for StructuredDataset scalars, first look at the columns
+        # Start handling for DataFrame scalars, first look at the columns
         incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns
 
         # If the incoming literal, also doesn't have columns, then we just have an empty list, so initialize here
@@ -935,10 +935,10 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         )
         metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=new_sdt)
 
-        # A StructuredDataset type, for example
-        # t1(input_a: StructuredDataset) # or
-        # t1(input_a: Annotated[StructuredDataset, my_cols])
-        if issubclass(expected_python_type, StructuredDataset):
+        # A DataFrame type, for example
+        # t1(input_a: DataFrame) # or
+        # t1(input_a: Annotated[DataFrame, my_cols])
+        if issubclass(expected_python_type, DataFrame):
             sd = expected_python_type(
                 dataframe=None,
                 # Note here that the type being passed in
@@ -953,12 +953,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         return await self.open_as(lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)
 
     def to_html(self, python_val: typing.Any, expected_python_type: Type[T]) -> str:
-        if isinstance(python_val, StructuredDataset):
-            if python_val.dataframe is not None:
-                df = python_val.dataframe
+        if isinstance(python_val, DataFrame):
+            if python_val.val is not None:
+                df = python_val.val
             else:
                 # Here we only render column information by default instead of opening the structured dataset.
-                col = typing.cast(StructuredDataset, python_val).columns()
+                col = typing.cast(DataFrame, python_val).columns()
                 dataframe = pd.DataFrame(col, ["column type"])
                 return dataframe.to_html()  # type: ignore
         else:
@@ -1004,11 +1004,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
     def _get_dataset_column_literal_type(self, t: Type) -> types_pb2.LiteralType:
         if t in get_supported_types():
            return get_supported_types()[t]
-        if hasattr(t, "__origin__") and t.__origin__ == list:
+        origin = getattr(t, "__origin__", None)
+        if origin is list:
             return types_pb2.LiteralType(collection_type=self._get_dataset_column_literal_type(t.__args__[0]))
-        if hasattr(t, "__origin__") and t.__origin__ == dict:
+        if origin is dict:
             return types_pb2.LiteralType(map_value_type=self._get_dataset_column_literal_type(t.__args__[1]))
-        raise AssertionError(f"type {t} is currently not supported by StructuredDataset")
+        raise AssertionError(f"type {t} is currently not supported by DataFrame")
 
     def _convert_ordered_dict_of_columns_to_list(
         self, column_map: typing.Optional[typing.OrderedDict[str, Type]]
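Beyond the rename, this hunk is a small refactor: two hasattr/__origin__ checks collapse into a single getattr with a None default. The identity comparisons work because subscripted generics expose the built-in container class as their runtime origin:

    import typing

    assert typing.List[int].__origin__ is list
    assert typing.Dict[str, int].__origin__ is dict
    assert getattr(int, "__origin__", None) is None  # plain classes have no __origin__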
@@ -1022,9 +1023,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             converted_cols.append(types_pb2.StructuredDatasetType.DatasetColumn(name=k, literal_type=lt))
         return converted_cols
 
-    def _get_dataset_type(
-        self, t: typing.Union[Type[StructuredDataset], typing.Any]
-    ) -> types_pb2.StructuredDatasetType:
+    def _get_dataset_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.StructuredDatasetType:
         original_python_type, column_map, storage_format, pa_schema = extract_cols_and_format(t)  # type: ignore
 
         # Get the column information
@@ -1039,7 +1038,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             external_schema_bytes=typing.cast(pa.lib.Schema, pa_schema).to_string().encode() if pa_schema else None,
         )
 
-    def get_literal_type(self, t: typing.Union[Type[StructuredDataset], typing.Any]) -> types_pb2.LiteralType:
+    def get_literal_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.LiteralType:
         """
         Provide a concrete implementation so that writers of custom dataframe handlers since there's nothing that
         special about the literal type. Any dataframe type will always be associated with the structured dataset type.
@@ -1049,13 +1048,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         """
         return types_pb2.LiteralType(structured_dataset_type=self._get_dataset_type(t))
 
-    def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[StructuredDataset]:
+    def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[DataFrame]:
         # todo: technically we should return the dataframe type specified in the constructor, but to do that,
         # we'd have to store that, which we don't do today. See possibly #1363
-        if literal_type.HasField("structured_dataset_type"):
-            return StructuredDataset
-        raise ValueError(f"StructuredDatasetTransformerEngine cannot reverse {literal_type}")
+        if literal_type.HasField("dataframe_type"):
+            return DataFrame
+        raise ValueError(f"DataFrameTransformerEngine cannot reverse {literal_type}")
 
 
-flyte_dataset_transformer = StructuredDatasetTransformerEngine()
+flyte_dataset_transformer = DataFrameTransformerEngine()
 TypeEngine.register(flyte_dataset_transformer)