flyte 0.2.0b32__py3-none-any.whl → 0.2.0b34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

This version of flyte might be problematic.

@@ -35,23 +35,23 @@ else:
  pd = lazy_module("pandas")
  pa = lazy_module("pyarrow")

- T = typing.TypeVar("T") # StructuredDataset type or a dataframe type
+ T = typing.TypeVar("T") # DataFrame type or a dataframe type
  DF = typing.TypeVar("DF") # Dataframe type

- # For specifying the storage formats of StructuredDatasets. It's just a string, nothing fancy.
- StructuredDatasetFormat: TypeAlias = str
+ # For specifying the storage formats of DataFrames. It's just a string, nothing fancy.
+ DataFrameFormat: TypeAlias = str

  # Storage formats
- PARQUET: StructuredDatasetFormat = "parquet"
- CSV: StructuredDatasetFormat = "csv"
- GENERIC_FORMAT: StructuredDatasetFormat = ""
+ PARQUET: DataFrameFormat = "parquet"
+ CSV: DataFrameFormat = "csv"
+ GENERIC_FORMAT: DataFrameFormat = ""
  GENERIC_PROTOCOL: str = "generic protocol"


  @dataclass
- class StructuredDataset(SerializableType, DataClassJSONMixin):
+ class DataFrame(SerializableType, DataClassJSONMixin):
  """
- This is the user facing StructuredDataset class. Please don't confuse it with the literals.StructuredDataset
+ This is the user facing DataFrame class. Please don't confuse it with the literals.StructuredDataset
  class (that is just a model, a Python class representation of the protobuf).
  """

@@ -62,9 +62,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  def _serialize(self) -> Dict[str, Optional[str]]:
  # dataclass case
  lt = TypeEngine.to_literal_type(type(self))
- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  lv = loop_manager.run_sync(engine.to_literal, self, type(self), lt)
- sd = StructuredDataset(uri=lv.scalar.structured_dataset.uri)
+ sd = DataFrame(uri=lv.scalar.structured_dataset.uri)
  sd.file_format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
  return {
  "uri": sd.uri,
@@ -72,14 +72,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  }

  @classmethod
- def _deserialize(cls, value) -> "StructuredDataset":
+ def _deserialize(cls, value) -> "DataFrame":
  uri = value.get("uri", None)
  file_format = value.get("file_format", None)

  if uri is None:
- raise ValueError("StructuredDataset's uri and file format should not be None")
+ raise ValueError("DataFrame's uri and file format should not be None")

- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  return loop_manager.run_sync(
  engine.to_python_value,
  literals_pb2.Literal(
@@ -96,9 +96,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  )

  @model_serializer
- def serialize_structured_dataset(self) -> Dict[str, Optional[str]]:
+ def serialize_dataframe(self) -> Dict[str, Optional[str]]:
  lt = TypeEngine.to_literal_type(type(self))
- sde = StructuredDatasetTransformerEngine()
+ sde = DataFrameTransformerEngine()
  lv = loop_manager.run_sync(sde.to_literal, self, type(self), lt)
  return {
  "uri": lv.scalar.structured_dataset.uri,
@@ -106,11 +106,11 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  }

  @model_validator(mode="after")
- def deserialize_structured_dataset(self, info) -> StructuredDataset:
+ def deserialize_dataframe(self, info) -> DataFrame:
  if info.context is None or info.context.get("deserialize") is not True:
  return self

- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  return loop_manager.run_sync(
  engine.to_python_value,
  literals_pb2.Literal(
@@ -136,12 +136,12 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):

  def __init__(
  self,
- dataframe: typing.Optional[typing.Any] = None,
+ val: typing.Optional[typing.Any] = None,
  uri: typing.Optional[str] = None,
  metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = None,
  **kwargs,
  ):
- self._dataframe = dataframe
+ self._val = val
  # Make these fields public, so that the dataclass transformer can set a value for it
  # https://github.com/flyteorg/flytekit/blob/bcc8541bd6227b532f8462563fe8aac902242b21/flytekit/core/type_engine.py#L298
  self.uri = uri
@@ -156,8 +156,8 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  self._already_uploaded = False

  @property
- def dataframe(self) -> Optional[DF]:
- return self._dataframe
+ def val(self) -> Optional[DF]:
+ return self._val

  @property
  def metadata(self) -> Optional[literals_pb2.StructuredDatasetMetadata]:
@@ -168,18 +168,18 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  return self._literal_sd

  def open(self, dataframe_type: Type[DF]):
- from flyte.io._structured_dataset import lazy_import_structured_dataset_handler
-
  """
  Load the handler if needed. For the use case like:
  @task
- def t1(sd: StructuredDataset):
+ def t1(df: DataFrame):
  import pandas as pd
- sd.open(pd.DataFrame).all()
+ df.open(pd.DataFrame).all()

- pandas is imported inside the task, so pandas handler won't be loaded during deserialization in type engine.
+ pandas is imported inside the task, so panda handler won't be loaded during deserialization in type engine.
  """
- lazy_import_structured_dataset_handler()
+ from flyte.io._dataframe import lazy_import_dataframe_handler
+
+ lazy_import_dataframe_handler()
  self._dataframe_type = dataframe_type
  return self

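Taken together, the renames above change both how a value is wrapped (the constructor keyword is now val, not dataframe) and how it is read back. A hedged usage sketch; the task decorator is a stand-in, since the SDK's real decorator is not shown in this diff:

    import pandas as pd

    from flyte.io._dataframe import DataFrame  # assumed import path

    def task(fn):  # stand-in for the SDK's task decorator, not shown on this page
        return fn

    @task
    def t1(df: DataFrame) -> DataFrame:
        # Reading: open() only records the target type; .all() performs the decode.
        pandas_df = df.open(pd.DataFrame).all()
        # Wrapping: pass the dataframe as `val`; the `.val` property returns it.
        return DataFrame(val=pandas_df)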
@@ -187,22 +187,22 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  if self._dataframe_type is None:
  raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")

- if self.uri is not None and self.dataframe is None:
- expected = TypeEngine.to_literal_type(StructuredDataset)
+ if self.uri is not None and self.val is None:
+ expected = TypeEngine.to_literal_type(DataFrame)
  await self._set_literal(expected)

  return await flyte_dataset_transformer.open_as(self.literal, self._dataframe_type, self.metadata)

  async def _set_literal(self, expected: types_pb2.LiteralType) -> None:
  """
- Explicitly set the StructuredDataset Literal to handle the following cases:
+ Explicitly set the DataFrame Literal to handle the following cases:

- 1. Read a dataframe from a StructuredDataset with an uri, for example:
+ 1. Read the content from a DataFrame with an uri, for example:

  @task
- def return_sd() -> StructuredDataset:
- sd = StructuredDataset(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
- df = sd.open(pd.DataFrame).all()
+ def return_df() -> DataFrame:
+ df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
+ df = df.open(pd.DataFrame).all()
  return df

  For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5954.
@@ -212,14 +212,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):

  For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
  """
- to_literal = await flyte_dataset_transformer.to_literal(self, StructuredDataset, expected)
+ to_literal = await flyte_dataset_transformer.to_literal(self, DataFrame, expected)
  self._literal_sd = to_literal.scalar.structured_dataset
  if self.metadata is None:
  self._metadata = self._literal_sd.metadata

  async def set_literal(self, expected: types_pb2.LiteralType) -> None:
  """
- A public wrapper method to set the StructuredDataset Literal.
+ A public wrapper method to set the DataFrame Literal.

  This method provides external access to the internal _set_literal method.
  """
@@ -256,7 +256,7 @@ def extract_cols_and_format(
  Helper function, just used to iterate through Annotations and extract out the following information:
  - base type, if not Annotated, it will just be the type that was passed in.
  - column information, as a collections.OrderedDict,
- - the storage format, as a ``StructuredDatasetFormat`` (str),
+ - the storage format, as a ``DataFrameFormat`` (str),
  - pa.lib.Schema

  If more than one of any type of thing is found, an error will be raised.
@@ -286,7 +286,7 @@ def extract_cols_and_format(
  d = collections.OrderedDict()
  d.update(aa)
  ordered_dict_cols = d
- elif isinstance(aa, StructuredDatasetFormat):
+ elif isinstance(aa, DataFrameFormat):
  if fmt != "":
  raise ValueError(f"A format was already specified {fmt}, cannot use {aa}")
  fmt = aa
@@ -305,7 +305,7 @@ def extract_cols_and_format(
  return t, ordered_dict_cols, fmt, pa_schema


- class StructuredDatasetEncoder(ABC, Generic[T]):
+ class DataFrameEncoder(ABC, Generic[T]):
  def __init__(
  self,
  python_type: Type[T],
@@ -314,10 +314,10 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  ):
  """
  Extend this abstract class, implement the encode function, and register your concrete class with the
- StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+ DataFrameTransformerEngine class in order for the core flytekit type engine to handle
  dataframe libraries. This is the encoding interface, meaning it is used when there is a Python value that the
  flytekit type engine is trying to convert into a Flyte Literal. For the other way, see
- the StructuredDatasetEncoder
+ the DataFrameEncoder

  :param python_type: The dataframe class in question that you want to register this encoder with
  :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
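For handler authors, the rename means subclassing DataFrameEncoder instead of StructuredDatasetEncoder. A minimal sketch: the encode signature follows the next hunk, but the constructor keywords beyond python_type, the proto import path, and the pandas/parquet body are all assumptions, not this package's actual built-in handler:

    import pandas as pd

    from flyteidl.core import literals_pb2, types_pb2  # assumed proto import path
    from flyte.io._dataframe import DataFrame, DataFrameEncoder  # assumed import path

    class PandasToParquetEncoder(DataFrameEncoder):
        def __init__(self):
            # Keyword names beyond python_type assumed from flytekit's historical signature.
            super().__init__(python_type=pd.DataFrame, protocol="s3", supported_format="parquet")

        async def encode(
            self,
            dataframe: DataFrame,
            structured_dataset_type: types_pb2.StructuredDatasetType,
        ) -> literals_pb2.StructuredDataset:
            # Write the wrapped value (now dataframe.val) out, then return the IDL
            # literal, echoing the type back as the docstring below asks.
            path = dataframe.uri or "s3://my-bucket/tmp/out.parquet"  # illustrative target
            dataframe.val.to_parquet(path)
            return literals_pb2.StructuredDataset(
                uri=path,
                metadata=literals_pb2.StructuredDatasetMetadata(
                    structured_dataset_type=structured_dataset_type
                ),
            )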
@@ -347,7 +347,7 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  @abstractmethod
  async def encode(
  self,
- structured_dataset: StructuredDataset,
+ dataframe: DataFrame,
  structured_dataset_type: types_pb2.StructuredDatasetType,
  ) -> literals_pb2.StructuredDataset:
  """
@@ -357,20 +357,20 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  the
  # TODO: Do we need to add a flag to indicate if it was wrapped by the transformer or by the user?

- :param structured_dataset: This is a StructuredDataset wrapper object. See more info above.
- :param structured_dataset_type: This the StructuredDatasetType, as found in the LiteralType of the interface
+ :param dataframe: This is a DataFrame wrapper object. See more info above.
+ :param structured_dataset_type: This the DataFrameType, as found in the LiteralType of the interface
  of the task that invoked this encoding call. It is passed along to encoders so that authors of encoders
- can include it in the returned literals.StructuredDataset. See the IDL for more information on why this
+ can include it in the returned literals.DataFrame. See the IDL for more information on why this
  literal in particular carries the type information along with it. If the encoder doesn't supply it, it will
  also be filled in after the encoder runs by the transformer engine.
- :return: This function should return a StructuredDataset literal object. Do not confuse this with the
- StructuredDataset wrapper class used as input to this function - that is the user facing Python class.
- This function needs to return the IDL StructuredDataset.
+ :return: This function should return a DataFrame literal object. Do not confuse this with the
+ DataFrame wrapper class used as input to this function - that is the user facing Python class.
+ This function needs to return the IDL DataFrame.
  """
  raise NotImplementedError


- class StructuredDatasetDecoder(ABC, Generic[DF]):
+ class DataFrameDecoder(ABC, Generic[DF]):
  def __init__(
  self,
  python_type: Type[DF],
@@ -380,9 +380,9 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
  ):
  """
  Extend this abstract class, implement the decode function, and register your concrete class with the
- StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+ DataFrameTransformerEngine class in order for the core flytekit type engine to handle
  dataframe libraries. This is the decoder interface, meaning it is used when there is a Flyte Literal value,
- and we have to get a Python value out of it. For the other way, see the StructuredDatasetEncoder
+ and we have to get a Python value out of it. For the other way, see the DataFrameEncoder

  :param python_type: The dataframe class in question that you want to register this decoder with
  :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -419,8 +419,8 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
  This is code that will be called by the dataset transformer engine to ultimately translate from a Flyte Literal
  value into a Python instance.

- :param flyte_value: This will be a Flyte IDL StructuredDataset Literal - do not confuse this with the
- StructuredDataset class defined also in this module.
+ :param flyte_value: This will be a Flyte IDL DataFrame Literal - do not confuse this with the
+ DataFrame class defined also in this module.
  :param current_task_metadata: Metadata object containing the type (and columns if any) for the currently
  executing task. This type may have more or less information than the type information bundled
  inside the incoming flyte_value.
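The decoding direction mirrors this. A sketch of a DataFrameDecoder subclass; the decode signature is inferred from the :param list above, and the constructor keywords, import paths, and body are assumptions:

    import pandas as pd

    from flyteidl.core import literals_pb2  # assumed proto import path
    from flyte.io._dataframe import DataFrameDecoder  # assumed import path

    class ParquetToPandasDecoder(DataFrameDecoder):
        def __init__(self):
            # Keyword names beyond python_type assumed from flytekit's historical signature.
            super().__init__(python_type=pd.DataFrame, protocol="s3", supported_format="parquet")

        async def decode(
            self,
            flyte_value: literals_pb2.StructuredDataset,
            current_task_metadata: literals_pb2.StructuredDatasetMetadata,
        ) -> pd.DataFrame:
            # A full decoder would subset columns using current_task_metadata;
            # this sketch simply reads everything at the literal's uri.
            return pd.read_parquet(flyte_value.uri)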
@@ -459,19 +459,19 @@ def get_supported_types():
  class DuplicateHandlerError(ValueError): ...


- class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
+ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
  """
  Think of this transformer as a higher-level meta transformer that is used for all the dataframe types.
  If you are bringing a custom data frame type, or any data frame type, to flytekit, instead of
  registering with the main type engine, you should register with this transformer instead.
  """

- ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetEncoder]]]] = {}
- DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetDecoder]]]] = {}
+ ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameEncoder]]]] = {}
+ DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameDecoder]]]] = {}
  DEFAULT_PROTOCOLS: ClassVar[Dict[Type, str]] = {}
  DEFAULT_FORMATS: ClassVar[Dict[Type, str]] = {}

- Handlers = Union[StructuredDatasetEncoder, StructuredDatasetDecoder]
+ Handlers = Union[DataFrameEncoder, DataFrameDecoder]
  Renderers: ClassVar[Dict[Type, Renderable]] = {}

  @classmethod
@@ -527,17 +527,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):

  @classmethod
  def get_encoder(cls, df_type: Type, protocol: str, format: str):
- return cls._finder(StructuredDatasetTransformerEngine.ENCODERS, df_type, protocol, format)
+ return cls._finder(DataFrameTransformerEngine.ENCODERS, df_type, protocol, format)

  @classmethod
- def get_decoder(cls, df_type: Type, protocol: str, format: str) -> StructuredDatasetDecoder:
- return cls._finder(StructuredDatasetTransformerEngine.DECODERS, df_type, protocol, format)
+ def get_decoder(cls, df_type: Type, protocol: str, format: str) -> DataFrameDecoder:
+ return cls._finder(DataFrameTransformerEngine.DECODERS, df_type, protocol, format)

  @classmethod
  def _handler_finder(cls, h: Handlers, protocol: str) -> Dict[str, Handlers]:
- if isinstance(h, StructuredDatasetEncoder):
+ if isinstance(h, DataFrameEncoder):
  top_level = cls.ENCODERS
- elif isinstance(h, StructuredDatasetDecoder):
+ elif isinstance(h, DataFrameDecoder):
  top_level = cls.DECODERS # type: ignore
  else:
  raise TypeError(f"We don't support this type of handler {h}")
@@ -548,7 +548,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  return top_level[h.python_type][protocol] # type: ignore

  def __init__(self):
- super().__init__("StructuredDataset Transformer", StructuredDataset)
+ super().__init__("DataFrame Transformer", DataFrame)
  self._type_assertions_enabled = False

  @classmethod
@@ -568,7 +568,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  Call this with any Encoder or Decoder to register it with the flytekit type system. If your handler does not
  specify a protocol (e.g. s3, gs, etc.) field, then

- :param h: The StructuredDatasetEncoder or StructuredDatasetDecoder you wish to register with this transformer.
+ :param h: The DataFrameEncoder or DataFrameDecoder you wish to register with this transformer.
  :param default_for_type: If set, when a user returns from a task an instance of the dataframe the handler
  handles, e.g. ``return pd.DataFrame(...)``, not wrapped around the ``StructuredDataset`` object, we will
  use this handler's protocol and format as the default, effectively saying that this handler will be called.
@@ -582,7 +582,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  :param default_storage_for_type: Same as above but only for the storage format. Error if already set,
  unless override is specified.
  """
- if not (isinstance(h, StructuredDatasetEncoder) or isinstance(h, StructuredDatasetDecoder)):
+ if not (isinstance(h, DataFrameEncoder) or isinstance(h, DataFrameDecoder)):
  raise TypeError(f"We don't support this type of handler {h}")

  if h.protocol is None:
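Registering the sketched handlers then goes through the renamed engine. A hedged example, assuming register keeps the default_for_type keyword described in the docstring above and remains callable as a classmethod:

    from flyte.io._dataframe import DataFrameTransformerEngine  # assumed import path

    # PandasToParquetEncoder / ParquetToPandasDecoder are the hypothetical
    # handlers sketched earlier on this page.
    DataFrameTransformerEngine.register(PandasToParquetEncoder(), default_for_type=True)
    DataFrameTransformerEngine.register(ParquetToPandasDecoder(), default_for_type=True)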
@@ -648,27 +648,27 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  # Register with the type engine as well
  # The semantics as of now are such that it doesn't matter which order these transformers are loaded in, as
  # long as the older Pandas/FlyteSchema transformer do not also specify the override
- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  TypeEngine.register_additional_type(engine, h.python_type, override=True)

- def assert_type(self, t: Type[StructuredDataset], v: typing.Any):
+ def assert_type(self, t: Type[DataFrame], v: typing.Any):
  return

  async def to_literal(
  self,
- python_val: Union[StructuredDataset, typing.Any],
- python_type: Union[Type[StructuredDataset], Type],
+ python_val: Union[DataFrame, typing.Any],
+ python_type: Union[Type[DataFrame], Type],
  expected: types_pb2.LiteralType,
  ) -> literals_pb2.Literal:
  # Make a copy in case we need to hand off to encoders, since we can't be sure of mutations.
  python_type, *attrs = extract_cols_and_format(python_type)
  sdt = types_pb2.StructuredDatasetType(format=self.DEFAULT_FORMATS.get(python_type, GENERIC_FORMAT))

- if issubclass(python_type, StructuredDataset) and not isinstance(python_val, StructuredDataset):
+ if issubclass(python_type, DataFrame) and not isinstance(python_val, DataFrame):
  # Catch a common mistake
  raise TypeTransformerFailedError(
- f"Expected a StructuredDataset instance, but got {type(python_val)} instead."
- f" Did you forget to wrap your dataframe in a StructuredDataset instance?"
+ f"Expected a DataFrame instance, but got {type(python_val)} instead."
+ f" Did you forget to wrap your dataframe in a DataFrame instance?"
  )

  if expected and expected.structured_dataset_type:
@@ -679,35 +679,34 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
  )

- # If the type signature has the StructuredDataset class, it will, or at least should, also be a
- # StructuredDataset instance.
- if isinstance(python_val, StructuredDataset):
+ # If the type signature has the DataFrame class, it will, or at least should, also be a
+ # DataFrame instance.
+ if isinstance(python_val, DataFrame):
  # There are three cases that we need to take care of here.

- # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
- # then return the original literals.StructuredDataset without invoking any encoder
+ # 1. A task returns a DataFrame that was just a passthrough input. If this happens
+ # then return the original literals.DataFrame without invoking any encoder
  #
  # Ex.
- # def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
+ # def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
  # return dataset
  if python_val._literal_sd is not None:
  if python_val._already_uploaded:
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
- if python_val.dataframe is not None:
+ if python_val.val is not None:
  raise ValueError(
- f"Shouldn't have specified both literal {python_val._literal_sd}"
- f" and dataframe {python_val.dataframe}"
+ f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.val}"
  )
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))

- # 2. A task returns a python StructuredDataset with an uri.
- # Note: this case is also what happens we start a local execution of a task with a python StructuredDataset.
- # It gets converted into a literal first, then back into a python StructuredDataset.
+ # 2. A task returns a python DataFrame with an uri.
+ # Note: this case is also what happens we start a local execution of a task with a python DataFrame.
+ # It gets converted into a literal first, then back into a python DataFrame.
  #
  # Ex.
- # def t2(uri: str) -> Annotated[StructuredDataset, my_cols]
- # return StructuredDataset(uri=uri)
- if python_val.dataframe is None:
+ # def t2(uri: str) -> Annotated[DataFrame, my_cols]
+ # return DataFrame(uri=uri)
+ if python_val.val is None:
  uri = python_val.uri
  file_format = python_val.file_format

@@ -718,19 +717,20 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  uri = await storage.put(uri)

  # Check the user-specified file_format
- # When users specify file_format for a StructuredDataset, the file_format should be retained
+ # When users specify file_format for a DataFrame, the file_format should be retained
  # conditionally. For details, please refer to https://github.com/flyteorg/flyte/issues/6096.
  # Following illustrates why we can't always copy the user-specified file_format over:
  #
  # @task
- # def modify_format(sd: Annotated[StructuredDataset, {}, "task-format"]) -> StructuredDataset:
- # return sd
+ # def modify_format(df: Annotated[DataFrame, {}, "task-format"]) -> DataFrame:
+ # return df
  #
- # sd = StructuredDataset(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
- # sd2 = modify_format(sd=sd)
+ # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
+ # df2 = modify_format(df=df)
  #
- # In this case, we expect sd2.file_format to be task-format (as shown in Annotated), not user-format.
- # If we directly copy the user-specified file_format over, the type hint information will be missing.
+ # In this case, we expect the df2.file_format to be task-format (as shown in Annotated),
+ # not user-format. If we directly copy the user-specified file_format over,
+ # the type hint information will be missing.
  if sdt.format == GENERIC_FORMAT and file_format != GENERIC_FORMAT:
  sdt.format = file_format

@@ -740,9 +740,9 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  )
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))

- # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
+ # 3. This is the third and probably most common case. The python DataFrame object wraps a dataframe
  # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
- df_type = type(python_val.dataframe)
+ df_type = type(python_val.val)
  protocol = self._protocol_from_type_or_prefix(df_type, python_val.uri)

  return await self.encode(
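The three branches above correspond to three user-visible return patterns. A sketch consolidating the Ex. comments, with a stand-in task decorator and an assumed import path:

    import pandas as pd
    from typing import Annotated

    from flyte.io._dataframe import DataFrame  # assumed import path

    def task(fn):  # stand-in for the SDK's task decorator, not shown on this page
        return fn

    my_cols = {"a": int}  # illustrative column map

    # Case 1: passthrough - the original literal is reused; no encoder runs.
    @task
    def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
        return dataset

    # Case 2: uri-only - no encoder runs; a local uri is uploaded, a remote one kept.
    @task
    def t2(uri: str) -> DataFrame:
        return DataFrame(uri=uri)

    # Case 3: wrapped value - an encoder is chosen by dataframe type, protocol and format.
    @task
    def t3() -> DataFrame:
        return DataFrame(val=pd.DataFrame({"a": [1, 2]}))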
@@ -760,7 +760,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  structured_dataset_type=expected.structured_dataset_type if expected else None
  )

- sd = StructuredDataset(dataframe=python_val, metadata=meta)
+ sd = DataFrame(val=python_val, metadata=meta)
  return await self.encode(sd, python_type, protocol, fmt, sdt)

  def _protocol_from_type_or_prefix(self, df_type: Type, uri: Optional[str] = None) -> str:
@@ -782,13 +782,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):

  async def encode(
  self,
- sd: StructuredDataset,
+ sd: DataFrame,
  df_type: Type,
  protocol: str,
  format: str,
  structured_literal_type: types_pb2.StructuredDatasetType,
  ) -> literals_pb2.Literal:
- handler: StructuredDatasetEncoder
+ handler: DataFrameEncoder
  handler = self.get_encoder(df_type, protocol, format)

  sd_model = await handler.encode(sd, structured_literal_type)
@@ -813,17 +813,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  sd._already_uploaded = True
  return lit

- # pr: han-ru: can this be removed if we make StructuredDataset a pydantic model?
- def dict_to_structured_dataset(
- self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ # pr: han-ru: can this be removed if we make DataFrame a pydantic model?
+ def dict_to_dataframe(
+ self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  uri = dict_obj.get("uri", None)
  file_format = dict_obj.get("file_format", None)

  if uri is None:
- raise ValueError("StructuredDataset's uri and file format should not be None")
+ raise ValueError("DataFrame's uri and file format should not be None")

- # Instead of using python native StructuredDataset, we need to build a literals.StructuredDataset
+ # Instead of using python native DataFrame, we need to build a literals.StructuredDataset
  # The reason is that _literal_sd of python sd is accessed when task output LiteralMap is
  # converted back to flyteidl. Hence, _literal_sd must have to_flyte_idl method
  # See https://github.com/flyteorg/flytekit/blob/f938661ff8413219d1bea77f6914a58c302d5c6c/flytekit/bin/entrypoint.py#L326
@@ -833,15 +833,15 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  sd_literal = literals_pb2.StructuredDataset(uri=uri, metadata=metad)

  return asyncio.run(
- StructuredDatasetTransformerEngine().to_python_value(
+ DataFrameTransformerEngine().to_python_value(
  literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_literal)),
  expected_python_type,
  )
  )

  def from_binary_idl(
- self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  """
  If the input is from flytekit, the Life Cycle will be as follows:

@@ -869,13 +869,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  """
  if binary_idl_object.tag == MESSAGEPACK:
  python_val = msgpack.loads(binary_idl_object.value)
- return self.dict_to_structured_dataset(dict_obj=python_val, expected_python_type=expected_python_type)
+ return self.dict_to_dataframe(dict_obj=python_val, expected_python_type=expected_python_type)
  else:
  raise TypeTransformerFailedError(f"Unsupported binary format: `{binary_idl_object.tag}`")

  async def to_python_value(
- self, lv: literals_pb2.Literal, expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ self, lv: literals_pb2.Literal, expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  """
  The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
  the start of a task execution, is the column subsetting behavior. For example, if you have,
@@ -913,7 +913,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  # Detect annotations and extract out all the relevant information that the user might supply
  expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)

- # Start handling for StructuredDataset scalars, first look at the columns
+ # Start handling for DataFrame scalars, first look at the columns
  incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns

  # If the incoming literal, also doesn't have columns, then we just have an empty list, so initialize here
@@ -935,10 +935,10 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  )
  metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=new_sdt)

- # A StructuredDataset type, for example
- # t1(input_a: StructuredDataset) # or
- # t1(input_a: Annotated[StructuredDataset, my_cols])
- if issubclass(expected_python_type, StructuredDataset):
+ # A DataFrame type, for example
+ # t1(input_a: DataFrame) # or
+ # t1(input_a: Annotated[DataFrame, my_cols])
+ if issubclass(expected_python_type, DataFrame):
  sd = expected_python_type(
  dataframe=None,
  # Note here that the type being passed in
@@ -953,12 +953,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  return await self.open_as(lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)

  def to_html(self, python_val: typing.Any, expected_python_type: Type[T]) -> str:
- if isinstance(python_val, StructuredDataset):
- if python_val.dataframe is not None:
- df = python_val.dataframe
+ if isinstance(python_val, DataFrame):
+ if python_val.val is not None:
+ df = python_val.val
  else:
  # Here we only render column information by default instead of opening the structured dataset.
- col = typing.cast(StructuredDataset, python_val).columns()
+ col = typing.cast(DataFrame, python_val).columns()
  dataframe = pd.DataFrame(col, ["column type"])
  return dataframe.to_html() # type: ignore
  else:
@@ -1004,11 +1004,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  def _get_dataset_column_literal_type(self, t: Type) -> types_pb2.LiteralType:
  if t in get_supported_types():
  return get_supported_types()[t]
- if hasattr(t, "__origin__") and t.__origin__ is list:
+ origin = getattr(t, "__origin__", None)
+ if origin is list:
  return types_pb2.LiteralType(collection_type=self._get_dataset_column_literal_type(t.__args__[0]))
- if hasattr(t, "__origin__") and t.__origin__ is dict:
+ if origin is dict:
  return types_pb2.LiteralType(map_value_type=self._get_dataset_column_literal_type(t.__args__[1]))
- raise AssertionError(f"type {t} is currently not supported by StructuredDataset")
+ raise AssertionError(f"type {t} is currently not supported by DataFrame")

  def _convert_ordered_dict_of_columns_to_list(
  self, column_map: typing.Optional[typing.OrderedDict[str, Type]]
@@ -1022,9 +1023,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  converted_cols.append(types_pb2.StructuredDatasetType.DatasetColumn(name=k, literal_type=lt))
  return converted_cols

- def _get_dataset_type(
- self, t: typing.Union[Type[StructuredDataset], typing.Any]
- ) -> types_pb2.StructuredDatasetType:
+ def _get_dataset_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.StructuredDatasetType:
  original_python_type, column_map, storage_format, pa_schema = extract_cols_and_format(t) # type: ignore

  # Get the column information
@@ -1039,7 +1038,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  external_schema_bytes=typing.cast(pa.lib.Schema, pa_schema).to_string().encode() if pa_schema else None,
  )

- def get_literal_type(self, t: typing.Union[Type[StructuredDataset], typing.Any]) -> types_pb2.LiteralType:
+ def get_literal_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.LiteralType:
  """
  Provide a concrete implementation so that writers of custom dataframe handlers since there's nothing that
  special about the literal type. Any dataframe type will always be associated with the structured dataset type.
@@ -1049,13 +1048,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  """
  return types_pb2.LiteralType(structured_dataset_type=self._get_dataset_type(t))

- def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[StructuredDataset]:
+ def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[DataFrame]:
  # todo: technically we should return the dataframe type specified in the constructor, but to do that,
  # we'd have to store that, which we don't do today. See possibly #1363
- if literal_type.HasField("structured_dataset_type"):
- return StructuredDataset
- raise ValueError(f"StructuredDatasetTransformerEngine cannot reverse {literal_type}")
+ if literal_type.HasField("dataframe_type"):
+ return DataFrame
+ raise ValueError(f"DataFrameTransformerEngine cannot reverse {literal_type}")


- flyte_dataset_transformer = StructuredDatasetTransformerEngine()
+ flyte_dataset_transformer = DataFrameTransformerEngine()
  TypeEngine.register(flyte_dataset_transformer)
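In short, downstream code migrates by swapping names. A hedged before/after sketch (whether these symbols are re-exported anywhere outside the private modules is an assumption):

    import pandas as pd

    # 0.2.0b32 (old names, in flyte.io._structured_dataset):
    #   sd = StructuredDataset(dataframe=pd.DataFrame({"a": [1]}))
    #   _ = sd.dataframe

    # 0.2.0b34 (new names, in flyte.io._dataframe):
    from flyte.io._dataframe import DataFrame  # assumed import path
    df = DataFrame(val=pd.DataFrame({"a": [1]}))
    _ = df.val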