flyte 0.2.0b32__py3-none-any.whl → 0.2.0b34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

This version of flyte might be problematic.

@@ -35,23 +35,23 @@ else:
  pd = lazy_module("pandas")
  pa = lazy_module("pyarrow")

- T = typing.TypeVar("T") # StructuredDataset type or a dataframe type
+ T = typing.TypeVar("T") # DataFrame type or a dataframe type
  DF = typing.TypeVar("DF") # Dataframe type

- # For specifying the storage formats of StructuredDatasets. It's just a string, nothing fancy.
- StructuredDatasetFormat: TypeAlias = str
+ # For specifying the storage formats of DataFrames. It's just a string, nothing fancy.
+ DataFrameFormat: TypeAlias = str

  # Storage formats
- PARQUET: StructuredDatasetFormat = "parquet"
- CSV: StructuredDatasetFormat = "csv"
- GENERIC_FORMAT: StructuredDatasetFormat = ""
+ PARQUET: DataFrameFormat = "parquet"
+ CSV: DataFrameFormat = "csv"
+ GENERIC_FORMAT: DataFrameFormat = ""
  GENERIC_PROTOCOL: str = "generic protocol"


  @dataclass
- class StructuredDataset(SerializableType, DataClassJSONMixin):
+ class DataFrame(SerializableType, DataClassJSONMixin):
  """
- This is the user facing StructuredDataset class. Please don't confuse it with the literals.StructuredDataset
+ This is the user facing DataFrame class. Please don't confuse it with the literals.StructuredDataset
  class (that is just a model, a Python class representation of the protobuf).
  """

@@ -62,9 +62,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  def _serialize(self) -> Dict[str, Optional[str]]:
  # dataclass case
  lt = TypeEngine.to_literal_type(type(self))
- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  lv = loop_manager.run_sync(engine.to_literal, self, type(self), lt)
- sd = StructuredDataset(uri=lv.scalar.structured_dataset.uri)
+ sd = DataFrame(uri=lv.scalar.structured_dataset.uri)
  sd.file_format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
  return {
  "uri": sd.uri,
@@ -72,14 +72,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  }

  @classmethod
- def _deserialize(cls, value) -> "StructuredDataset":
+ def _deserialize(cls, value) -> "DataFrame":
  uri = value.get("uri", None)
  file_format = value.get("file_format", None)

  if uri is None:
- raise ValueError("StructuredDataset's uri and file format should not be None")
+ raise ValueError("DataFrame's uri and file format should not be None")

- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  return loop_manager.run_sync(
  engine.to_python_value,
  literals_pb2.Literal(
@@ -96,9 +96,9 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  )

  @model_serializer
- def serialize_structured_dataset(self) -> Dict[str, Optional[str]]:
+ def serialize_dataframe(self) -> Dict[str, Optional[str]]:
  lt = TypeEngine.to_literal_type(type(self))
- sde = StructuredDatasetTransformerEngine()
+ sde = DataFrameTransformerEngine()
  lv = loop_manager.run_sync(sde.to_literal, self, type(self), lt)
  return {
  "uri": lv.scalar.structured_dataset.uri,
@@ -106,11 +106,11 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  }

  @model_validator(mode="after")
- def deserialize_structured_dataset(self, info) -> StructuredDataset:
+ def deserialize_dataframe(self, info) -> DataFrame:
  if info.context is None or info.context.get("deserialize") is not True:
  return self

- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  return loop_manager.run_sync(
  engine.to_python_value,
  literals_pb2.Literal(
@@ -136,12 +136,12 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):

  def __init__(
  self,
- dataframe: typing.Optional[typing.Any] = None,
+ val: typing.Optional[typing.Any] = None,
  uri: typing.Optional[str] = None,
  metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = None,
  **kwargs,
  ):
- self._dataframe = dataframe
+ self._val = val
  # Make these fields public, so that the dataclass transformer can set a value for it
  # https://github.com/flyteorg/flytekit/blob/bcc8541bd6227b532f8462563fe8aac902242b21/flytekit/core/type_engine.py#L298
  self.uri = uri
@@ -156,8 +156,8 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  self._already_uploaded = False

  @property
- def dataframe(self) -> Optional[DF]:
- return self._dataframe
+ def val(self) -> Optional[DF]:
+ return self._val

  @property
  def metadata(self) -> Optional[literals_pb2.StructuredDatasetMetadata]:
@@ -168,18 +168,18 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  return self._literal_sd

  def open(self, dataframe_type: Type[DF]):
- from flyte.io._structured_dataset import lazy_import_structured_dataset_handler
-
  """
  Load the handler if needed. For the use case like:
  @task
- def t1(sd: StructuredDataset):
+ def t1(df: DataFrame):
  import pandas as pd
- sd.open(pd.DataFrame).all()
+ df.open(pd.DataFrame).all()

- pandas is imported inside the task, so pandas handler won't be loaded during deserialization in type engine.
+ pandas is imported inside the task, so panda handler won't be loaded during deserialization in type engine.
  """
- lazy_import_structured_dataset_handler()
+ from flyte.io._dataframe import lazy_import_dataframe_handler
+
+ lazy_import_dataframe_handler()
  self._dataframe_type = dataframe_type
  return self

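Taken together, the renames above change both how a value is wrapped (the constructor keyword is now val, not dataframe) and how it is read back. A hedged usage sketch; the task decorator is a stand-in, since the SDK's real decorator is not shown in this diff:

    import pandas as pd

    from flyte.io._dataframe import DataFrame  # assumed import path

    def task(fn):  # stand-in for the SDK's task decorator, not shown on this page
        return fn

    @task
    def t1(df: DataFrame) -> DataFrame:
        # Reading: open() only records the target type; .all() performs the decode.
        pandas_df = df.open(pd.DataFrame).all()
        # Wrapping: pass the dataframe as `val`; the `.val` property returns it.
        return DataFrame(val=pandas_df)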
@@ -187,22 +187,22 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
  if self._dataframe_type is None:
  raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")

- if self.uri is not None and self.dataframe is None:
- expected = TypeEngine.to_literal_type(StructuredDataset)
+ if self.uri is not None and self.val is None:
+ expected = TypeEngine.to_literal_type(DataFrame)
  await self._set_literal(expected)

  return await flyte_dataset_transformer.open_as(self.literal, self._dataframe_type, self.metadata)

  async def _set_literal(self, expected: types_pb2.LiteralType) -> None:
  """
- Explicitly set the StructuredDataset Literal to handle the following cases:
+ Explicitly set the DataFrame Literal to handle the following cases:

- 1. Read a dataframe from a StructuredDataset with an uri, for example:
+ 1. Read the content from a DataFrame with an uri, for example:

  @task
- def return_sd() -> StructuredDataset:
- sd = StructuredDataset(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
- df = sd.open(pd.DataFrame).all()
+ def return_df() -> DataFrame:
+ df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
+ df = df.open(pd.DataFrame).all()
  return df

  For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5954.
@@ -212,14 +212,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):

  For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
  """
- to_literal = await flyte_dataset_transformer.to_literal(self, StructuredDataset, expected)
+ to_literal = await flyte_dataset_transformer.to_literal(self, DataFrame, expected)
  self._literal_sd = to_literal.scalar.structured_dataset
  if self.metadata is None:
  self._metadata = self._literal_sd.metadata

  async def set_literal(self, expected: types_pb2.LiteralType) -> None:
  """
- A public wrapper method to set the StructuredDataset Literal.
+ A public wrapper method to set the DataFrame Literal.

  This method provides external access to the internal _set_literal method.
  """
@@ -256,7 +256,7 @@ def extract_cols_and_format(
  Helper function, just used to iterate through Annotations and extract out the following information:
  - base type, if not Annotated, it will just be the type that was passed in.
  - column information, as a collections.OrderedDict,
- - the storage format, as a ``StructuredDatasetFormat`` (str),
+ - the storage format, as a ``DataFrameFormat`` (str),
  - pa.lib.Schema

  If more than one of any type of thing is found, an error will be raised.
@@ -286,7 +286,7 @@ def extract_cols_and_format(
  d = collections.OrderedDict()
  d.update(aa)
  ordered_dict_cols = d
- elif isinstance(aa, StructuredDatasetFormat):
+ elif isinstance(aa, DataFrameFormat):
  if fmt != "":
  raise ValueError(f"A format was already specified {fmt}, cannot use {aa}")
  fmt = aa
@@ -305,7 +305,7 @@ def extract_cols_and_format(
  return t, ordered_dict_cols, fmt, pa_schema


- class StructuredDatasetEncoder(ABC, Generic[T]):
+ class DataFrameEncoder(ABC, Generic[T]):
  def __init__(
  self,
  python_type: Type[T],
@@ -314,10 +314,10 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  ):
  """
  Extend this abstract class, implement the encode function, and register your concrete class with the
- StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+ DataFrameTransformerEngine class in order for the core flytekit type engine to handle
  dataframe libraries. This is the encoding interface, meaning it is used when there is a Python value that the
  flytekit type engine is trying to convert into a Flyte Literal. For the other way, see
- the StructuredDatasetEncoder
+ the DataFrameEncoder

  :param python_type: The dataframe class in question that you want to register this encoder with
  :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
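For handler authors, the rename means subclassing DataFrameEncoder instead of StructuredDatasetEncoder. A minimal sketch: the encode signature follows the next hunk, but the constructor keywords beyond python_type, the proto import path, and the pandas/parquet body are all assumptions, not this package's actual built-in handler:

    import pandas as pd

    from flyteidl.core import literals_pb2, types_pb2  # assumed proto import path
    from flyte.io._dataframe import DataFrame, DataFrameEncoder  # assumed import path

    class PandasToParquetEncoder(DataFrameEncoder):
        def __init__(self):
            # Keyword names beyond python_type assumed from flytekit's historical signature.
            super().__init__(python_type=pd.DataFrame, protocol="s3", supported_format="parquet")

        async def encode(
            self,
            dataframe: DataFrame,
            structured_dataset_type: types_pb2.StructuredDatasetType,
        ) -> literals_pb2.StructuredDataset:
            # Write the wrapped value (now dataframe.val) out, then return the IDL
            # literal, echoing the type back as the docstring below asks.
            path = dataframe.uri or "s3://my-bucket/tmp/out.parquet"  # illustrative target
            dataframe.val.to_parquet(path)
            return literals_pb2.StructuredDataset(
                uri=path,
                metadata=literals_pb2.StructuredDatasetMetadata(
                    structured_dataset_type=structured_dataset_type
                ),
            )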
@@ -347,7 +347,7 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  @abstractmethod
  async def encode(
  self,
- structured_dataset: StructuredDataset,
+ dataframe: DataFrame,
  structured_dataset_type: types_pb2.StructuredDatasetType,
  ) -> literals_pb2.StructuredDataset:
  """
@@ -357,20 +357,20 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
  the
  # TODO: Do we need to add a flag to indicate if it was wrapped by the transformer or by the user?

- :param structured_dataset: This is a StructuredDataset wrapper object. See more info above.
- :param structured_dataset_type: This the StructuredDatasetType, as found in the LiteralType of the interface
+ :param dataframe: This is a DataFrame wrapper object. See more info above.
+ :param structured_dataset_type: This the DataFrameType, as found in the LiteralType of the interface
  of the task that invoked this encoding call. It is passed along to encoders so that authors of encoders
- can include it in the returned literals.StructuredDataset. See the IDL for more information on why this
+ can include it in the returned literals.DataFrame. See the IDL for more information on why this
  literal in particular carries the type information along with it. If the encoder doesn't supply it, it will
  also be filled in after the encoder runs by the transformer engine.
- :return: This function should return a StructuredDataset literal object. Do not confuse this with the
- StructuredDataset wrapper class used as input to this function - that is the user facing Python class.
- This function needs to return the IDL StructuredDataset.
+ :return: This function should return a DataFrame literal object. Do not confuse this with the
+ DataFrame wrapper class used as input to this function - that is the user facing Python class.
+ This function needs to return the IDL DataFrame.
  """
  raise NotImplementedError


- class StructuredDatasetDecoder(ABC, Generic[DF]):
+ class DataFrameDecoder(ABC, Generic[DF]):
  def __init__(
  self,
  python_type: Type[DF],
@@ -380,9 +380,9 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
  ):
  """
  Extend this abstract class, implement the decode function, and register your concrete class with the
- StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+ DataFrameTransformerEngine class in order for the core flytekit type engine to handle
  dataframe libraries. This is the decoder interface, meaning it is used when there is a Flyte Literal value,
- and we have to get a Python value out of it. For the other way, see the StructuredDatasetEncoder
+ and we have to get a Python value out of it. For the other way, see the DataFrameEncoder

  :param python_type: The dataframe class in question that you want to register this decoder with
  :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -419,8 +419,8 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
  This is code that will be called by the dataset transformer engine to ultimately translate from a Flyte Literal
  value into a Python instance.

- :param flyte_value: This will be a Flyte IDL StructuredDataset Literal - do not confuse this with the
- StructuredDataset class defined also in this module.
+ :param flyte_value: This will be a Flyte IDL DataFrame Literal - do not confuse this with the
+ DataFrame class defined also in this module.
  :param current_task_metadata: Metadata object containing the type (and columns if any) for the currently
  executing task. This type may have more or less information than the type information bundled
  inside the incoming flyte_value.
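The decoding direction mirrors this. A sketch of a DataFrameDecoder subclass; the decode signature is inferred from the :param list above, and the constructor keywords, import paths, and body are assumptions:

    import pandas as pd

    from flyteidl.core import literals_pb2  # assumed proto import path
    from flyte.io._dataframe import DataFrameDecoder  # assumed import path

    class ParquetToPandasDecoder(DataFrameDecoder):
        def __init__(self):
            # Keyword names beyond python_type assumed from flytekit's historical signature.
            super().__init__(python_type=pd.DataFrame, protocol="s3", supported_format="parquet")

        async def decode(
            self,
            flyte_value: literals_pb2.StructuredDataset,
            current_task_metadata: literals_pb2.StructuredDatasetMetadata,
        ) -> pd.DataFrame:
            # A full decoder would subset columns using current_task_metadata;
            # this sketch simply reads everything at the literal's uri.
            return pd.read_parquet(flyte_value.uri)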
@@ -459,19 +459,19 @@ def get_supported_types():
  class DuplicateHandlerError(ValueError): ...


- class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
+ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
  """
  Think of this transformer as a higher-level meta transformer that is used for all the dataframe types.
  If you are bringing a custom data frame type, or any data frame type, to flytekit, instead of
  registering with the main type engine, you should register with this transformer instead.
  """

- ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetEncoder]]]] = {}
- DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetDecoder]]]] = {}
+ ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameEncoder]]]] = {}
+ DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameDecoder]]]] = {}
  DEFAULT_PROTOCOLS: ClassVar[Dict[Type, str]] = {}
  DEFAULT_FORMATS: ClassVar[Dict[Type, str]] = {}

- Handlers = Union[StructuredDatasetEncoder, StructuredDatasetDecoder]
+ Handlers = Union[DataFrameEncoder, DataFrameDecoder]
  Renderers: ClassVar[Dict[Type, Renderable]] = {}

  @classmethod
@@ -527,17 +527,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):

  @classmethod
  def get_encoder(cls, df_type: Type, protocol: str, format: str):
- return cls._finder(StructuredDatasetTransformerEngine.ENCODERS, df_type, protocol, format)
+ return cls._finder(DataFrameTransformerEngine.ENCODERS, df_type, protocol, format)

  @classmethod
- def get_decoder(cls, df_type: Type, protocol: str, format: str) -> StructuredDatasetDecoder:
- return cls._finder(StructuredDatasetTransformerEngine.DECODERS, df_type, protocol, format)
+ def get_decoder(cls, df_type: Type, protocol: str, format: str) -> DataFrameDecoder:
+ return cls._finder(DataFrameTransformerEngine.DECODERS, df_type, protocol, format)

  @classmethod
  def _handler_finder(cls, h: Handlers, protocol: str) -> Dict[str, Handlers]:
- if isinstance(h, StructuredDatasetEncoder):
+ if isinstance(h, DataFrameEncoder):
  top_level = cls.ENCODERS
- elif isinstance(h, StructuredDatasetDecoder):
+ elif isinstance(h, DataFrameDecoder):
  top_level = cls.DECODERS # type: ignore
  else:
  raise TypeError(f"We don't support this type of handler {h}")
@@ -548,7 +548,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  return top_level[h.python_type][protocol] # type: ignore

  def __init__(self):
- super().__init__("StructuredDataset Transformer", StructuredDataset)
+ super().__init__("DataFrame Transformer", DataFrame)
  self._type_assertions_enabled = False

  @classmethod
@@ -568,7 +568,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  Call this with any Encoder or Decoder to register it with the flytekit type system. If your handler does not
  specify a protocol (e.g. s3, gs, etc.) field, then

- :param h: The StructuredDatasetEncoder or StructuredDatasetDecoder you wish to register with this transformer.
+ :param h: The DataFrameEncoder or DataFrameDecoder you wish to register with this transformer.
  :param default_for_type: If set, when a user returns from a task an instance of the dataframe the handler
  handles, e.g. ``return pd.DataFrame(...)``, not wrapped around the ``StructuredDataset`` object, we will
  use this handler's protocol and format as the default, effectively saying that this handler will be called.
@@ -582,7 +582,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  :param default_storage_for_type: Same as above but only for the storage format. Error if already set,
  unless override is specified.
  """
- if not (isinstance(h, StructuredDatasetEncoder) or isinstance(h, StructuredDatasetDecoder)):
+ if not (isinstance(h, DataFrameEncoder) or isinstance(h, DataFrameDecoder)):
  raise TypeError(f"We don't support this type of handler {h}")

  if h.protocol is None:
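Registering the sketched handlers then goes through the renamed engine. A hedged example, assuming register keeps the default_for_type keyword described in the docstring above and remains callable as a classmethod:

    from flyte.io._dataframe import DataFrameTransformerEngine  # assumed import path

    # PandasToParquetEncoder / ParquetToPandasDecoder are the hypothetical
    # handlers sketched earlier on this page.
    DataFrameTransformerEngine.register(PandasToParquetEncoder(), default_for_type=True)
    DataFrameTransformerEngine.register(ParquetToPandasDecoder(), default_for_type=True)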
@@ -648,27 +648,27 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  # Register with the type engine as well
  # The semantics as of now are such that it doesn't matter which order these transformers are loaded in, as
  # long as the older Pandas/FlyteSchema transformer do not also specify the override
- engine = StructuredDatasetTransformerEngine()
+ engine = DataFrameTransformerEngine()
  TypeEngine.register_additional_type(engine, h.python_type, override=True)

- def assert_type(self, t: Type[StructuredDataset], v: typing.Any):
+ def assert_type(self, t: Type[DataFrame], v: typing.Any):
  return

  async def to_literal(
  self,
- python_val: Union[StructuredDataset, typing.Any],
- python_type: Union[Type[StructuredDataset], Type],
+ python_val: Union[DataFrame, typing.Any],
+ python_type: Union[Type[DataFrame], Type],
  expected: types_pb2.LiteralType,
  ) -> literals_pb2.Literal:
  # Make a copy in case we need to hand off to encoders, since we can't be sure of mutations.
  python_type, *attrs = extract_cols_and_format(python_type)
  sdt = types_pb2.StructuredDatasetType(format=self.DEFAULT_FORMATS.get(python_type, GENERIC_FORMAT))

- if issubclass(python_type, StructuredDataset) and not isinstance(python_val, StructuredDataset):
+ if issubclass(python_type, DataFrame) and not isinstance(python_val, DataFrame):
  # Catch a common mistake
  raise TypeTransformerFailedError(
- f"Expected a StructuredDataset instance, but got {type(python_val)} instead."
- f" Did you forget to wrap your dataframe in a StructuredDataset instance?"
+ f"Expected a DataFrame instance, but got {type(python_val)} instead."
+ f" Did you forget to wrap your dataframe in a DataFrame instance?"
  )

  if expected and expected.structured_dataset_type:
@@ -679,35 +679,34 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
  )

- # If the type signature has the StructuredDataset class, it will, or at least should, also be a
- # StructuredDataset instance.
- if isinstance(python_val, StructuredDataset):
+ # If the type signature has the DataFrame class, it will, or at least should, also be a
+ # DataFrame instance.
+ if isinstance(python_val, DataFrame):
  # There are three cases that we need to take care of here.

- # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
- # then return the original literals.StructuredDataset without invoking any encoder
+ # 1. A task returns a DataFrame that was just a passthrough input. If this happens
+ # then return the original literals.DataFrame without invoking any encoder
  #
  # Ex.
- # def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
+ # def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
  # return dataset
  if python_val._literal_sd is not None:
  if python_val._already_uploaded:
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
- if python_val.dataframe is not None:
+ if python_val.val is not None:
  raise ValueError(
- f"Shouldn't have specified both literal {python_val._literal_sd}"
- f" and dataframe {python_val.dataframe}"
+ f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.val}"
  )
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))

- # 2. A task returns a python StructuredDataset with an uri.
- # Note: this case is also what happens we start a local execution of a task with a python StructuredDataset.
- # It gets converted into a literal first, then back into a python StructuredDataset.
+ # 2. A task returns a python DataFrame with an uri.
+ # Note: this case is also what happens we start a local execution of a task with a python DataFrame.
+ # It gets converted into a literal first, then back into a python DataFrame.
  #
  # Ex.
- # def t2(uri: str) -> Annotated[StructuredDataset, my_cols]
- # return StructuredDataset(uri=uri)
- if python_val.dataframe is None:
+ # def t2(uri: str) -> Annotated[DataFrame, my_cols]
+ # return DataFrame(uri=uri)
+ if python_val.val is None:
  uri = python_val.uri
  file_format = python_val.file_format

@@ -718,19 +717,20 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  uri = await storage.put(uri)

  # Check the user-specified file_format
- # When users specify file_format for a StructuredDataset, the file_format should be retained
+ # When users specify file_format for a DataFrame, the file_format should be retained
  # conditionally. For details, please refer to https://github.com/flyteorg/flyte/issues/6096.
  # Following illustrates why we can't always copy the user-specified file_format over:
  #
  # @task
- # def modify_format(sd: Annotated[StructuredDataset, {}, "task-format"]) -> StructuredDataset:
- # return sd
+ # def modify_format(df: Annotated[DataFrame, {}, "task-format"]) -> DataFrame:
+ # return df
  #
- # sd = StructuredDataset(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
- # sd2 = modify_format(sd=sd)
+ # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
+ # df2 = modify_format(df=df)
  #
- # In this case, we expect sd2.file_format to be task-format (as shown in Annotated), not user-format.
- # If we directly copy the user-specified file_format over, the type hint information will be missing.
+ # In this case, we expect the df2.file_format to be task-format (as shown in Annotated),
+ # not user-format. If we directly copy the user-specified file_format over,
+ # the type hint information will be missing.
  if sdt.format == GENERIC_FORMAT and file_format != GENERIC_FORMAT:
  sdt.format = file_format

@@ -740,9 +740,9 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  )
  return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))

- # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
+ # 3. This is the third and probably most common case. The python DataFrame object wraps a dataframe
  # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
- df_type = type(python_val.dataframe)
+ df_type = type(python_val.val)
  protocol = self._protocol_from_type_or_prefix(df_type, python_val.uri)

  return await self.encode(
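The three branches above correspond to three user-visible return patterns. A sketch consolidating the Ex. comments, with a stand-in task decorator and an assumed import path:

    import pandas as pd
    from typing import Annotated

    from flyte.io._dataframe import DataFrame  # assumed import path

    def task(fn):  # stand-in for the SDK's task decorator, not shown on this page
        return fn

    my_cols = {"a": int}  # illustrative column map

    # Case 1: passthrough - the original literal is reused; no encoder runs.
    @task
    def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
        return dataset

    # Case 2: uri-only - no encoder runs; a local uri is uploaded, a remote one kept.
    @task
    def t2(uri: str) -> DataFrame:
        return DataFrame(uri=uri)

    # Case 3: wrapped value - an encoder is chosen by dataframe type, protocol and format.
    @task
    def t3() -> DataFrame:
        return DataFrame(val=pd.DataFrame({"a": [1, 2]}))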
@@ -760,7 +760,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  structured_dataset_type=expected.structured_dataset_type if expected else None
  )

- sd = StructuredDataset(dataframe=python_val, metadata=meta)
+ sd = DataFrame(val=python_val, metadata=meta)
  return await self.encode(sd, python_type, protocol, fmt, sdt)

  def _protocol_from_type_or_prefix(self, df_type: Type, uri: Optional[str] = None) -> str:
@@ -782,13 +782,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):

  async def encode(
  self,
- sd: StructuredDataset,
+ sd: DataFrame,
  df_type: Type,
  protocol: str,
  format: str,
  structured_literal_type: types_pb2.StructuredDatasetType,
  ) -> literals_pb2.Literal:
- handler: StructuredDatasetEncoder
+ handler: DataFrameEncoder
  handler = self.get_encoder(df_type, protocol, format)

  sd_model = await handler.encode(sd, structured_literal_type)
@@ -813,17 +813,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  sd._already_uploaded = True
  return lit

- # pr: han-ru: can this be removed if we make StructuredDataset a pydantic model?
- def dict_to_structured_dataset(
- self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ # pr: han-ru: can this be removed if we make DataFrame a pydantic model?
+ def dict_to_dataframe(
+ self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  uri = dict_obj.get("uri", None)
  file_format = dict_obj.get("file_format", None)

  if uri is None:
- raise ValueError("StructuredDataset's uri and file format should not be None")
+ raise ValueError("DataFrame's uri and file format should not be None")

- # Instead of using python native StructuredDataset, we need to build a literals.StructuredDataset
+ # Instead of using python native DataFrame, we need to build a literals.StructuredDataset
  # The reason is that _literal_sd of python sd is accessed when task output LiteralMap is
  # converted back to flyteidl. Hence, _literal_sd must have to_flyte_idl method
  # See https://github.com/flyteorg/flytekit/blob/f938661ff8413219d1bea77f6914a58c302d5c6c/flytekit/bin/entrypoint.py#L326
@@ -833,15 +833,15 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  sd_literal = literals_pb2.StructuredDataset(uri=uri, metadata=metad)

  return asyncio.run(
- StructuredDatasetTransformerEngine().to_python_value(
+ DataFrameTransformerEngine().to_python_value(
  literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_literal)),
  expected_python_type,
  )
  )

  def from_binary_idl(
- self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  """
  If the input is from flytekit, the Life Cycle will be as follows:

@@ -869,13 +869,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  """
  if binary_idl_object.tag == MESSAGEPACK:
  python_val = msgpack.loads(binary_idl_object.value)
- return self.dict_to_structured_dataset(dict_obj=python_val, expected_python_type=expected_python_type)
+ return self.dict_to_dataframe(dict_obj=python_val, expected_python_type=expected_python_type)
  else:
  raise TypeTransformerFailedError(f"Unsupported binary format: `{binary_idl_object.tag}`")

  async def to_python_value(
- self, lv: literals_pb2.Literal, expected_python_type: Type[T] | StructuredDataset
- ) -> T | StructuredDataset:
+ self, lv: literals_pb2.Literal, expected_python_type: Type[T] | DataFrame
+ ) -> T | DataFrame:
  """
  The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
  the start of a task execution, is the column subsetting behavior. For example, if you have,
@@ -913,7 +913,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  # Detect annotations and extract out all the relevant information that the user might supply
  expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)

- # Start handling for StructuredDataset scalars, first look at the columns
+ # Start handling for DataFrame scalars, first look at the columns
  incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns

  # If the incoming literal, also doesn't have columns, then we just have an empty list, so initialize here
@@ -935,10 +935,10 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  )
  metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=new_sdt)

- # A StructuredDataset type, for example
- # t1(input_a: StructuredDataset) # or
- # t1(input_a: Annotated[StructuredDataset, my_cols])
- if issubclass(expected_python_type, StructuredDataset):
+ # A DataFrame type, for example
+ # t1(input_a: DataFrame) # or
+ # t1(input_a: Annotated[DataFrame, my_cols])
+ if issubclass(expected_python_type, DataFrame):
  sd = expected_python_type(
  dataframe=None,
  # Note here that the type being passed in
@@ -953,12 +953,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  return await self.open_as(lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)

  def to_html(self, python_val: typing.Any, expected_python_type: Type[T]) -> str:
- if isinstance(python_val, StructuredDataset):
- if python_val.dataframe is not None:
- df = python_val.dataframe
+ if isinstance(python_val, DataFrame):
+ if python_val.val is not None:
+ df = python_val.val
  else:
  # Here we only render column information by default instead of opening the structured dataset.
- col = typing.cast(StructuredDataset, python_val).columns()
+ col = typing.cast(DataFrame, python_val).columns()
  dataframe = pd.DataFrame(col, ["column type"])
  return dataframe.to_html() # type: ignore
  else:
@@ -1004,11 +1004,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  def _get_dataset_column_literal_type(self, t: Type) -> types_pb2.LiteralType:
  if t in get_supported_types():
  return get_supported_types()[t]
- if hasattr(t, "__origin__") and t.__origin__ is list:
+ origin = getattr(t, "__origin__", None)
+ if origin is list:
  return types_pb2.LiteralType(collection_type=self._get_dataset_column_literal_type(t.__args__[0]))
- if hasattr(t, "__origin__") and t.__origin__ is dict:
+ if origin is dict:
  return types_pb2.LiteralType(map_value_type=self._get_dataset_column_literal_type(t.__args__[1]))
- raise AssertionError(f"type {t} is currently not supported by StructuredDataset")
+ raise AssertionError(f"type {t} is currently not supported by DataFrame")

  def _convert_ordered_dict_of_columns_to_list(
  self, column_map: typing.Optional[typing.OrderedDict[str, Type]]
@@ -1022,9 +1023,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  converted_cols.append(types_pb2.StructuredDatasetType.DatasetColumn(name=k, literal_type=lt))
  return converted_cols

- def _get_dataset_type(
- self, t: typing.Union[Type[StructuredDataset], typing.Any]
- ) -> types_pb2.StructuredDatasetType:
+ def _get_dataset_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.StructuredDatasetType:
  original_python_type, column_map, storage_format, pa_schema = extract_cols_and_format(t) # type: ignore

  # Get the column information
@@ -1039,7 +1038,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  external_schema_bytes=typing.cast(pa.lib.Schema, pa_schema).to_string().encode() if pa_schema else None,
  )

- def get_literal_type(self, t: typing.Union[Type[StructuredDataset], typing.Any]) -> types_pb2.LiteralType:
+ def get_literal_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.LiteralType:
  """
  Provide a concrete implementation so that writers of custom dataframe handlers since there's nothing that
  special about the literal type. Any dataframe type will always be associated with the structured dataset type.
@@ -1049,13 +1048,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
  """
  return types_pb2.LiteralType(structured_dataset_type=self._get_dataset_type(t))

- def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[StructuredDataset]:
+ def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[DataFrame]:
  # todo: technically we should return the dataframe type specified in the constructor, but to do that,
  # we'd have to store that, which we don't do today. See possibly #1363
- if literal_type.HasField("structured_dataset_type"):
- return StructuredDataset
- raise ValueError(f"StructuredDatasetTransformerEngine cannot reverse {literal_type}")
+ if literal_type.HasField("dataframe_type"):
+ return DataFrame
+ raise ValueError(f"DataFrameTransformerEngine cannot reverse {literal_type}")


- flyte_dataset_transformer = StructuredDatasetTransformerEngine()
+ flyte_dataset_transformer = DataFrameTransformerEngine()
  TypeEngine.register(flyte_dataset_transformer)
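In short, downstream code migrates by swapping names. A hedged before/after sketch (whether these symbols are re-exported anywhere outside the private modules is an assumption):

    import pandas as pd

    # 0.2.0b32 (old names, in flyte.io._structured_dataset):
    #   sd = StructuredDataset(dataframe=pd.DataFrame({"a": [1]}))
    #   _ = sd.dataframe

    # 0.2.0b34 (new names, in flyte.io._dataframe):
    from flyte.io._dataframe import DataFrame  # assumed import path
    df = DataFrame(val=pd.DataFrame({"a": [1]}))
    _ = df.val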