flyte 2.0.0b17__py3-none-any.whl → 2.0.0b19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic.
- flyte/_bin/runtime.py +3 -0
- flyte/_debug/vscode.py +4 -2
- flyte/_deploy.py +3 -1
- flyte/_environment.py +15 -6
- flyte/_hash.py +1 -16
- flyte/_image.py +6 -1
- flyte/_initialize.py +15 -16
- flyte/_internal/controllers/__init__.py +4 -5
- flyte/_internal/controllers/_local_controller.py +5 -5
- flyte/_internal/controllers/remote/_controller.py +21 -28
- flyte/_internal/controllers/remote/_core.py +1 -1
- flyte/_internal/imagebuild/docker_builder.py +31 -23
- flyte/_internal/imagebuild/remote_builder.py +37 -10
- flyte/_internal/imagebuild/utils.py +2 -1
- flyte/_internal/runtime/convert.py +69 -2
- flyte/_internal/runtime/taskrunner.py +4 -1
- flyte/_logging.py +110 -26
- flyte/_map.py +90 -12
- flyte/_pod.py +2 -1
- flyte/_run.py +6 -1
- flyte/_task.py +3 -0
- flyte/_task_environment.py +5 -1
- flyte/_trace.py +5 -0
- flyte/_version.py +3 -3
- flyte/cli/_create.py +4 -1
- flyte/cli/_deploy.py +4 -5
- flyte/cli/_params.py +18 -4
- flyte/cli/_run.py +2 -2
- flyte/config/_config.py +2 -2
- flyte/config/_reader.py +14 -8
- flyte/errors.py +3 -1
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +17 -0
- flyte/io/_dataframe/basic_dfs.py +16 -7
- flyte/io/_dataframe/dataframe.py +84 -123
- flyte/io/_dir.py +35 -4
- flyte/io/_file.py +61 -15
- flyte/io/_hashing_io.py +342 -0
- flyte/models.py +12 -4
- flyte/remote/_action.py +4 -2
- flyte/remote/_task.py +52 -22
- flyte/report/_report.py +1 -1
- flyte/storage/_storage.py +16 -1
- flyte/types/_type_engine.py +1 -51
- {flyte-2.0.0b17.data → flyte-2.0.0b19.data}/scripts/runtime.py +3 -0
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/METADATA +1 -1
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/RECORD +52 -49
- {flyte-2.0.0b17.data → flyte-2.0.0b19.data}/scripts/debug.py +0 -0
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/WHEEL +0 -0
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/entry_points.txt +0 -0
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/licenses/LICENSE +0 -0
- {flyte-2.0.0b17.dist-info → flyte-2.0.0b19.dist-info}/top_level.txt +0 -0
flyte/config/_config.py
CHANGED
@@ -192,7 +192,7 @@ class Config(object):
         )
 
     @classmethod
-    def auto(cls, config_file: typing.Union[str, ConfigFile, None] = None) -> "Config":
+    def auto(cls, config_file: typing.Union[str, pathlib.Path, ConfigFile, None] = None) -> "Config":
         """
         Automatically constructs the Config Object. The order of precedence is as follows
         1. first try to find any env vars that match the config vars specified in the FLYTE_CONFIG format.
@@ -225,7 +225,7 @@ def set_if_exists(d: dict, k: str, val: typing.Any) -> dict:
     return d
 
 
-def auto(config_file: typing.Union[str, ConfigFile, None] = None) -> Config:
+def auto(config_file: typing.Union[str, pathlib.Path, ConfigFile, None] = None) -> Config:
     """
     Automatically constructs the Config Object. The order of precedence is as follows
     1. If specified, read the config from the provided file path.
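Both `Config.auto` and the module-level `auto` now accept a `pathlib.Path` in addition to `str` and `ConfigFile`. A minimal sketch of the new call shape; the config location used below is illustrative:

```python
import pathlib

import flyte.config

# pathlib.Path arguments are now accepted alongside str and ConfigFile.
cfg = flyte.config.auto(pathlib.Path.home() / ".flyte" / "config.yaml")
```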
flyte/config/_reader.py
CHANGED
@@ -108,7 +108,7 @@ class ConfigFile(object):
         return pathlib.Path(self._location)
 
     @staticmethod
-    def _read_yaml_config(location: str) -> typing.Optional[typing.Dict[str, typing.Any]]:
+    def _read_yaml_config(location: str | pathlib.Path) -> typing.Optional[typing.Dict[str, typing.Any]]:
         with open(location, "r") as fh:
             try:
                 yaml_contents = yaml.safe_load(fh)
@@ -139,16 +139,22 @@ def resolve_config_path() -> pathlib.Path | None:
     """
     Config is read from the following locations in order of precedence:
     1. ./config.yaml if it exists
-    2. `UCTL_CONFIG` environment variable
-    3. `FLYTECTL_CONFIG` environment variable
-    4. ~/.union/config.yaml if it exists
-    5. ~/.flyte/config.yaml if it exists
+    2. ./.flyte/config.yaml if it exists
+    3. `UCTL_CONFIG` environment variable
+    4. `FLYTECTL_CONFIG` environment variable
+    5. ~/.union/config.yaml if it exists
+    6. ~/.flyte/config.yaml if it exists
     """
     current_location_config = Path("config.yaml")
     if current_location_config.exists():
         return current_location_config
     logger.debug("No ./config.yaml found")
 
+    dot_flyte_config = Path(".flyte", "config.yaml")
+    if dot_flyte_config.exists():
+        return dot_flyte_config
+    logger.debug("No ./.flyte/config.yaml found")
+
     uctl_path_from_env = getenv(UCTL_CONFIG_ENV_VAR, None)
     if uctl_path_from_env:
         return pathlib.Path(uctl_path_from_env)
@@ -173,13 +179,13 @@ def resolve_config_path() -> pathlib.Path | None:
 
 
 @lru_cache
-def get_config_file(c: typing.Union[str, ConfigFile, None]) -> ConfigFile | None:
+def get_config_file(c: typing.Union[str, pathlib.Path, ConfigFile, None]) -> ConfigFile | None:
     """
     Checks if the given argument is a file or a configFile and returns a loaded configFile else returns None
     """
-    if isinstance(c, str):
+    if isinstance(c, (str, pathlib.Path)):
         logger.debug(f"Using specified config file at {c}")
-        return ConfigFile(c)
+        return ConfigFile(str(c))
     elif isinstance(c, ConfigFile):
         return c
     config_path = resolve_config_path()
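With the extended resolution order, a repository-local `./.flyte/config.yaml` is now discovered automatically before the `UCTL_CONFIG`/`FLYTECTL_CONFIG` environment variables and the home-directory fallbacks. A minimal sketch; the directory layout is assumed:

```python
import flyte.config

# Run from a checkout that contains .flyte/config.yaml: resolve_config_path()
# returns that file before consulting UCTL_CONFIG / FLYTECTL_CONFIG or the
# ~/.union/config.yaml and ~/.flyte/config.yaml fallbacks.
cfg = flyte.config.auto()
```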
flyte/errors.py
CHANGED
@@ -132,7 +132,9 @@ class CustomError(RuntimeUserError):
         Create a CustomError from an exception. The exception's class name is used as the error code and the exception
         message is used as the error message.
         """
-        return cls(e.__class__.__name__, str(e))
+        new_exc = cls(e.__class__.__name__, str(e))
+        new_exc.__cause__ = e
+        return new_exc
 
 
 class NotInTaskContextError(RuntimeUserError):
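The classmethod now chains the original exception via `__cause__`, so the wrapped error keeps its originating traceback. A sketch of the effect; the hunk only shows the method's docstring, so the name `from_exception` below is an assumption:

```python
from flyte.errors import CustomError

try:
    {}["missing"]
except KeyError as e:
    err = CustomError.from_exception(e)  # assumed name of the classmethod shown above
    # New in b19: the original exception is attached, so tracebacks show the root cause.
    assert err.__cause__ is e
```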
flyte/git/__init__.py
ADDED
flyte/git/_config.py
ADDED
@@ -0,0 +1,17 @@
+import pathlib
+import subprocess
+
+import flyte.config
+
+
+def config_from_root(path: pathlib.Path | str = ".flyte/config.yaml") -> flyte.config.Config:
+    """Get the config file from the git root directory.
+
+    By default, the config file is expected to be in `.flyte/config.yaml` in the git root directory.
+    """
+
+    result = subprocess.run(["git", "rev-parse", "--show-toplevel"], check=False, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Failed to get git root directory: {result.stderr}")
+    root = pathlib.Path(result.stdout.strip())
+    return flyte.config.auto(root / path)
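The new `flyte.git` module resolves the repository root with `git rev-parse --show-toplevel` and loads a config relative to it. A short usage sketch, assuming `flyte/git/__init__.py` (three added lines, not shown above) re-exports `config_from_root`:

```python
import pathlib

from flyte.git import config_from_root  # assumed re-export from the new package

# Loads <repo root>/.flyte/config.yaml regardless of the current working directory.
cfg = config_from_root()

# Any path relative to the repo root also works.
staging_cfg = config_from_root(pathlib.Path("configs/staging.yaml"))
```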
flyte/io/_dataframe/basic_dfs.py
CHANGED
@@ -58,16 +58,16 @@ class PandasToCSVEncodingHandler(DataFrameEncoder):
 
         if not storage.is_remote(uri):
             Path(uri).mkdir(parents=True, exist_ok=True)
-
+        csv_file = storage.join(uri, "data.csv")
         df = typing.cast(pd.DataFrame, dataframe.val)
         df.to_csv(
-            uri,
+            csv_file,
             index=False,
-            storage_options=get_pandas_storage_options(uri=uri),
+            storage_options=get_pandas_storage_options(uri=csv_file),
         )
         structured_dataset_type.format = CSV
         return literals_pb2.StructuredDataset(
-            uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type)
+            uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
         )
 
 
@@ -83,16 +83,25 @@ class CSVToPandasDecodingHandler(DataFrameDecoder):
         uri = proto_value.uri
         columns = None
         kwargs = get_pandas_storage_options(uri=uri)
-
+        csv_file = storage.join(uri, "data.csv")
         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
         try:
-            return pd.read_csv(uri, usecols=columns, storage_options=kwargs)
+            import io
+
+            # The pattern used here is a bit wonky because of obstore issues with csv, getting early eof error.
+            buf = io.BytesIO()
+            async for chunk in storage.get_stream(csv_file):
+                buf.write(chunk)
+            buf.seek(0)
+            df = pd.read_csv(buf)
+            return df
+
         except Exception as exc:
             if exc.__class__.__name__ == "NoCredentialsError":
                 logger.debug("S3 source detected, attempting anonymous S3 access")
                 kwargs = get_pandas_storage_options(uri=uri, anonymous=True)
-                return pd.read_csv(uri, usecols=columns, storage_options=kwargs)
+                return pd.read_csv(csv_file, usecols=columns, storage_options=kwargs)
             else:
                 raise
 
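The decoder now streams the CSV into memory instead of handing the URI to pandas directly, working around an early-EOF issue seen with obstore. A standalone sketch of the same pattern, assuming `flyte.storage.get_stream` yields byte chunks as it does in the hunk above:

```python
import io

import pandas as pd

import flyte.storage as storage


async def read_remote_csv(csv_file: str) -> pd.DataFrame:
    # Buffer the remote object fully, then let pandas parse the in-memory bytes.
    buf = io.BytesIO()
    async for chunk in storage.get_stream(csv_file):
        buf.write(chunk)
    buf.seek(0)
    return pd.read_csv(buf)
```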
flyte/io/_dataframe/dataframe.py
CHANGED
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
 import _datetime
-import asyncio
 import collections
 import types
 import typing
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from dataclasses import is_dataclass
 from typing import Any, ClassVar, Coroutine, Dict, Generic, List, Optional, Type, Union
 
-import msgpack
 from flyteidl.core import literals_pb2, types_pb2
 from fsspec.utils import get_protocol
-from mashumaro.mixins.json import DataClassJSONMixin
 from mashumaro.types import SerializableType
-from pydantic import model_serializer, model_validator
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_serializer, model_validator
 from typing_extensions import Annotated, TypeAlias, get_args, get_origin
 
 import flyte.storage as storage
@@ -48,15 +45,23 @@ GENERIC_FORMAT: DataFrameFormat = ""
 GENERIC_PROTOCOL: str = "generic protocol"
 
 
-@dataclass
-class DataFrame(SerializableType, DataClassJSONMixin):
+class DataFrame(BaseModel, SerializableType):
     """
     This is the user facing DataFrame class. Please don't confuse it with the literals.StructuredDataset
     class (that is just a model, a Python class representation of the protobuf).
     """
 
-    uri: typing.Optional[str] = None
-    file_format: typing.Optional[str] = GENERIC_FORMAT
+    uri: typing.Optional[str] = Field(default=None)
+    format: typing.Optional[str] = Field(default=GENERIC_FORMAT)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # Private attributes that are not part of the Pydantic model schema
+    _raw_df: typing.Optional[typing.Any] = PrivateAttr(default=None)
+    _metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = PrivateAttr(default=None)
+    _literal_sd: Optional[literals_pb2.StructuredDataset] = PrivateAttr(default=None)
+    _dataframe_type: Optional[Type[Any]] = PrivateAttr(default=None)
+    _already_uploaded: bool = PrivateAttr(default=False)
 
     # loop manager is working better than synchronicity for some reason, was getting an error but may be an easy fix
     def _serialize(self) -> Dict[str, Optional[str]]:
@@ -65,16 +70,16 @@ class DataFrame(SerializableType, DataClassJSONMixin):
         engine = DataFrameTransformerEngine()
         lv = loop_manager.run_sync(engine.to_literal, self, type(self), lt)
         sd = DataFrame(uri=lv.scalar.structured_dataset.uri)
-        sd.file_format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
+        sd.format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
         return {
             "uri": sd.uri,
-            "file_format": sd.file_format,
+            "format": sd.format,
         }
 
     @classmethod
-    def _deserialize(cls, value) -> "DataFrame":
+    def _deserialize(cls, value) -> DataFrame:
         uri = value.get("uri", None)
-        file_format = value.get("file_format", None)
+        format_val = value.get("format", None)
 
         if uri is None:
             raise ValueError("DataFrame's uri and file format should not be None")
@@ -86,7 +91,7 @@ class DataFrame(SerializableType, DataClassJSONMixin):
             scalar=literals_pb2.Scalar(
                 structured_dataset=literals_pb2.StructuredDataset(
                     metadata=literals_pb2.StructuredDatasetMetadata(
-                        structured_dataset_type=types_pb2.StructuredDatasetType(format=file_format)
+                        structured_dataset_type=types_pb2.StructuredDatasetType(format=format_val)
                     ),
                     uri=uri,
                 )
@@ -102,7 +107,7 @@ class DataFrame(SerializableType, DataClassJSONMixin):
         lv = loop_manager.run_sync(sde.to_literal, self, type(self), lt)
         return {
             "uri": lv.scalar.structured_dataset.uri,
-            "file_format": lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
+            "format": lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
         }
 
     @model_validator(mode="after")
@@ -117,7 +122,7 @@ class DataFrame(SerializableType, DataClassJSONMixin):
             scalar=literals_pb2.Scalar(
                 structured_dataset=literals_pb2.StructuredDataset(
                     metadata=literals_pb2.StructuredDatasetMetadata(
-                        structured_dataset_type=types_pb2.StructuredDatasetType(format=self.file_format)
+                        structured_dataset_type=types_pb2.StructuredDatasetType(format=self.format)
                     ),
                     uri=self.uri,
                 )
@@ -134,30 +139,46 @@ class DataFrame(SerializableType, DataClassJSONMixin):
     def column_names(cls) -> typing.List[str]:
         return [k for k, v in cls.columns().items()]
 
-    def __init__(
-        self,
+    @classmethod
+    def from_df(
+        cls,
         val: typing.Optional[typing.Any] = None,
         uri: typing.Optional[str] = None,
-        file_format: typing.Optional[str] = GENERIC_FORMAT,
+    ) -> DataFrame:
+        """
+        Wrapper to create a DataFrame from a dataframe.
+        The reason this is implemented as a wrapper instead of a full translation invoking
+        the type engine and the encoders is because there's too much information in the type
+        signature of the task that we don't want the user to have to replicate.
+        """
+        instance = cls(uri=uri)
+        instance._raw_df = val
+        return instance
+
+    @classmethod
+    def from_existing_remote(
+        cls,
+        remote_path: str,
+        format: typing.Optional[str] = None,
         **kwargs,
-    ):
-        ...  # old __init__ body not visible in this render
+    ) -> "DataFrame":
+        """
+        Create a DataFrame reference from an existing remote dataframe.
+
+        Args:
+            remote_path: The remote path to the existing dataframe
+            format: Format of the stored dataframe
+
+        Example:
+        ```python
+        df = DataFrame.from_existing_remote("s3://bucket/data.parquet", format="parquet")
+        ```
+        """
+        return cls(uri=remote_path, format=format or GENERIC_FORMAT, **kwargs)
 
     @property
     def val(self) -> Optional[DF]:
-        return self.
+        return self._raw_df
 
     @property
     def metadata(self) -> Optional[literals_pb2.StructuredDatasetMetadata]:
@@ -201,7 +222,7 @@ class DataFrame(SerializableType, DataClassJSONMixin):
 
         @task
         def return_df() -> DataFrame:
-            df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
+            df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", format="parquet")
             df = df.open(pd.DataFrame).all()
             return df
 
@@ -244,6 +265,9 @@ def flatten_dict(sub_dict: dict, parent_key: str = "") -> typing.Dict:
             fields = getattr(value, "__dataclass_fields__")
             d = {k: v.type for k, v in fields.items()}
             result.update(flatten_dict(sub_dict=d, parent_key=current_key))
+        elif hasattr(value, "model_fields"):  # Pydantic model
+            d = {k: v.annotation for k, v in value.model_fields.items()}
+            result.update(flatten_dict(sub_dict=d, parent_key=current_key))
         else:
             result[current_key] = value
     return result
@@ -708,16 +732,16 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
         # return DataFrame(uri=uri)
         if python_val.val is None:
             uri = python_val.uri
-            file_format = python_val.file_format
+            format_val = python_val.format
 
             # Check the user-specified uri
             if not uri:
                 raise ValueError(f"If dataframe is not specified, then the uri should be specified. {python_val}")
             if not storage.is_remote(uri):
-                uri = await storage.put(uri)
+                uri = await storage.put(uri, recursive=True)
 
-            # Check the user-specified file_format
-            # When users specify file_format for a DataFrame, the file_format should be retained
+            # Check the user-specified format
+            # When users specify format for a DataFrame, the format should be retained
             # conditionally. For details, please refer to https://github.com/flyteorg/flyte/issues/6096.
             # Following illustrates why we can't always copy the user-specified file_format over:
             #
@@ -725,14 +749,14 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
             # def modify_format(df: Annotated[DataFrame, {}, "task-format"]) -> DataFrame:
             #     return df
             #
-            # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
+            # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", format="user-format")
             # df2 = modify_format(df=df)
             #
-            # In this case, we expect the df2.file_format to be task-format (as shown in Annotated),
-            # not user-format. If we directly copy the user-specified file_format over,
+            # In this case, we expect the df2.format to be task-format (as shown in Annotated),
+            # not user-format. If we directly copy the user-specified format over,
             # the type hint information will be missing.
-            if sdt.format == GENERIC_FORMAT and file_format != GENERIC_FORMAT:
-                sdt.format = file_format
+            if sdt.format == GENERIC_FORMAT and format_val != GENERIC_FORMAT:
+                sdt.format = format_val
 
             sd_model = literals_pb2.StructuredDataset(
                 uri=uri,
@@ -760,8 +784,9 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
                 structured_dataset_type=expected.structured_dataset_type if expected else None
             )
 
-        ...  # old wrapping/encode lines not visible in this render
+        fdf = DataFrame.from_df(val=python_val)
+        fdf._metadata = meta
+        return await self.encode(fdf, python_type, protocol, fmt, sdt)
 
     def _protocol_from_type_or_prefix(self, df_type: Type, uri: Optional[str] = None) -> str:
         """
@@ -782,7 +807,7 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
 
     async def encode(
         self,
-        sd: DataFrame,
+        df: DataFrame,
         df_type: Type,
         protocol: str,
         format: str,
@@ -791,7 +816,7 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
         handler: DataFrameEncoder
         handler = self.get_encoder(df_type, protocol, format)
 
-        sd_model = await handler.encode(sd, structured_literal_type)
+        sd_model = await handler.encode(df, structured_literal_type)
         # This block is here in case the encoder did not set the type information in the metadata. Since this literal
         # is special in that it carries around the type itself, we want to make sure the type info therein is at
         # least as good as the type of the interface.
@@ -807,72 +832,13 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
         lit = literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))
 
         # Because the handler.encode may have uploaded something, and because the sd may end up living inside a
-        # dataclass, we need to modify any uploaded flyte:// urls here.
-        modify_literal_uris(lit)
-        sd._literal_sd = sd_model
-        sd._already_uploaded = True
+        # dataclass, we need to modify any uploaded flyte:// urls here. Needed here even though the Type engine
+        # already does this because the DataframeTransformerEngine may be called directly.
+        modify_literal_uris(lit)
+        df._literal_sd = sd_model
+        df._already_uploaded = True
         return lit
 
-    # pr: han-ru: can this be removed if we make DataFrame a pydantic model?
-    def dict_to_dataframe(
-        self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | DataFrame
-    ) -> T | DataFrame:
-        uri = dict_obj.get("uri", None)
-        file_format = dict_obj.get("file_format", None)
-
-        if uri is None:
-            raise ValueError("DataFrame's uri and file format should not be None")
-
-        # Instead of using python native DataFrame, we need to build a literals.StructuredDataset
-        # The reason is that _literal_sd of python sd is accessed when task output LiteralMap is
-        # converted back to flyteidl. Hence, _literal_sd must have to_flyte_idl method
-        # See https://github.com/flyteorg/flytekit/blob/f938661ff8413219d1bea77f6914a58c302d5c6c/flytekit/bin/entrypoint.py#L326
-        # For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
-        sdt = types_pb2.StructuredDatasetType(format=file_format)
-        metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=sdt)
-        sd_literal = literals_pb2.StructuredDataset(uri=uri, metadata=metad)
-
-        return asyncio.run(
-            DataFrameTransformerEngine().to_python_value(
-                literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_literal)),
-                expected_python_type,
-            )
-        )
-
-    def from_binary_idl(
-        self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | DataFrame
-    ) -> T | DataFrame:
-        """
-        If the input is from flytekit, the Life Cycle will be as follows:
-
-        Life Cycle:
-        binary IDL -> resolved binary -> bytes -> expected Python object
-        (flytekit customized   (propeller processing)   (flytekit binary IDL)   (flytekit customized
-        serialization)                                                           deserialization)
-
-        Example Code:
-        @dataclass
-        class DC:
-            sd: StructuredDataset
-
-        @workflow
-        def wf(dc: DC):
-            t_sd(dc.sd)
-
-        Note:
-        - The deserialization is the same as put a structured dataset in a dataclass,
-        which will deserialize by the mashumaro's API.
-
-        Related PR:
-        - Title: Override Dataclass Serialization/Deserialization Behavior for FlyteTypes via Mashumaro
-        - Link: https://github.com/flyteorg/flytekit/pull/2554
-        """
-        if binary_idl_object.tag == MESSAGEPACK:
-            python_val = msgpack.loads(binary_idl_object.value)
-            return self.dict_to_dataframe(dict_obj=python_val, expected_python_type=expected_python_type)
-        else:
-            raise TypeTransformerFailedError(f"Unsupported binary format: `{binary_idl_object.tag}`")
-
     async def to_python_value(
         self, lv: literals_pb2.Literal, expected_python_type: Type[T] | DataFrame
     ) -> T | DataFrame:
@@ -906,9 +872,8 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
         | | the running task's signature. | |
         +-----------------------------+-----------------------------------------+--------------------------------------+
         """
-        # Handle dataclass attribute access
         if lv.HasField("scalar") and lv.scalar.HasField("binary"):
-            return self.from_binary_idl(lv.scalar.binary, expected_python_type)
+            raise TypeTransformerFailedError("Attribute access unsupported.")
 
         # Detect annotations and extract out all the relevant information that the user might supply
         expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)
@@ -939,16 +904,12 @@ class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
        # t1(input_a: DataFrame) # or
        # t1(input_a: Annotated[DataFrame, my_cols])
        if issubclass(expected_python_type, DataFrame):
-            sd = DataFrame(
-                ...  # old constructor arguments not visible in this render
-            )
-            sd._literal_sd = lv.scalar.structured_dataset
-            sd.file_format = metad.structured_dataset_type.format
-            return sd
+            fdf = DataFrame(format=metad.structured_dataset_type.format)
+            fdf._literal_sd = lv.scalar.structured_dataset
+            fdf._metadata = metad
+            return fdf
 
-        # If the requested type was not a
+        # If the requested type was not a flyte.DataFrame, then it means it was a raw dataframe type, which means
        # we should do the opening/downloading and whatever else it might entail right now. No iteration option here.
        return await self.open_as(lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)
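With `DataFrame` now a Pydantic `BaseModel`, in-memory values are attached through the new constructors rather than keyword arguments. A brief sketch, assuming `DataFrame` is importable from `flyte.io`:

```python
import pandas as pd

from flyte.io import DataFrame  # assumed public import path

pdf = pd.DataFrame({"a": [1, 2, 3]})

# Wrap an in-memory dataframe; the type engine encodes and uploads it later.
wrapped = DataFrame.from_df(pdf)

# Reference data that already lives in blob storage without downloading it.
ref = DataFrame.from_existing_remote("s3://bucket/data.parquet", format="parquet")
```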
flyte/io/_dir.py
CHANGED
@@ -48,6 +48,7 @@ class Dir(BaseModel, Generic[T], SerializableType):
     path: str
     name: Optional[str] = None
     format: str = ""
+    hash: Optional[str] = None
 
     class Config:
         arbitrary_types_allowed = True
@@ -248,13 +249,20 @@ class Dir(BaseModel, Generic[T], SerializableType):
             raise NotImplementedError("Sync download is not implemented for remote paths")
 
     @classmethod
-    async def from_local(cls, local_path: Union[str, Path], remote_path: Optional[str] = None) -> Dir[T]:
+    async def from_local(
+        cls,
+        local_path: Union[str, Path],
+        remote_path: Optional[str] = None,
+        dir_cache_key: Optional[str] = None,
+    ) -> Dir[T]:
         """
         Asynchronously create a new Dir by uploading a local directory to the configured remote store.
 
         Args:
            local_path: Path to the local directory
            remote_path: Optional path to store the directory remotely. If None, a path will be generated.
+           dir_cache_key: If you have a precomputed hash value you want to use when computing cache keys for
+               discoverable tasks that this File is an input to.
 
         Returns:
            A new Dir instance pointing to the uploaded directory
@@ -262,13 +270,34 @@ class Dir(BaseModel, Generic[T], SerializableType):
         Example:
         ```python
         remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/')
+        # With a known hash value you want to use for cache key calculation
+        remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/', dir_cache_key='abc123')
         ```
         """
         local_path_str = str(local_path)
         dirname = os.path.basename(os.path.normpath(local_path_str))
 
         output_path = await storage.put(from_path=local_path_str, to_path=remote_path, recursive=True)
-        return cls(path=output_path, name=dirname)
+        return cls(path=output_path, name=dirname, hash=dir_cache_key)
+
+    @classmethod
+    def from_existing_remote(cls, remote_path: str, dir_cache_key: Optional[str] = None) -> Dir[T]:
+        """
+        Create a Dir reference from an existing remote directory.
+
+        Args:
+            remote_path: The remote path to the existing directory
+            dir_cache_key: Optional hash value to use for cache key computation. If not specified,
+                the cache key will be computed based on this object's attributes.
+
+        Example:
+        ```python
+        remote_dir = Dir.from_existing_remote("s3://bucket/data/")
+        # With a known hash
+        remote_dir = Dir.from_existing_remote("s3://bucket/data/", dir_cache_key="abc123")
+        ```
+        """
+        return cls(path=remote_path, hash=dir_cache_key)
 
     @classmethod
     def from_local_sync(cls, local_path: Union[str, Path], remote_path: Optional[str] = None) -> Dir[T]:
@@ -414,7 +443,8 @@ class DirTransformer(TypeTransformer[Dir]):
                     ),
                     uri=python_val.path,
                 )
-            )
+            ),
+            hash=python_val.hash if python_val.hash else None,
         )
 
     async def to_python_value(
@@ -432,7 +462,8 @@ class DirTransformer(TypeTransformer[Dir]):
 
         uri = lv.scalar.blob.uri
         filename = Path(uri).name
-        f: Dir = Dir(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format)
+        hash_value = lv.hash if lv.hash else None
+        f: Dir = Dir(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format, hash=hash_value)
         return f
 
     def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[Dir]:
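`Dir` gains a `hash` field plus `dir_cache_key` hooks so cache keys can reuse a precomputed digest instead of being derived from the object's attributes. A usage sketch, assuming `Dir` is importable from `flyte.io`:

```python
import asyncio

from flyte.io import Dir  # assumed public import path


async def main() -> None:
    # Upload a local directory and attach a precomputed hash for cache-key computation.
    uploaded = await Dir.from_local("/tmp/data_dir/", "s3://bucket/data/", dir_cache_key="abc123")

    # Point at an existing remote directory without re-uploading it.
    existing = Dir.from_existing_remote("s3://bucket/data/", dir_cache_key="abc123")
    print(uploaded.hash, existing.path)


asyncio.run(main())
```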