flyte 0.0.1b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic. Click here for more details.
- flyte/__init__.py +62 -0
- flyte/_api_commons.py +3 -0
- flyte/_bin/__init__.py +0 -0
- flyte/_bin/runtime.py +126 -0
- flyte/_build.py +25 -0
- flyte/_cache/__init__.py +12 -0
- flyte/_cache/cache.py +146 -0
- flyte/_cache/defaults.py +9 -0
- flyte/_cache/policy_function_body.py +42 -0
- flyte/_cli/__init__.py +0 -0
- flyte/_cli/_common.py +287 -0
- flyte/_cli/_create.py +42 -0
- flyte/_cli/_delete.py +23 -0
- flyte/_cli/_deploy.py +140 -0
- flyte/_cli/_get.py +235 -0
- flyte/_cli/_run.py +152 -0
- flyte/_cli/main.py +72 -0
- flyte/_code_bundle/__init__.py +8 -0
- flyte/_code_bundle/_ignore.py +113 -0
- flyte/_code_bundle/_packaging.py +187 -0
- flyte/_code_bundle/_utils.py +339 -0
- flyte/_code_bundle/bundle.py +178 -0
- flyte/_context.py +146 -0
- flyte/_datastructures.py +342 -0
- flyte/_deploy.py +202 -0
- flyte/_doc.py +29 -0
- flyte/_docstring.py +32 -0
- flyte/_environment.py +43 -0
- flyte/_group.py +31 -0
- flyte/_hash.py +23 -0
- flyte/_image.py +760 -0
- flyte/_initialize.py +634 -0
- flyte/_interface.py +84 -0
- flyte/_internal/__init__.py +3 -0
- flyte/_internal/controllers/__init__.py +115 -0
- flyte/_internal/controllers/_local_controller.py +118 -0
- flyte/_internal/controllers/_trace.py +40 -0
- flyte/_internal/controllers/pbhash.py +39 -0
- flyte/_internal/controllers/remote/__init__.py +40 -0
- flyte/_internal/controllers/remote/_action.py +141 -0
- flyte/_internal/controllers/remote/_client.py +43 -0
- flyte/_internal/controllers/remote/_controller.py +361 -0
- flyte/_internal/controllers/remote/_core.py +402 -0
- flyte/_internal/controllers/remote/_informer.py +361 -0
- flyte/_internal/controllers/remote/_service_protocol.py +50 -0
- flyte/_internal/imagebuild/__init__.py +11 -0
- flyte/_internal/imagebuild/docker_builder.py +416 -0
- flyte/_internal/imagebuild/image_builder.py +241 -0
- flyte/_internal/imagebuild/remote_builder.py +0 -0
- flyte/_internal/resolvers/__init__.py +0 -0
- flyte/_internal/resolvers/_task_module.py +54 -0
- flyte/_internal/resolvers/common.py +31 -0
- flyte/_internal/resolvers/default.py +28 -0
- flyte/_internal/runtime/__init__.py +0 -0
- flyte/_internal/runtime/convert.py +199 -0
- flyte/_internal/runtime/entrypoints.py +135 -0
- flyte/_internal/runtime/io.py +136 -0
- flyte/_internal/runtime/resources_serde.py +138 -0
- flyte/_internal/runtime/task_serde.py +210 -0
- flyte/_internal/runtime/taskrunner.py +190 -0
- flyte/_internal/runtime/types_serde.py +54 -0
- flyte/_logging.py +124 -0
- flyte/_protos/__init__.py +0 -0
- flyte/_protos/common/authorization_pb2.py +66 -0
- flyte/_protos/common/authorization_pb2.pyi +108 -0
- flyte/_protos/common/authorization_pb2_grpc.py +4 -0
- flyte/_protos/common/identifier_pb2.py +71 -0
- flyte/_protos/common/identifier_pb2.pyi +82 -0
- flyte/_protos/common/identifier_pb2_grpc.py +4 -0
- flyte/_protos/common/identity_pb2.py +48 -0
- flyte/_protos/common/identity_pb2.pyi +72 -0
- flyte/_protos/common/identity_pb2_grpc.py +4 -0
- flyte/_protos/common/list_pb2.py +36 -0
- flyte/_protos/common/list_pb2.pyi +69 -0
- flyte/_protos/common/list_pb2_grpc.py +4 -0
- flyte/_protos/common/policy_pb2.py +37 -0
- flyte/_protos/common/policy_pb2.pyi +27 -0
- flyte/_protos/common/policy_pb2_grpc.py +4 -0
- flyte/_protos/common/role_pb2.py +37 -0
- flyte/_protos/common/role_pb2.pyi +53 -0
- flyte/_protos/common/role_pb2_grpc.py +4 -0
- flyte/_protos/common/runtime_version_pb2.py +28 -0
- flyte/_protos/common/runtime_version_pb2.pyi +24 -0
- flyte/_protos/common/runtime_version_pb2_grpc.py +4 -0
- flyte/_protos/logs/dataplane/payload_pb2.py +96 -0
- flyte/_protos/logs/dataplane/payload_pb2.pyi +168 -0
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +4 -0
- flyte/_protos/secret/definition_pb2.py +49 -0
- flyte/_protos/secret/definition_pb2.pyi +93 -0
- flyte/_protos/secret/definition_pb2_grpc.py +4 -0
- flyte/_protos/secret/payload_pb2.py +62 -0
- flyte/_protos/secret/payload_pb2.pyi +94 -0
- flyte/_protos/secret/payload_pb2_grpc.py +4 -0
- flyte/_protos/secret/secret_pb2.py +38 -0
- flyte/_protos/secret/secret_pb2.pyi +6 -0
- flyte/_protos/secret/secret_pb2_grpc.py +198 -0
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +198 -0
- flyte/_protos/validate/validate/validate_pb2.py +76 -0
- flyte/_protos/workflow/node_execution_service_pb2.py +26 -0
- flyte/_protos/workflow/node_execution_service_pb2.pyi +4 -0
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +32 -0
- flyte/_protos/workflow/queue_service_pb2.py +106 -0
- flyte/_protos/workflow/queue_service_pb2.pyi +141 -0
- flyte/_protos/workflow/queue_service_pb2_grpc.py +172 -0
- flyte/_protos/workflow/run_definition_pb2.py +128 -0
- flyte/_protos/workflow/run_definition_pb2.pyi +310 -0
- flyte/_protos/workflow/run_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/run_logs_service_pb2.py +41 -0
- flyte/_protos/workflow/run_logs_service_pb2.pyi +28 -0
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +69 -0
- flyte/_protos/workflow/run_service_pb2.py +133 -0
- flyte/_protos/workflow/run_service_pb2.pyi +175 -0
- flyte/_protos/workflow/run_service_pb2_grpc.py +412 -0
- flyte/_protos/workflow/state_service_pb2.py +58 -0
- flyte/_protos/workflow/state_service_pb2.pyi +71 -0
- flyte/_protos/workflow/state_service_pb2_grpc.py +138 -0
- flyte/_protos/workflow/task_definition_pb2.py +72 -0
- flyte/_protos/workflow/task_definition_pb2.pyi +65 -0
- flyte/_protos/workflow/task_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/task_service_pb2.py +44 -0
- flyte/_protos/workflow/task_service_pb2.pyi +31 -0
- flyte/_protos/workflow/task_service_pb2_grpc.py +104 -0
- flyte/_resources.py +226 -0
- flyte/_retry.py +32 -0
- flyte/_reusable_environment.py +25 -0
- flyte/_run.py +411 -0
- flyte/_secret.py +61 -0
- flyte/_task.py +367 -0
- flyte/_task_environment.py +200 -0
- flyte/_timeout.py +47 -0
- flyte/_tools.py +27 -0
- flyte/_trace.py +128 -0
- flyte/_utils/__init__.py +20 -0
- flyte/_utils/asyn.py +119 -0
- flyte/_utils/coro_management.py +25 -0
- flyte/_utils/file_handling.py +72 -0
- flyte/_utils/helpers.py +108 -0
- flyte/_utils/lazy_module.py +54 -0
- flyte/_utils/uv_script_parser.py +49 -0
- flyte/_version.py +21 -0
- flyte/connectors/__init__.py +0 -0
- flyte/errors.py +143 -0
- flyte/extras/__init__.py +5 -0
- flyte/extras/_container.py +273 -0
- flyte/io/__init__.py +11 -0
- flyte/io/_dataframe.py +0 -0
- flyte/io/_dir.py +448 -0
- flyte/io/_file.py +468 -0
- flyte/io/pickle/__init__.py +0 -0
- flyte/io/pickle/transformer.py +117 -0
- flyte/io/structured_dataset/__init__.py +129 -0
- flyte/io/structured_dataset/basic_dfs.py +219 -0
- flyte/io/structured_dataset/structured_dataset.py +1061 -0
- flyte/py.typed +0 -0
- flyte/remote/__init__.py +25 -0
- flyte/remote/_client/__init__.py +0 -0
- flyte/remote/_client/_protocols.py +131 -0
- flyte/remote/_client/auth/__init__.py +12 -0
- flyte/remote/_client/auth/_authenticators/__init__.py +0 -0
- flyte/remote/_client/auth/_authenticators/base.py +397 -0
- flyte/remote/_client/auth/_authenticators/client_credentials.py +73 -0
- flyte/remote/_client/auth/_authenticators/device_code.py +118 -0
- flyte/remote/_client/auth/_authenticators/external_command.py +79 -0
- flyte/remote/_client/auth/_authenticators/factory.py +200 -0
- flyte/remote/_client/auth/_authenticators/pkce.py +516 -0
- flyte/remote/_client/auth/_channel.py +184 -0
- flyte/remote/_client/auth/_client_config.py +83 -0
- flyte/remote/_client/auth/_default_html.py +32 -0
- flyte/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- flyte/remote/_client/auth/_grpc_utils/auth_interceptor.py +288 -0
- flyte/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +151 -0
- flyte/remote/_client/auth/_keyring.py +143 -0
- flyte/remote/_client/auth/_token_client.py +260 -0
- flyte/remote/_client/auth/errors.py +16 -0
- flyte/remote/_client/controlplane.py +95 -0
- flyte/remote/_console.py +18 -0
- flyte/remote/_data.py +155 -0
- flyte/remote/_logs.py +116 -0
- flyte/remote/_project.py +86 -0
- flyte/remote/_run.py +873 -0
- flyte/remote/_secret.py +132 -0
- flyte/remote/_task.py +227 -0
- flyte/report/__init__.py +3 -0
- flyte/report/_report.py +178 -0
- flyte/report/_template.html +124 -0
- flyte/storage/__init__.py +24 -0
- flyte/storage/_remote_fs.py +34 -0
- flyte/storage/_storage.py +251 -0
- flyte/storage/_utils.py +5 -0
- flyte/types/__init__.py +13 -0
- flyte/types/_interface.py +25 -0
- flyte/types/_renderer.py +162 -0
- flyte/types/_string_literals.py +120 -0
- flyte/types/_type_engine.py +2210 -0
- flyte/types/_utils.py +80 -0
- flyte-0.0.1b0.dist-info/METADATA +179 -0
- flyte-0.0.1b0.dist-info/RECORD +390 -0
- flyte-0.0.1b0.dist-info/WHEEL +5 -0
- flyte-0.0.1b0.dist-info/entry_points.txt +3 -0
- flyte-0.0.1b0.dist-info/top_level.txt +1 -0
- union/__init__.py +54 -0
- union/_api_commons.py +3 -0
- union/_bin/__init__.py +0 -0
- union/_bin/runtime.py +113 -0
- union/_build.py +25 -0
- union/_cache/__init__.py +12 -0
- union/_cache/cache.py +141 -0
- union/_cache/defaults.py +9 -0
- union/_cache/policy_function_body.py +42 -0
- union/_cli/__init__.py +0 -0
- union/_cli/_common.py +263 -0
- union/_cli/_create.py +40 -0
- union/_cli/_delete.py +23 -0
- union/_cli/_deploy.py +120 -0
- union/_cli/_get.py +162 -0
- union/_cli/_params.py +579 -0
- union/_cli/_run.py +150 -0
- union/_cli/main.py +72 -0
- union/_code_bundle/__init__.py +8 -0
- union/_code_bundle/_ignore.py +113 -0
- union/_code_bundle/_packaging.py +187 -0
- union/_code_bundle/_utils.py +342 -0
- union/_code_bundle/bundle.py +176 -0
- union/_context.py +146 -0
- union/_datastructures.py +295 -0
- union/_deploy.py +185 -0
- union/_doc.py +29 -0
- union/_docstring.py +26 -0
- union/_environment.py +43 -0
- union/_group.py +31 -0
- union/_hash.py +23 -0
- union/_image.py +760 -0
- union/_initialize.py +585 -0
- union/_interface.py +84 -0
- union/_internal/__init__.py +3 -0
- union/_internal/controllers/__init__.py +77 -0
- union/_internal/controllers/_local_controller.py +77 -0
- union/_internal/controllers/pbhash.py +39 -0
- union/_internal/controllers/remote/__init__.py +40 -0
- union/_internal/controllers/remote/_action.py +131 -0
- union/_internal/controllers/remote/_client.py +43 -0
- union/_internal/controllers/remote/_controller.py +169 -0
- union/_internal/controllers/remote/_core.py +341 -0
- union/_internal/controllers/remote/_informer.py +260 -0
- union/_internal/controllers/remote/_service_protocol.py +44 -0
- union/_internal/imagebuild/__init__.py +11 -0
- union/_internal/imagebuild/docker_builder.py +416 -0
- union/_internal/imagebuild/image_builder.py +243 -0
- union/_internal/imagebuild/remote_builder.py +0 -0
- union/_internal/resolvers/__init__.py +0 -0
- union/_internal/resolvers/_task_module.py +31 -0
- union/_internal/resolvers/common.py +24 -0
- union/_internal/resolvers/default.py +27 -0
- union/_internal/runtime/__init__.py +0 -0
- union/_internal/runtime/convert.py +163 -0
- union/_internal/runtime/entrypoints.py +121 -0
- union/_internal/runtime/io.py +136 -0
- union/_internal/runtime/resources_serde.py +134 -0
- union/_internal/runtime/task_serde.py +202 -0
- union/_internal/runtime/taskrunner.py +179 -0
- union/_internal/runtime/types_serde.py +53 -0
- union/_logging.py +124 -0
- union/_protos/__init__.py +0 -0
- union/_protos/common/authorization_pb2.py +66 -0
- union/_protos/common/authorization_pb2.pyi +106 -0
- union/_protos/common/authorization_pb2_grpc.py +4 -0
- union/_protos/common/identifier_pb2.py +71 -0
- union/_protos/common/identifier_pb2.pyi +82 -0
- union/_protos/common/identifier_pb2_grpc.py +4 -0
- union/_protos/common/identity_pb2.py +48 -0
- union/_protos/common/identity_pb2.pyi +72 -0
- union/_protos/common/identity_pb2_grpc.py +4 -0
- union/_protos/common/list_pb2.py +36 -0
- union/_protos/common/list_pb2.pyi +69 -0
- union/_protos/common/list_pb2_grpc.py +4 -0
- union/_protos/common/policy_pb2.py +37 -0
- union/_protos/common/policy_pb2.pyi +27 -0
- union/_protos/common/policy_pb2_grpc.py +4 -0
- union/_protos/common/role_pb2.py +37 -0
- union/_protos/common/role_pb2.pyi +51 -0
- union/_protos/common/role_pb2_grpc.py +4 -0
- union/_protos/common/runtime_version_pb2.py +28 -0
- union/_protos/common/runtime_version_pb2.pyi +24 -0
- union/_protos/common/runtime_version_pb2_grpc.py +4 -0
- union/_protos/logs/dataplane/payload_pb2.py +96 -0
- union/_protos/logs/dataplane/payload_pb2.pyi +168 -0
- union/_protos/logs/dataplane/payload_pb2_grpc.py +4 -0
- union/_protos/secret/definition_pb2.py +49 -0
- union/_protos/secret/definition_pb2.pyi +93 -0
- union/_protos/secret/definition_pb2_grpc.py +4 -0
- union/_protos/secret/payload_pb2.py +62 -0
- union/_protos/secret/payload_pb2.pyi +94 -0
- union/_protos/secret/payload_pb2_grpc.py +4 -0
- union/_protos/secret/secret_pb2.py +38 -0
- union/_protos/secret/secret_pb2.pyi +6 -0
- union/_protos/secret/secret_pb2_grpc.py +198 -0
- union/_protos/validate/validate/validate_pb2.py +76 -0
- union/_protos/workflow/node_execution_service_pb2.py +26 -0
- union/_protos/workflow/node_execution_service_pb2.pyi +4 -0
- union/_protos/workflow/node_execution_service_pb2_grpc.py +32 -0
- union/_protos/workflow/queue_service_pb2.py +75 -0
- union/_protos/workflow/queue_service_pb2.pyi +103 -0
- union/_protos/workflow/queue_service_pb2_grpc.py +172 -0
- union/_protos/workflow/run_definition_pb2.py +100 -0
- union/_protos/workflow/run_definition_pb2.pyi +256 -0
- union/_protos/workflow/run_definition_pb2_grpc.py +4 -0
- union/_protos/workflow/run_logs_service_pb2.py +41 -0
- union/_protos/workflow/run_logs_service_pb2.pyi +28 -0
- union/_protos/workflow/run_logs_service_pb2_grpc.py +69 -0
- union/_protos/workflow/run_service_pb2.py +133 -0
- union/_protos/workflow/run_service_pb2.pyi +173 -0
- union/_protos/workflow/run_service_pb2_grpc.py +412 -0
- union/_protos/workflow/state_service_pb2.py +58 -0
- union/_protos/workflow/state_service_pb2.pyi +69 -0
- union/_protos/workflow/state_service_pb2_grpc.py +138 -0
- union/_protos/workflow/task_definition_pb2.py +72 -0
- union/_protos/workflow/task_definition_pb2.pyi +65 -0
- union/_protos/workflow/task_definition_pb2_grpc.py +4 -0
- union/_protos/workflow/task_service_pb2.py +44 -0
- union/_protos/workflow/task_service_pb2.pyi +31 -0
- union/_protos/workflow/task_service_pb2_grpc.py +104 -0
- union/_resources.py +226 -0
- union/_retry.py +32 -0
- union/_reusable_environment.py +25 -0
- union/_run.py +374 -0
- union/_secret.py +61 -0
- union/_task.py +354 -0
- union/_task_environment.py +186 -0
- union/_timeout.py +47 -0
- union/_tools.py +27 -0
- union/_utils/__init__.py +11 -0
- union/_utils/asyn.py +119 -0
- union/_utils/file_handling.py +71 -0
- union/_utils/helpers.py +46 -0
- union/_utils/lazy_module.py +54 -0
- union/_utils/uv_script_parser.py +49 -0
- union/_version.py +21 -0
- union/connectors/__init__.py +0 -0
- union/errors.py +128 -0
- union/extras/__init__.py +5 -0
- union/extras/_container.py +263 -0
- union/io/__init__.py +11 -0
- union/io/_dataframe.py +0 -0
- union/io/_dir.py +425 -0
- union/io/_file.py +418 -0
- union/io/pickle/__init__.py +0 -0
- union/io/pickle/transformer.py +117 -0
- union/io/structured_dataset/__init__.py +122 -0
- union/io/structured_dataset/basic_dfs.py +219 -0
- union/io/structured_dataset/structured_dataset.py +1057 -0
- union/py.typed +0 -0
- union/remote/__init__.py +23 -0
- union/remote/_client/__init__.py +0 -0
- union/remote/_client/_protocols.py +129 -0
- union/remote/_client/auth/__init__.py +12 -0
- union/remote/_client/auth/_authenticators/__init__.py +0 -0
- union/remote/_client/auth/_authenticators/base.py +391 -0
- union/remote/_client/auth/_authenticators/client_credentials.py +73 -0
- union/remote/_client/auth/_authenticators/device_code.py +120 -0
- union/remote/_client/auth/_authenticators/external_command.py +77 -0
- union/remote/_client/auth/_authenticators/factory.py +200 -0
- union/remote/_client/auth/_authenticators/pkce.py +515 -0
- union/remote/_client/auth/_channel.py +184 -0
- union/remote/_client/auth/_client_config.py +83 -0
- union/remote/_client/auth/_default_html.py +32 -0
- union/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- union/remote/_client/auth/_grpc_utils/auth_interceptor.py +204 -0
- union/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +144 -0
- union/remote/_client/auth/_keyring.py +154 -0
- union/remote/_client/auth/_token_client.py +258 -0
- union/remote/_client/auth/errors.py +16 -0
- union/remote/_client/controlplane.py +86 -0
- union/remote/_data.py +149 -0
- union/remote/_logs.py +74 -0
- union/remote/_project.py +86 -0
- union/remote/_run.py +820 -0
- union/remote/_secret.py +132 -0
- union/remote/_task.py +193 -0
- union/report/__init__.py +3 -0
- union/report/_report.py +178 -0
- union/report/_template.html +124 -0
- union/storage/__init__.py +24 -0
- union/storage/_remote_fs.py +34 -0
- union/storage/_storage.py +247 -0
- union/storage/_utils.py +5 -0
- union/types/__init__.py +11 -0
- union/types/_renderer.py +162 -0
- union/types/_string_literals.py +120 -0
- union/types/_type_engine.py +2131 -0
- union/types/_utils.py +80 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TypeVar
|
|
5
|
+
|
|
6
|
+
from flyteidl.core import literals_pb2, types_pb2
|
|
7
|
+
from fsspec.core import split_protocol, strip_protocol
|
|
8
|
+
|
|
9
|
+
import union.storage as storage
|
|
10
|
+
from union._logging import logger
|
|
11
|
+
from union._utils import lazy_module
|
|
12
|
+
from union.io.structured_dataset.structured_dataset import (
|
|
13
|
+
CSV,
|
|
14
|
+
PARQUET,
|
|
15
|
+
StructuredDataset,
|
|
16
|
+
StructuredDatasetDecoder,
|
|
17
|
+
StructuredDatasetEncoder,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
if typing.TYPE_CHECKING:
|
|
21
|
+
import pandas as pd
|
|
22
|
+
import pyarrow as pa
|
|
23
|
+
else:
|
|
24
|
+
pd = lazy_module("pandas")
|
|
25
|
+
pa = lazy_module("pyarrow")
|
|
26
|
+
|
|
27
|
+
T = TypeVar("T")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# pr: add back after storage
|
|
31
|
+
def get_pandas_storage_options(uri: str, data_config=None, anonymous: bool = False) -> typing.Optional[typing.Dict]:
    """Build the ``storage_options`` mapping pandas should use for ``uri``.

    :param uri: Location that is about to be read or written by pandas.
    :param data_config: Currently unused; kept so callers do not need to change
        once the real storage configuration is wired in.
    :param anonymous: When True, S3 options request unsigned (anonymous) access
        instead of carrying credentials. Previously this flag was ignored, so
        the "retry anonymously" fallbacks retried with identical options.
    :return: ``None`` for non-fsspec (e.g. local) paths, an options dict for
        ``s3`` URIs, and an empty dict for every other fsspec URL.
    """
    from pandas.io.common import is_fsspec_url

    if not is_fsspec_url(uri):
        # Pandas does not allow storage_options for non-fsspec paths e.g. local.
        return None

    if not uri.startswith("s3"):
        return {}

    # pr: after storage, replace with real call to get_fsspec_storage_options
    # NOTE(review): the endpoint and credentials below are hard-coded placeholder
    # values pointing at a local endpoint - confirm they are removed before release.
    options: typing.Dict[str, typing.Any] = {
        "cache_regions": True,
        "client_kwargs": {"endpoint_url": "http://localhost:30002"},
    }
    if anonymous:
        # `anon` is the s3fs switch for unsigned requests; credentials are left
        # out so the anonymous retry really differs from the first attempt.
        options["anon"] = True
    else:
        options["key"] = "minio"
        options["secret"] = "miniostorage"
    return options
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class PandasToCSVEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pandas DataFrame as a ``.csv`` file under the dataset URI."""

    def __init__(self):
        # Register this handler for pandas DataFrames in the CSV format.
        super().__init__(pd.DataFrame, None, CSV)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the wrapped DataFrame to ``<uri>/.csv`` and return its literal.

        :param structured_dataset: Wrapper holding the DataFrame and, optionally,
            the URI to write to. When no URI is set, a random remote path is
            requested from the internal context.
        :param structured_dataset_type: Type proto for the dataset; its
            ``format`` field is overwritten with ``CSV``.
        :return: A ``StructuredDataset`` literal pointing at the directory URI.
        """
        if not structured_dataset.uri:
            from union._context import internal_ctx

            ctx = internal_ctx()
            # str(...) mirrors PandasToParquetEncodingHandler so the value is
            # usable by os.path.join and as the proto's string `uri` field.
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, structured_dataset.uri)

        if not storage.is_remote(uri):
            # Local target: make sure the directory exists before writing into it.
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = os.path.join(uri, ".csv")
        df = typing.cast(pd.DataFrame, structured_dataset.dataframe)
        df.to_csv(
            path,
            index=False,
            storage_options=get_pandas_storage_options(uri=path, data_config=None),
        )
        structured_dataset_type.format = CSV
        # Protobuf message constructors only accept keyword arguments; the
        # previous positional call raised TypeError at runtime.
        return literals_pb2.StructuredDataset(
            uri=uri,
            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
        )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class CSVToPandasDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads the ``.csv`` file under a dataset URI into a pandas DataFrame."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, CSV)

    async def decode(
        self,
        proto_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pd.DataFrame":
        """Read ``<uri>/.csv`` into a DataFrame.

        When the task metadata lists columns, only those columns are loaded.
        If the first read raises ``NoCredentialsError``, the read is attempted
        once more with options requested for anonymous access.
        """
        from botocore.exceptions import NoCredentialsError

        dataset_uri = proto_value.uri
        read_options = get_pandas_storage_options(uri=dataset_uri, data_config=None)
        csv_path = os.path.join(dataset_uri, ".csv")

        # Restrict the read to the declared columns, if any were declared.
        sd_type = current_task_metadata.structured_dataset_type
        wanted = [col.name for col in sd_type.columns] if sd_type and sd_type.columns else None

        try:
            return pd.read_csv(csv_path, usecols=wanted, storage_options=read_options)
        except NoCredentialsError:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            read_options = get_pandas_storage_options(uri=dataset_uri, data_config=None, anonymous=True)
            return pd.read_csv(csv_path, usecols=wanted, storage_options=read_options)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class PandasToParquetEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pandas DataFrame as a Parquet file under the dataset URI."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, PARQUET)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the wrapped DataFrame to ``<uri>/00000`` and return its literal.

        A random remote path is requested from the internal context when the
        dataset carries no URI. The ``format`` of ``structured_dataset_type``
        is overwritten with ``PARQUET`` before it is embedded in the result.
        """
        target = structured_dataset.uri
        if target:
            target = typing.cast(str, target)
        else:
            from union._context import internal_ctx

            target = str(internal_ctx().raw_data.get_random_remote_path())

        if not storage.is_remote(target):
            # Local destination: create the directory tree up front.
            Path(target).mkdir(parents=True, exist_ok=True)

        part_path = os.path.join(target, f"{0:05}")
        frame = typing.cast(pd.DataFrame, structured_dataset.dataframe)
        write_options = get_pandas_storage_options(uri=part_path, data_config=None)
        frame.to_parquet(
            part_path,
            coerce_timestamps="us",
            allow_truncated_timestamps=False,
            storage_options=write_options,
        )

        structured_dataset_type.format = PARQUET
        metadata = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
        return literals_pb2.StructuredDataset(uri=target, metadata=metadata)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class ParquetToPandasDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads Parquet data at a dataset URI into a pandas DataFrame."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, PARQUET)

    async def decode(
        self,
        flyte_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pd.DataFrame":
        """Read the Parquet data at ``flyte_value.uri`` into a DataFrame.

        When the task metadata lists columns, only those columns are loaded.
        If the first read raises ``NoCredentialsError``, the read is attempted
        once more with options requested for anonymous access.
        """
        from botocore.exceptions import NoCredentialsError

        source = flyte_value.uri
        read_options = get_pandas_storage_options(uri=source, data_config=None)

        # Restrict the read to the declared columns, if any were declared.
        sd_type = current_task_metadata.structured_dataset_type
        selected = [col.name for col in sd_type.columns] if sd_type and sd_type.columns else None

        try:
            return pd.read_parquet(source, columns=selected, storage_options=read_options)
        except NoCredentialsError:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            read_options = get_pandas_storage_options(uri=source, data_config=None, anonymous=True)
            return pd.read_parquet(source, columns=selected, storage_options=read_options)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class ArrowToParquetEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pyarrow Table as a Parquet file under the dataset URI."""

    def __init__(self):
        # Register this handler for pyarrow Tables in the Parquet format.
        super().__init__(pa.Table, None, PARQUET)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the wrapped Table to ``<uri>/00000`` and return its literal.

        :param structured_dataset: Wrapper holding the Table and, optionally,
            the URI to write to. When no URI is set, a random remote path is
            requested from the internal context.
        :param structured_dataset_type: Type proto embedded in the returned
            metadata as-is.
        :return: A ``StructuredDataset`` literal pointing at the directory URI.
        """
        import pyarrow.parquet as pq

        if not structured_dataset.uri:
            from union._context import internal_ctx

            ctx = internal_ctx()
            # str(...) mirrors PandasToParquetEncodingHandler so the value is
            # usable by os.path.join and as the proto's string `uri` field.
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, structured_dataset.uri)

        if not storage.is_remote(uri):
            # Local target: make sure the directory exists before writing into it.
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = os.path.join(uri, f"{0:05}")
        filesystem = storage.get_underlying_filesystem(path=path)
        # The filesystem object is passed explicitly, so the protocol prefix is
        # stripped from the path handed to pyarrow.
        pq.write_table(structured_dataset.dataframe, strip_protocol(path), filesystem=filesystem)
        # Protobuf message constructors only accept keyword arguments; the
        # previous positional call raised TypeError at runtime.
        return literals_pb2.StructuredDataset(
            uri=uri,
            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
        )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class ParquetToArrowDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads Parquet data at a dataset URI into a pyarrow Table."""

    def __init__(self):
        super().__init__(pa.Table, None, PARQUET)

    async def decode(
        self,
        proto_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pa.Table":
        """Read the Parquet data at ``proto_value.uri`` into a Table.

        When the task metadata lists columns, only those columns are loaded.
        If the first read raises ``NoCredentialsError``, an anonymous
        filesystem is requested and the read is retried with it; when no such
        filesystem is available the original error is re-raised.
        """
        import pyarrow.parquet as pq
        from botocore.exceptions import NoCredentialsError

        source = proto_value.uri
        if not storage.is_remote(source):
            Path(source).parent.mkdir(parents=True, exist_ok=True)
        bare_path = split_protocol(source)[1]

        # Restrict the read to the declared columns, if any were declared.
        sd_type = current_task_metadata.structured_dataset_type
        selected = [col.name for col in sd_type.columns] if sd_type and sd_type.columns else None

        try:
            return pq.read_table(bare_path, columns=selected)
        except NoCredentialsError as e:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            anon_fs = storage.get_underlying_filesystem(path=source, anonymous=True)
            if anon_fs is None:
                raise e
            return pq.read_table(bare_path, filesystem=anon_fs, columns=selected)
|