flyte 0.0.1b3__py3-none-any.whl → 0.2.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic. Click here for more details.
- flyte/__init__.py +20 -4
- flyte/_bin/runtime.py +33 -7
- flyte/_build.py +3 -2
- flyte/_cache/cache.py +1 -2
- flyte/_code_bundle/_packaging.py +1 -1
- flyte/_code_bundle/_utils.py +0 -16
- flyte/_code_bundle/bundle.py +43 -12
- flyte/_context.py +8 -2
- flyte/_deploy.py +56 -15
- flyte/_environment.py +45 -4
- flyte/_excepthook.py +37 -0
- flyte/_group.py +2 -1
- flyte/_image.py +8 -4
- flyte/_initialize.py +112 -254
- flyte/_interface.py +3 -3
- flyte/_internal/controllers/__init__.py +19 -6
- flyte/_internal/controllers/_local_controller.py +83 -8
- flyte/_internal/controllers/_trace.py +2 -1
- flyte/_internal/controllers/remote/__init__.py +27 -7
- flyte/_internal/controllers/remote/_action.py +7 -2
- flyte/_internal/controllers/remote/_client.py +5 -1
- flyte/_internal/controllers/remote/_controller.py +159 -26
- flyte/_internal/controllers/remote/_core.py +13 -5
- flyte/_internal/controllers/remote/_informer.py +4 -4
- flyte/_internal/controllers/remote/_service_protocol.py +6 -6
- flyte/_internal/imagebuild/docker_builder.py +12 -1
- flyte/_internal/imagebuild/image_builder.py +16 -11
- flyte/_internal/runtime/convert.py +164 -21
- flyte/_internal/runtime/entrypoints.py +1 -1
- flyte/_internal/runtime/io.py +3 -3
- flyte/_internal/runtime/task_serde.py +140 -20
- flyte/_internal/runtime/taskrunner.py +4 -3
- flyte/_internal/runtime/types_serde.py +1 -1
- flyte/_logging.py +12 -1
- flyte/_map.py +215 -0
- flyte/_pod.py +19 -0
- flyte/_protos/common/list_pb2.py +3 -3
- flyte/_protos/common/list_pb2.pyi +2 -0
- flyte/_protos/logs/dataplane/payload_pb2.py +28 -24
- flyte/_protos/logs/dataplane/payload_pb2.pyi +11 -2
- flyte/_protos/workflow/common_pb2.py +27 -0
- flyte/_protos/workflow/common_pb2.pyi +14 -0
- flyte/_protos/workflow/environment_pb2.py +29 -0
- flyte/_protos/workflow/environment_pb2.pyi +12 -0
- flyte/_protos/workflow/queue_service_pb2.py +40 -41
- flyte/_protos/workflow/queue_service_pb2.pyi +35 -30
- flyte/_protos/workflow/queue_service_pb2_grpc.py +15 -15
- flyte/_protos/workflow/run_definition_pb2.py +61 -61
- flyte/_protos/workflow/run_definition_pb2.pyi +8 -4
- flyte/_protos/workflow/run_service_pb2.py +20 -24
- flyte/_protos/workflow/run_service_pb2.pyi +2 -6
- flyte/_protos/workflow/state_service_pb2.py +36 -28
- flyte/_protos/workflow/state_service_pb2.pyi +19 -15
- flyte/_protos/workflow/state_service_pb2_grpc.py +28 -28
- flyte/_protos/workflow/task_definition_pb2.py +29 -22
- flyte/_protos/workflow/task_definition_pb2.pyi +21 -5
- flyte/_protos/workflow/task_service_pb2.py +27 -11
- flyte/_protos/workflow/task_service_pb2.pyi +29 -1
- flyte/_protos/workflow/task_service_pb2_grpc.py +34 -0
- flyte/_run.py +166 -95
- flyte/_task.py +110 -28
- flyte/_task_environment.py +55 -72
- flyte/_trace.py +6 -14
- flyte/_utils/__init__.py +6 -0
- flyte/_utils/async_cache.py +139 -0
- flyte/_utils/coro_management.py +0 -2
- flyte/_utils/helpers.py +45 -19
- flyte/_utils/org_discovery.py +57 -0
- flyte/_version.py +2 -2
- flyte/cli/__init__.py +3 -0
- flyte/cli/_abort.py +28 -0
- flyte/{_cli → cli}/_common.py +73 -23
- flyte/cli/_create.py +145 -0
- flyte/{_cli → cli}/_delete.py +4 -4
- flyte/{_cli → cli}/_deploy.py +26 -14
- flyte/cli/_gen.py +163 -0
- flyte/{_cli → cli}/_get.py +98 -23
- {union/_cli → flyte/cli}/_params.py +106 -147
- flyte/{_cli → cli}/_run.py +99 -20
- flyte/cli/main.py +166 -0
- flyte/config/__init__.py +3 -0
- flyte/config/_config.py +216 -0
- flyte/config/_internal.py +64 -0
- flyte/config/_reader.py +207 -0
- flyte/errors.py +29 -0
- flyte/extras/_container.py +33 -43
- flyte/io/__init__.py +17 -1
- flyte/io/_dir.py +2 -2
- flyte/io/_file.py +3 -4
- flyte/io/{structured_dataset → _structured_dataset}/basic_dfs.py +1 -1
- flyte/io/{structured_dataset → _structured_dataset}/structured_dataset.py +1 -1
- flyte/{_datastructures.py → models.py} +56 -7
- flyte/remote/__init__.py +2 -1
- flyte/remote/_client/_protocols.py +2 -0
- flyte/remote/_client/auth/_auth_utils.py +14 -0
- flyte/remote/_client/auth/_channel.py +34 -3
- flyte/remote/_client/auth/_token_client.py +3 -3
- flyte/remote/_client/controlplane.py +13 -13
- flyte/remote/_console.py +1 -1
- flyte/remote/_data.py +10 -6
- flyte/remote/_logs.py +89 -29
- flyte/remote/_project.py +8 -9
- flyte/remote/_run.py +228 -131
- flyte/remote/_secret.py +12 -12
- flyte/remote/_task.py +179 -15
- flyte/report/_report.py +4 -4
- flyte/storage/__init__.py +5 -0
- flyte/storage/_config.py +233 -0
- flyte/storage/_storage.py +23 -3
- flyte/syncify/__init__.py +56 -0
- flyte/syncify/_api.py +371 -0
- flyte/types/__init__.py +23 -0
- flyte/types/_interface.py +22 -7
- flyte/{io/pickle/transformer.py → types/_pickle.py} +2 -1
- flyte/types/_type_engine.py +95 -18
- flyte-0.2.0a0.dist-info/METADATA +249 -0
- flyte-0.2.0a0.dist-info/RECORD +218 -0
- {flyte-0.0.1b3.dist-info → flyte-0.2.0a0.dist-info}/entry_points.txt +1 -1
- flyte/_api_commons.py +0 -3
- flyte/_cli/__init__.py +0 -0
- flyte/_cli/_create.py +0 -42
- flyte/_cli/main.py +0 -72
- flyte/_internal/controllers/pbhash.py +0 -39
- flyte/io/_dataframe.py +0 -0
- flyte/io/pickle/__init__.py +0 -0
- flyte-0.0.1b3.dist-info/METADATA +0 -179
- flyte-0.0.1b3.dist-info/RECORD +0 -390
- union/__init__.py +0 -54
- union/_api_commons.py +0 -3
- union/_bin/__init__.py +0 -0
- union/_bin/runtime.py +0 -113
- union/_build.py +0 -25
- union/_cache/__init__.py +0 -12
- union/_cache/cache.py +0 -141
- union/_cache/defaults.py +0 -9
- union/_cache/policy_function_body.py +0 -42
- union/_cli/__init__.py +0 -0
- union/_cli/_common.py +0 -263
- union/_cli/_create.py +0 -40
- union/_cli/_delete.py +0 -23
- union/_cli/_deploy.py +0 -120
- union/_cli/_get.py +0 -162
- union/_cli/_run.py +0 -150
- union/_cli/main.py +0 -72
- union/_code_bundle/__init__.py +0 -8
- union/_code_bundle/_ignore.py +0 -113
- union/_code_bundle/_packaging.py +0 -187
- union/_code_bundle/_utils.py +0 -342
- union/_code_bundle/bundle.py +0 -176
- union/_context.py +0 -146
- union/_datastructures.py +0 -295
- union/_deploy.py +0 -185
- union/_doc.py +0 -29
- union/_docstring.py +0 -26
- union/_environment.py +0 -43
- union/_group.py +0 -31
- union/_hash.py +0 -23
- union/_image.py +0 -760
- union/_initialize.py +0 -585
- union/_interface.py +0 -84
- union/_internal/__init__.py +0 -3
- union/_internal/controllers/__init__.py +0 -77
- union/_internal/controllers/_local_controller.py +0 -77
- union/_internal/controllers/pbhash.py +0 -39
- union/_internal/controllers/remote/__init__.py +0 -40
- union/_internal/controllers/remote/_action.py +0 -131
- union/_internal/controllers/remote/_client.py +0 -43
- union/_internal/controllers/remote/_controller.py +0 -169
- union/_internal/controllers/remote/_core.py +0 -341
- union/_internal/controllers/remote/_informer.py +0 -260
- union/_internal/controllers/remote/_service_protocol.py +0 -44
- union/_internal/imagebuild/__init__.py +0 -11
- union/_internal/imagebuild/docker_builder.py +0 -416
- union/_internal/imagebuild/image_builder.py +0 -243
- union/_internal/imagebuild/remote_builder.py +0 -0
- union/_internal/resolvers/__init__.py +0 -0
- union/_internal/resolvers/_task_module.py +0 -31
- union/_internal/resolvers/common.py +0 -24
- union/_internal/resolvers/default.py +0 -27
- union/_internal/runtime/__init__.py +0 -0
- union/_internal/runtime/convert.py +0 -163
- union/_internal/runtime/entrypoints.py +0 -121
- union/_internal/runtime/io.py +0 -136
- union/_internal/runtime/resources_serde.py +0 -134
- union/_internal/runtime/task_serde.py +0 -202
- union/_internal/runtime/taskrunner.py +0 -179
- union/_internal/runtime/types_serde.py +0 -53
- union/_logging.py +0 -124
- union/_protos/__init__.py +0 -0
- union/_protos/common/authorization_pb2.py +0 -66
- union/_protos/common/authorization_pb2.pyi +0 -106
- union/_protos/common/identifier_pb2.py +0 -71
- union/_protos/common/identifier_pb2.pyi +0 -82
- union/_protos/common/identity_pb2.py +0 -48
- union/_protos/common/identity_pb2.pyi +0 -72
- union/_protos/common/identity_pb2_grpc.py +0 -4
- union/_protos/common/list_pb2.py +0 -36
- union/_protos/common/list_pb2.pyi +0 -69
- union/_protos/common/list_pb2_grpc.py +0 -4
- union/_protos/common/policy_pb2.py +0 -37
- union/_protos/common/policy_pb2.pyi +0 -27
- union/_protos/common/policy_pb2_grpc.py +0 -4
- union/_protos/common/role_pb2.py +0 -37
- union/_protos/common/role_pb2.pyi +0 -51
- union/_protos/common/role_pb2_grpc.py +0 -4
- union/_protos/common/runtime_version_pb2.py +0 -28
- union/_protos/common/runtime_version_pb2.pyi +0 -24
- union/_protos/common/runtime_version_pb2_grpc.py +0 -4
- union/_protos/logs/dataplane/payload_pb2.py +0 -96
- union/_protos/logs/dataplane/payload_pb2.pyi +0 -168
- union/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- union/_protos/secret/definition_pb2.py +0 -49
- union/_protos/secret/definition_pb2.pyi +0 -93
- union/_protos/secret/definition_pb2_grpc.py +0 -4
- union/_protos/secret/payload_pb2.py +0 -62
- union/_protos/secret/payload_pb2.pyi +0 -94
- union/_protos/secret/payload_pb2_grpc.py +0 -4
- union/_protos/secret/secret_pb2.py +0 -38
- union/_protos/secret/secret_pb2.pyi +0 -6
- union/_protos/secret/secret_pb2_grpc.py +0 -198
- union/_protos/validate/validate/validate_pb2.py +0 -76
- union/_protos/workflow/node_execution_service_pb2.py +0 -26
- union/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- union/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- union/_protos/workflow/queue_service_pb2.py +0 -75
- union/_protos/workflow/queue_service_pb2.pyi +0 -103
- union/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- union/_protos/workflow/run_definition_pb2.py +0 -100
- union/_protos/workflow/run_definition_pb2.pyi +0 -256
- union/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- union/_protos/workflow/run_logs_service_pb2.py +0 -41
- union/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- union/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- union/_protos/workflow/run_service_pb2.py +0 -133
- union/_protos/workflow/run_service_pb2.pyi +0 -173
- union/_protos/workflow/run_service_pb2_grpc.py +0 -412
- union/_protos/workflow/state_service_pb2.py +0 -58
- union/_protos/workflow/state_service_pb2.pyi +0 -69
- union/_protos/workflow/state_service_pb2_grpc.py +0 -138
- union/_protos/workflow/task_definition_pb2.py +0 -72
- union/_protos/workflow/task_definition_pb2.pyi +0 -65
- union/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- union/_protos/workflow/task_service_pb2.py +0 -44
- union/_protos/workflow/task_service_pb2.pyi +0 -31
- union/_protos/workflow/task_service_pb2_grpc.py +0 -104
- union/_resources.py +0 -226
- union/_retry.py +0 -32
- union/_reusable_environment.py +0 -25
- union/_run.py +0 -374
- union/_secret.py +0 -61
- union/_task.py +0 -354
- union/_task_environment.py +0 -186
- union/_timeout.py +0 -47
- union/_tools.py +0 -27
- union/_utils/__init__.py +0 -11
- union/_utils/asyn.py +0 -119
- union/_utils/file_handling.py +0 -71
- union/_utils/helpers.py +0 -46
- union/_utils/lazy_module.py +0 -54
- union/_utils/uv_script_parser.py +0 -49
- union/_version.py +0 -21
- union/connectors/__init__.py +0 -0
- union/errors.py +0 -128
- union/extras/__init__.py +0 -5
- union/extras/_container.py +0 -263
- union/io/__init__.py +0 -11
- union/io/_dataframe.py +0 -0
- union/io/_dir.py +0 -425
- union/io/_file.py +0 -418
- union/io/pickle/__init__.py +0 -0
- union/io/pickle/transformer.py +0 -117
- union/io/structured_dataset/__init__.py +0 -122
- union/io/structured_dataset/basic_dfs.py +0 -219
- union/io/structured_dataset/structured_dataset.py +0 -1057
- union/py.typed +0 -0
- union/remote/__init__.py +0 -23
- union/remote/_client/__init__.py +0 -0
- union/remote/_client/_protocols.py +0 -129
- union/remote/_client/auth/__init__.py +0 -12
- union/remote/_client/auth/_authenticators/__init__.py +0 -0
- union/remote/_client/auth/_authenticators/base.py +0 -391
- union/remote/_client/auth/_authenticators/client_credentials.py +0 -73
- union/remote/_client/auth/_authenticators/device_code.py +0 -120
- union/remote/_client/auth/_authenticators/external_command.py +0 -77
- union/remote/_client/auth/_authenticators/factory.py +0 -200
- union/remote/_client/auth/_authenticators/pkce.py +0 -515
- union/remote/_client/auth/_channel.py +0 -184
- union/remote/_client/auth/_client_config.py +0 -83
- union/remote/_client/auth/_default_html.py +0 -32
- union/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- union/remote/_client/auth/_grpc_utils/auth_interceptor.py +0 -204
- union/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +0 -144
- union/remote/_client/auth/_keyring.py +0 -154
- union/remote/_client/auth/_token_client.py +0 -258
- union/remote/_client/auth/errors.py +0 -16
- union/remote/_client/controlplane.py +0 -86
- union/remote/_data.py +0 -149
- union/remote/_logs.py +0 -74
- union/remote/_project.py +0 -86
- union/remote/_run.py +0 -820
- union/remote/_secret.py +0 -132
- union/remote/_task.py +0 -193
- union/report/__init__.py +0 -3
- union/report/_report.py +0 -178
- union/report/_template.html +0 -124
- union/storage/__init__.py +0 -24
- union/storage/_remote_fs.py +0 -34
- union/storage/_storage.py +0 -247
- union/storage/_utils.py +0 -5
- union/types/__init__.py +0 -11
- union/types/_renderer.py +0 -162
- union/types/_string_literals.py +0 -120
- union/types/_type_engine.py +0 -2131
- union/types/_utils.py +0 -80
- /union/_protos/common/authorization_pb2_grpc.py → /flyte/_protos/workflow/common_pb2_grpc.py +0 -0
- /union/_protos/common/identifier_pb2_grpc.py → /flyte/_protos/workflow/environment_pb2_grpc.py +0 -0
- /flyte/io/{structured_dataset → _structured_dataset}/__init__.py +0 -0
- {flyte-0.0.1b3.dist-info → flyte-0.2.0a0.dist-info}/WHEEL +0 -0
- {flyte-0.0.1b3.dist-info → flyte-0.2.0a0.dist-info}/top_level.txt +0 -0
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import typing
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import TypeVar
|
|
5
|
-
|
|
6
|
-
from flyteidl.core import literals_pb2, types_pb2
|
|
7
|
-
from fsspec.core import split_protocol, strip_protocol
|
|
8
|
-
|
|
9
|
-
import union.storage as storage
|
|
10
|
-
from union._logging import logger
|
|
11
|
-
from union._utils import lazy_module
|
|
12
|
-
from union.io.structured_dataset.structured_dataset import (
|
|
13
|
-
CSV,
|
|
14
|
-
PARQUET,
|
|
15
|
-
StructuredDataset,
|
|
16
|
-
StructuredDatasetDecoder,
|
|
17
|
-
StructuredDatasetEncoder,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
if typing.TYPE_CHECKING:
|
|
21
|
-
import pandas as pd
|
|
22
|
-
import pyarrow as pa
|
|
23
|
-
else:
|
|
24
|
-
pd = lazy_module("pandas")
|
|
25
|
-
pa = lazy_module("pyarrow")
|
|
26
|
-
|
|
27
|
-
T = TypeVar("T")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
# pr: add back after storage
|
|
31
|
-
def get_pandas_storage_options(uri: str, data_config=None, anonymous: bool = False) -> typing.Optional[typing.Dict]:
    """Return the ``storage_options`` mapping pandas should use for ``uri``.

    :param uri: Location that pandas is about to read from or write to.
    :param data_config: Currently unused; kept so callers can already pass it.
    :param anonymous: When true, request unauthenticated S3 access instead of
        sending the placeholder credentials. Previously this flag was ignored,
        which made the callers' anonymous retry identical to the first attempt.
    :return: ``None`` for non-fsspec (e.g. local) paths, an empty dict for
        fsspec URLs that are not S3, and the S3 options otherwise.
    """
    from pandas.io.common import is_fsspec_url

    if not is_fsspec_url(uri):
        # Pandas does not allow storage_options for non-fsspec paths e.g. local.
        return None

    if not uri.startswith("s3"):
        return {}

    # pr: after storage, replace with real call to get_fsspec_storage_options
    options: typing.Dict[str, typing.Any] = {
        "cache_regions": True,
        "client_kwargs": {"endpoint_url": "http://localhost:30002"},
    }
    if anonymous:
        # s3fs switches to unsigned requests when ``anon`` is set, so no
        # key/secret is supplied on this path.
        options["anon"] = True
    else:
        # NOTE(review): hard-coded placeholder credentials and endpoint; these
        # should come from the storage configuration.
        options["key"] = "minio"
        options["secret"] = "miniostorage"
    return options
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class PandasToCSVEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pandas DataFrame as a single ``.csv`` file under a uri."""

    def __init__(self):
        # Registered for pandas DataFrames in CSV format; the middle argument is
        # passed as None (presumably "any protocol" - confirm against the base class).
        super().__init__(pd.DataFrame, None, CSV)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the dataset's dataframe to ``<uri>/.csv`` and return its literal.

        :param structured_dataset: Dataset holding the dataframe and an optional
            destination uri. When no uri is set, a random remote path from the
            current context is used.
        :param structured_dataset_type: Type message; its ``format`` field is
            overwritten with ``CSV`` before being attached to the result.
        :return: A ``StructuredDataset`` literal pointing at the directory uri.
        """
        if not structured_dataset.uri:
            from union._context import internal_ctx

            ctx = internal_ctx()
            # Coerce to str, as the parquet encoder does, so the proto ``uri``
            # field always receives a string.
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, structured_dataset.uri)

        if not storage.is_remote(uri):
            # Local targets need the directory to exist before pandas writes into it.
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = os.path.join(uri, ".csv")
        df = typing.cast(pd.DataFrame, structured_dataset.dataframe)
        df.to_csv(
            path,
            index=False,
            storage_options=get_pandas_storage_options(uri=path, data_config=None),
        )
        structured_dataset_type.format = CSV
        # Protobuf message constructors accept keyword arguments only; the
        # previous positional call raised TypeError.
        return literals_pb2.StructuredDataset(
            uri=uri,
            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
        )
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
class CSVToPandasDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads the ``.csv`` file stored under a dataset uri into pandas."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, CSV)

    async def decode(
        self,
        proto_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pd.DataFrame":
        """Read ``<uri>/.csv`` into a DataFrame.

        If the task metadata lists columns, only those columns are loaded. A
        ``NoCredentialsError`` on the first read triggers one retry with storage
        options requested for anonymous access.
        """
        from botocore.exceptions import NoCredentialsError

        source_uri = proto_value.uri
        storage_opts = get_pandas_storage_options(uri=source_uri, data_config=None)
        csv_path = os.path.join(source_uri, ".csv")

        dataset_type = current_task_metadata.structured_dataset_type
        wanted = [col.name for col in dataset_type.columns] if dataset_type and dataset_type.columns else None

        try:
            return pd.read_csv(csv_path, usecols=wanted, storage_options=storage_opts)
        except NoCredentialsError:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            storage_opts = get_pandas_storage_options(uri=source_uri, data_config=None, anonymous=True)
            return pd.read_csv(csv_path, usecols=wanted, storage_options=storage_opts)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
class PandasToParquetEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pandas DataFrame as one parquet file under a uri."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, PARQUET)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the dataframe to ``<uri>/00000`` and return the dataset literal.

        A random remote path from the current context is used when the dataset
        carries no uri. The passed type message has its ``format`` set to
        ``PARQUET`` before it is attached to the returned literal.
        """
        if structured_dataset.uri:
            uri = typing.cast(str, structured_dataset.uri)
        else:
            from union._context import internal_ctx

            uri = str(internal_ctx().raw_data.get_random_remote_path())

        is_local = not storage.is_remote(uri)
        if is_local:
            # Make sure the local target directory exists before writing.
            Path(uri).mkdir(parents=True, exist_ok=True)

        target = os.path.join(uri, f"{0:05}")
        frame = typing.cast(pd.DataFrame, structured_dataset.dataframe)
        write_opts = get_pandas_storage_options(uri=target, data_config=None)
        frame.to_parquet(
            target,
            coerce_timestamps="us",
            allow_truncated_timestamps=False,
            storage_options=write_opts,
        )

        structured_dataset_type.format = PARQUET
        metadata = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
        return literals_pb2.StructuredDataset(uri=uri, metadata=metadata)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
class ParquetToPandasDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads parquet data at a dataset uri into a pandas DataFrame."""

    def __init__(self):
        super().__init__(pd.DataFrame, None, PARQUET)

    async def decode(
        self,
        flyte_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pd.DataFrame":
        """Read the parquet data at ``flyte_value.uri``.

        Only the columns named in the task metadata are loaded when any are
        listed. A ``NoCredentialsError`` on the first read triggers one retry
        with storage options requested for anonymous access.
        """
        from botocore.exceptions import NoCredentialsError

        source_uri = flyte_value.uri
        storage_opts = get_pandas_storage_options(uri=source_uri, data_config=None)

        dataset_type = current_task_metadata.structured_dataset_type
        wanted = [col.name for col in dataset_type.columns] if dataset_type and dataset_type.columns else None

        try:
            return pd.read_parquet(source_uri, columns=wanted, storage_options=storage_opts)
        except NoCredentialsError:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            storage_opts = get_pandas_storage_options(uri=source_uri, data_config=None, anonymous=True)
            return pd.read_parquet(source_uri, columns=wanted, storage_options=storage_opts)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
class ArrowToParquetEncodingHandler(StructuredDatasetEncoder):
    """Encoder that writes a pyarrow Table as one parquet file under a uri."""

    def __init__(self):
        super().__init__(pa.Table, None, PARQUET)

    async def encode(
        self,
        structured_dataset: StructuredDataset,
        structured_dataset_type: types_pb2.StructuredDatasetType,
    ) -> literals_pb2.StructuredDataset:
        """Write the table to ``<uri>/00000`` and return the dataset literal.

        :param structured_dataset: Dataset holding the table and an optional
            destination uri. When no uri is set, a random remote path from the
            current context is used.
        :param structured_dataset_type: Type message attached unchanged to the
            returned literal's metadata.
        :return: A ``StructuredDataset`` literal pointing at the directory uri.
        """
        import pyarrow.parquet as pq

        if not structured_dataset.uri:
            from union._context import internal_ctx

            ctx = internal_ctx()
            # Coerce to str, as the pandas parquet encoder does, so the proto
            # ``uri`` field always receives a string.
            uri = str(ctx.raw_data.get_random_remote_path())
        else:
            uri = typing.cast(str, structured_dataset.uri)

        if not storage.is_remote(uri):
            # Local targets need the directory to exist before the file is written.
            Path(uri).mkdir(parents=True, exist_ok=True)
        path = os.path.join(uri, f"{0:05}")
        filesystem = storage.get_underlying_filesystem(path=path)
        # The filesystem object carries the protocol, so the path is passed without it.
        pq.write_table(structured_dataset.dataframe, strip_protocol(path), filesystem=filesystem)
        # Protobuf message constructors accept keyword arguments only; the
        # previous positional call raised TypeError.
        return literals_pb2.StructuredDataset(
            uri=uri,
            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
        )
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
class ParquetToArrowDecodingHandler(StructuredDatasetDecoder):
    """Decoder that loads parquet data at a dataset uri into a pyarrow Table."""

    def __init__(self):
        super().__init__(pa.Table, None, PARQUET)

    async def decode(
        self,
        proto_value: literals_pb2.StructuredDataset,
        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
    ) -> "pa.Table":
        """Read the parquet data referenced by ``proto_value.uri``.

        Only the columns named in the task metadata are loaded when any are
        listed. On ``NoCredentialsError`` the read is retried once through an
        anonymous filesystem; if none is available the original error is raised.
        """
        import pyarrow.parquet as pq
        from botocore.exceptions import NoCredentialsError

        source_uri = proto_value.uri
        if not storage.is_remote(source_uri):
            Path(source_uri).parent.mkdir(parents=True, exist_ok=True)
        _, bare_path = split_protocol(source_uri)

        dataset_type = current_task_metadata.structured_dataset_type
        wanted = [col.name for col in dataset_type.columns] if dataset_type and dataset_type.columns else None

        try:
            # NOTE(review): no filesystem is passed here although the protocol was
            # stripped from the path - confirm this is intended for remote uris.
            return pq.read_table(bare_path, columns=wanted)
        except NoCredentialsError as e:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            anon_fs = storage.get_underlying_filesystem(path=source_uri, anonymous=True)
            if anon_fs is None:
                raise e
            return pq.read_table(bare_path, filesystem=anon_fs, columns=wanted)
|