flyte 0.0.1b0__py3-none-any.whl → 2.0.0b46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyte/__init__.py +83 -30
- flyte/_bin/connect.py +61 -0
- flyte/_bin/debug.py +38 -0
- flyte/_bin/runtime.py +87 -19
- flyte/_bin/serve.py +351 -0
- flyte/_build.py +3 -2
- flyte/_cache/cache.py +6 -5
- flyte/_cache/local_cache.py +216 -0
- flyte/_code_bundle/_ignore.py +31 -5
- flyte/_code_bundle/_packaging.py +42 -11
- flyte/_code_bundle/_utils.py +57 -34
- flyte/_code_bundle/bundle.py +130 -27
- flyte/_constants.py +1 -0
- flyte/_context.py +21 -5
- flyte/_custom_context.py +73 -0
- flyte/_debug/constants.py +37 -0
- flyte/_debug/utils.py +17 -0
- flyte/_debug/vscode.py +315 -0
- flyte/_deploy.py +396 -75
- flyte/_deployer.py +109 -0
- flyte/_environment.py +94 -11
- flyte/_excepthook.py +37 -0
- flyte/_group.py +2 -1
- flyte/_hash.py +1 -16
- flyte/_image.py +544 -234
- flyte/_initialize.py +443 -294
- flyte/_interface.py +40 -5
- flyte/_internal/controllers/__init__.py +22 -8
- flyte/_internal/controllers/_local_controller.py +159 -35
- flyte/_internal/controllers/_trace.py +18 -10
- flyte/_internal/controllers/remote/__init__.py +38 -9
- flyte/_internal/controllers/remote/_action.py +82 -12
- flyte/_internal/controllers/remote/_client.py +6 -2
- flyte/_internal/controllers/remote/_controller.py +290 -64
- flyte/_internal/controllers/remote/_core.py +155 -95
- flyte/_internal/controllers/remote/_informer.py +40 -20
- flyte/_internal/controllers/remote/_service_protocol.py +2 -2
- flyte/_internal/imagebuild/__init__.py +2 -10
- flyte/_internal/imagebuild/docker_builder.py +391 -84
- flyte/_internal/imagebuild/image_builder.py +111 -55
- flyte/_internal/imagebuild/remote_builder.py +409 -0
- flyte/_internal/imagebuild/utils.py +79 -0
- flyte/_internal/resolvers/_app_env_module.py +92 -0
- flyte/_internal/resolvers/_task_module.py +5 -38
- flyte/_internal/resolvers/app_env.py +26 -0
- flyte/_internal/resolvers/common.py +8 -1
- flyte/_internal/resolvers/default.py +2 -2
- flyte/_internal/runtime/convert.py +322 -33
- flyte/_internal/runtime/entrypoints.py +106 -18
- flyte/_internal/runtime/io.py +71 -23
- flyte/_internal/runtime/resources_serde.py +21 -7
- flyte/_internal/runtime/reuse.py +125 -0
- flyte/_internal/runtime/rusty.py +196 -0
- flyte/_internal/runtime/task_serde.py +239 -66
- flyte/_internal/runtime/taskrunner.py +48 -8
- flyte/_internal/runtime/trigger_serde.py +162 -0
- flyte/_internal/runtime/types_serde.py +7 -16
- flyte/_keyring/file.py +115 -0
- flyte/_link.py +30 -0
- flyte/_logging.py +241 -42
- flyte/_map.py +312 -0
- flyte/_metrics.py +59 -0
- flyte/_module.py +74 -0
- flyte/_pod.py +30 -0
- flyte/_resources.py +296 -33
- flyte/_retry.py +1 -7
- flyte/_reusable_environment.py +72 -7
- flyte/_run.py +461 -132
- flyte/_secret.py +47 -11
- flyte/_serve.py +333 -0
- flyte/_task.py +245 -56
- flyte/_task_environment.py +219 -97
- flyte/_task_plugins.py +47 -0
- flyte/_tools.py +8 -8
- flyte/_trace.py +15 -24
- flyte/_trigger.py +1027 -0
- flyte/_utils/__init__.py +12 -1
- flyte/_utils/asyn.py +3 -1
- flyte/_utils/async_cache.py +139 -0
- flyte/_utils/coro_management.py +5 -4
- flyte/_utils/description_parser.py +19 -0
- flyte/_utils/docker_credentials.py +173 -0
- flyte/_utils/helpers.py +45 -19
- flyte/_utils/module_loader.py +123 -0
- flyte/_utils/org_discovery.py +57 -0
- flyte/_utils/uv_script_parser.py +8 -1
- flyte/_version.py +16 -3
- flyte/app/__init__.py +27 -0
- flyte/app/_app_environment.py +362 -0
- flyte/app/_connector_environment.py +40 -0
- flyte/app/_deploy.py +130 -0
- flyte/app/_parameter.py +343 -0
- flyte/app/_runtime/__init__.py +3 -0
- flyte/app/_runtime/app_serde.py +383 -0
- flyte/app/_types.py +113 -0
- flyte/app/extras/__init__.py +9 -0
- flyte/app/extras/_auth_middleware.py +217 -0
- flyte/app/extras/_fastapi.py +93 -0
- flyte/app/extras/_model_loader/__init__.py +3 -0
- flyte/app/extras/_model_loader/config.py +7 -0
- flyte/app/extras/_model_loader/loader.py +288 -0
- flyte/cli/__init__.py +12 -0
- flyte/cli/_abort.py +28 -0
- flyte/cli/_build.py +114 -0
- flyte/cli/_common.py +493 -0
- flyte/cli/_create.py +371 -0
- flyte/cli/_delete.py +45 -0
- flyte/cli/_deploy.py +401 -0
- flyte/cli/_gen.py +316 -0
- flyte/cli/_get.py +446 -0
- flyte/cli/_option.py +33 -0
- {union/_cli → flyte/cli}/_params.py +152 -153
- flyte/cli/_plugins.py +209 -0
- flyte/cli/_prefetch.py +292 -0
- flyte/cli/_run.py +690 -0
- flyte/cli/_serve.py +338 -0
- flyte/cli/_update.py +86 -0
- flyte/cli/_user.py +20 -0
- flyte/cli/main.py +246 -0
- flyte/config/__init__.py +3 -0
- flyte/config/_config.py +248 -0
- flyte/config/_internal.py +73 -0
- flyte/config/_reader.py +225 -0
- flyte/connectors/__init__.py +11 -0
- flyte/connectors/_connector.py +330 -0
- flyte/connectors/_server.py +194 -0
- flyte/connectors/utils.py +159 -0
- flyte/errors.py +134 -2
- flyte/extend.py +24 -0
- flyte/extras/_container.py +69 -56
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +279 -0
- flyte/io/__init__.py +8 -1
- flyte/io/{structured_dataset → _dataframe}/__init__.py +32 -30
- flyte/io/{structured_dataset → _dataframe}/basic_dfs.py +75 -68
- flyte/io/{structured_dataset/structured_dataset.py → _dataframe/dataframe.py} +207 -242
- flyte/io/_dir.py +575 -113
- flyte/io/_file.py +587 -141
- flyte/io/_hashing_io.py +342 -0
- flyte/io/extend.py +7 -0
- flyte/models.py +635 -0
- flyte/prefetch/__init__.py +22 -0
- flyte/prefetch/_hf_model.py +563 -0
- flyte/remote/__init__.py +14 -3
- flyte/remote/_action.py +879 -0
- flyte/remote/_app.py +346 -0
- flyte/remote/_auth_metadata.py +42 -0
- flyte/remote/_client/_protocols.py +62 -4
- flyte/remote/_client/auth/_auth_utils.py +19 -0
- flyte/remote/_client/auth/_authenticators/base.py +8 -2
- flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
- flyte/remote/_client/auth/_authenticators/factory.py +4 -0
- flyte/remote/_client/auth/_authenticators/passthrough.py +79 -0
- flyte/remote/_client/auth/_authenticators/pkce.py +17 -18
- flyte/remote/_client/auth/_channel.py +47 -18
- flyte/remote/_client/auth/_client_config.py +5 -3
- flyte/remote/_client/auth/_keyring.py +15 -2
- flyte/remote/_client/auth/_token_client.py +3 -3
- flyte/remote/_client/controlplane.py +206 -18
- flyte/remote/_common.py +66 -0
- flyte/remote/_data.py +107 -22
- flyte/remote/_logs.py +116 -33
- flyte/remote/_project.py +21 -19
- flyte/remote/_run.py +164 -631
- flyte/remote/_secret.py +72 -29
- flyte/remote/_task.py +387 -46
- flyte/remote/_trigger.py +368 -0
- flyte/remote/_user.py +43 -0
- flyte/report/_report.py +10 -6
- flyte/storage/__init__.py +13 -1
- flyte/storage/_config.py +237 -0
- flyte/storage/_parallel_reader.py +289 -0
- flyte/storage/_storage.py +268 -59
- flyte/syncify/__init__.py +56 -0
- flyte/syncify/_api.py +414 -0
- flyte/types/__init__.py +39 -0
- flyte/types/_interface.py +22 -7
- flyte/{io/pickle/transformer.py → types/_pickle.py} +37 -9
- flyte/types/_string_literals.py +8 -9
- flyte/types/_type_engine.py +230 -129
- flyte/types/_utils.py +1 -1
- flyte-2.0.0b46.data/scripts/debug.py +38 -0
- flyte-2.0.0b46.data/scripts/runtime.py +194 -0
- flyte-2.0.0b46.dist-info/METADATA +352 -0
- flyte-2.0.0b46.dist-info/RECORD +221 -0
- flyte-2.0.0b46.dist-info/entry_points.txt +8 -0
- flyte-2.0.0b46.dist-info/licenses/LICENSE +201 -0
- flyte/_api_commons.py +0 -3
- flyte/_cli/_common.py +0 -287
- flyte/_cli/_create.py +0 -42
- flyte/_cli/_delete.py +0 -23
- flyte/_cli/_deploy.py +0 -140
- flyte/_cli/_get.py +0 -235
- flyte/_cli/_run.py +0 -152
- flyte/_cli/main.py +0 -72
- flyte/_datastructures.py +0 -342
- flyte/_internal/controllers/pbhash.py +0 -39
- flyte/_protos/common/authorization_pb2.py +0 -66
- flyte/_protos/common/authorization_pb2.pyi +0 -108
- flyte/_protos/common/authorization_pb2_grpc.py +0 -4
- flyte/_protos/common/identifier_pb2.py +0 -71
- flyte/_protos/common/identifier_pb2.pyi +0 -82
- flyte/_protos/common/identifier_pb2_grpc.py +0 -4
- flyte/_protos/common/identity_pb2.py +0 -48
- flyte/_protos/common/identity_pb2.pyi +0 -72
- flyte/_protos/common/identity_pb2_grpc.py +0 -4
- flyte/_protos/common/list_pb2.py +0 -36
- flyte/_protos/common/list_pb2.pyi +0 -69
- flyte/_protos/common/list_pb2_grpc.py +0 -4
- flyte/_protos/common/policy_pb2.py +0 -37
- flyte/_protos/common/policy_pb2.pyi +0 -27
- flyte/_protos/common/policy_pb2_grpc.py +0 -4
- flyte/_protos/common/role_pb2.py +0 -37
- flyte/_protos/common/role_pb2.pyi +0 -53
- flyte/_protos/common/role_pb2_grpc.py +0 -4
- flyte/_protos/common/runtime_version_pb2.py +0 -28
- flyte/_protos/common/runtime_version_pb2.pyi +0 -24
- flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
- flyte/_protos/logs/dataplane/payload_pb2.py +0 -96
- flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -168
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/definition_pb2.py +0 -49
- flyte/_protos/secret/definition_pb2.pyi +0 -93
- flyte/_protos/secret/definition_pb2_grpc.py +0 -4
- flyte/_protos/secret/payload_pb2.py +0 -62
- flyte/_protos/secret/payload_pb2.pyi +0 -94
- flyte/_protos/secret/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/secret_pb2.py +0 -38
- flyte/_protos/secret/secret_pb2.pyi +0 -6
- flyte/_protos/secret/secret_pb2_grpc.py +0 -198
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- flyte/_protos/validate/validate/validate_pb2.py +0 -76
- flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
- flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- flyte/_protos/workflow/queue_service_pb2.py +0 -106
- flyte/_protos/workflow/queue_service_pb2.pyi +0 -141
- flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- flyte/_protos/workflow/run_definition_pb2.py +0 -128
- flyte/_protos/workflow/run_definition_pb2.pyi +0 -310
- flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
- flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- flyte/_protos/workflow/run_service_pb2.py +0 -133
- flyte/_protos/workflow/run_service_pb2.pyi +0 -175
- flyte/_protos/workflow/run_service_pb2_grpc.py +0 -412
- flyte/_protos/workflow/state_service_pb2.py +0 -58
- flyte/_protos/workflow/state_service_pb2.pyi +0 -71
- flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
- flyte/_protos/workflow/task_definition_pb2.py +0 -72
- flyte/_protos/workflow/task_definition_pb2.pyi +0 -65
- flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/task_service_pb2.py +0 -44
- flyte/_protos/workflow/task_service_pb2.pyi +0 -31
- flyte/_protos/workflow/task_service_pb2_grpc.py +0 -104
- flyte/io/_dataframe.py +0 -0
- flyte/io/pickle/__init__.py +0 -0
- flyte/remote/_console.py +0 -18
- flyte-0.0.1b0.dist-info/METADATA +0 -179
- flyte-0.0.1b0.dist-info/RECORD +0 -390
- flyte-0.0.1b0.dist-info/entry_points.txt +0 -3
- union/__init__.py +0 -54
- union/_api_commons.py +0 -3
- union/_bin/__init__.py +0 -0
- union/_bin/runtime.py +0 -113
- union/_build.py +0 -25
- union/_cache/__init__.py +0 -12
- union/_cache/cache.py +0 -141
- union/_cache/defaults.py +0 -9
- union/_cache/policy_function_body.py +0 -42
- union/_cli/__init__.py +0 -0
- union/_cli/_common.py +0 -263
- union/_cli/_create.py +0 -40
- union/_cli/_delete.py +0 -23
- union/_cli/_deploy.py +0 -120
- union/_cli/_get.py +0 -162
- union/_cli/_run.py +0 -150
- union/_cli/main.py +0 -72
- union/_code_bundle/__init__.py +0 -8
- union/_code_bundle/_ignore.py +0 -113
- union/_code_bundle/_packaging.py +0 -187
- union/_code_bundle/_utils.py +0 -342
- union/_code_bundle/bundle.py +0 -176
- union/_context.py +0 -146
- union/_datastructures.py +0 -295
- union/_deploy.py +0 -185
- union/_doc.py +0 -29
- union/_docstring.py +0 -26
- union/_environment.py +0 -43
- union/_group.py +0 -31
- union/_hash.py +0 -23
- union/_image.py +0 -760
- union/_initialize.py +0 -585
- union/_interface.py +0 -84
- union/_internal/__init__.py +0 -3
- union/_internal/controllers/__init__.py +0 -77
- union/_internal/controllers/_local_controller.py +0 -77
- union/_internal/controllers/pbhash.py +0 -39
- union/_internal/controllers/remote/__init__.py +0 -40
- union/_internal/controllers/remote/_action.py +0 -131
- union/_internal/controllers/remote/_client.py +0 -43
- union/_internal/controllers/remote/_controller.py +0 -169
- union/_internal/controllers/remote/_core.py +0 -341
- union/_internal/controllers/remote/_informer.py +0 -260
- union/_internal/controllers/remote/_service_protocol.py +0 -44
- union/_internal/imagebuild/__init__.py +0 -11
- union/_internal/imagebuild/docker_builder.py +0 -416
- union/_internal/imagebuild/image_builder.py +0 -243
- union/_internal/imagebuild/remote_builder.py +0 -0
- union/_internal/resolvers/__init__.py +0 -0
- union/_internal/resolvers/_task_module.py +0 -31
- union/_internal/resolvers/common.py +0 -24
- union/_internal/resolvers/default.py +0 -27
- union/_internal/runtime/__init__.py +0 -0
- union/_internal/runtime/convert.py +0 -163
- union/_internal/runtime/entrypoints.py +0 -121
- union/_internal/runtime/io.py +0 -136
- union/_internal/runtime/resources_serde.py +0 -134
- union/_internal/runtime/task_serde.py +0 -202
- union/_internal/runtime/taskrunner.py +0 -179
- union/_internal/runtime/types_serde.py +0 -53
- union/_logging.py +0 -124
- union/_protos/__init__.py +0 -0
- union/_protos/common/authorization_pb2.py +0 -66
- union/_protos/common/authorization_pb2.pyi +0 -106
- union/_protos/common/authorization_pb2_grpc.py +0 -4
- union/_protos/common/identifier_pb2.py +0 -71
- union/_protos/common/identifier_pb2.pyi +0 -82
- union/_protos/common/identifier_pb2_grpc.py +0 -4
- union/_protos/common/identity_pb2.py +0 -48
- union/_protos/common/identity_pb2.pyi +0 -72
- union/_protos/common/identity_pb2_grpc.py +0 -4
- union/_protos/common/list_pb2.py +0 -36
- union/_protos/common/list_pb2.pyi +0 -69
- union/_protos/common/list_pb2_grpc.py +0 -4
- union/_protos/common/policy_pb2.py +0 -37
- union/_protos/common/policy_pb2.pyi +0 -27
- union/_protos/common/policy_pb2_grpc.py +0 -4
- union/_protos/common/role_pb2.py +0 -37
- union/_protos/common/role_pb2.pyi +0 -51
- union/_protos/common/role_pb2_grpc.py +0 -4
- union/_protos/common/runtime_version_pb2.py +0 -28
- union/_protos/common/runtime_version_pb2.pyi +0 -24
- union/_protos/common/runtime_version_pb2_grpc.py +0 -4
- union/_protos/logs/dataplane/payload_pb2.py +0 -96
- union/_protos/logs/dataplane/payload_pb2.pyi +0 -168
- union/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- union/_protos/secret/definition_pb2.py +0 -49
- union/_protos/secret/definition_pb2.pyi +0 -93
- union/_protos/secret/definition_pb2_grpc.py +0 -4
- union/_protos/secret/payload_pb2.py +0 -62
- union/_protos/secret/payload_pb2.pyi +0 -94
- union/_protos/secret/payload_pb2_grpc.py +0 -4
- union/_protos/secret/secret_pb2.py +0 -38
- union/_protos/secret/secret_pb2.pyi +0 -6
- union/_protos/secret/secret_pb2_grpc.py +0 -198
- union/_protos/validate/validate/validate_pb2.py +0 -76
- union/_protos/workflow/node_execution_service_pb2.py +0 -26
- union/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- union/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- union/_protos/workflow/queue_service_pb2.py +0 -75
- union/_protos/workflow/queue_service_pb2.pyi +0 -103
- union/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- union/_protos/workflow/run_definition_pb2.py +0 -100
- union/_protos/workflow/run_definition_pb2.pyi +0 -256
- union/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- union/_protos/workflow/run_logs_service_pb2.py +0 -41
- union/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- union/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- union/_protos/workflow/run_service_pb2.py +0 -133
- union/_protos/workflow/run_service_pb2.pyi +0 -173
- union/_protos/workflow/run_service_pb2_grpc.py +0 -412
- union/_protos/workflow/state_service_pb2.py +0 -58
- union/_protos/workflow/state_service_pb2.pyi +0 -69
- union/_protos/workflow/state_service_pb2_grpc.py +0 -138
- union/_protos/workflow/task_definition_pb2.py +0 -72
- union/_protos/workflow/task_definition_pb2.pyi +0 -65
- union/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- union/_protos/workflow/task_service_pb2.py +0 -44
- union/_protos/workflow/task_service_pb2.pyi +0 -31
- union/_protos/workflow/task_service_pb2_grpc.py +0 -104
- union/_resources.py +0 -226
- union/_retry.py +0 -32
- union/_reusable_environment.py +0 -25
- union/_run.py +0 -374
- union/_secret.py +0 -61
- union/_task.py +0 -354
- union/_task_environment.py +0 -186
- union/_timeout.py +0 -47
- union/_tools.py +0 -27
- union/_utils/__init__.py +0 -11
- union/_utils/asyn.py +0 -119
- union/_utils/file_handling.py +0 -71
- union/_utils/helpers.py +0 -46
- union/_utils/lazy_module.py +0 -54
- union/_utils/uv_script_parser.py +0 -49
- union/_version.py +0 -21
- union/connectors/__init__.py +0 -0
- union/errors.py +0 -128
- union/extras/__init__.py +0 -5
- union/extras/_container.py +0 -263
- union/io/__init__.py +0 -11
- union/io/_dataframe.py +0 -0
- union/io/_dir.py +0 -425
- union/io/_file.py +0 -418
- union/io/pickle/__init__.py +0 -0
- union/io/pickle/transformer.py +0 -117
- union/io/structured_dataset/__init__.py +0 -122
- union/io/structured_dataset/basic_dfs.py +0 -219
- union/io/structured_dataset/structured_dataset.py +0 -1057
- union/py.typed +0 -0
- union/remote/__init__.py +0 -23
- union/remote/_client/__init__.py +0 -0
- union/remote/_client/_protocols.py +0 -129
- union/remote/_client/auth/__init__.py +0 -12
- union/remote/_client/auth/_authenticators/__init__.py +0 -0
- union/remote/_client/auth/_authenticators/base.py +0 -391
- union/remote/_client/auth/_authenticators/client_credentials.py +0 -73
- union/remote/_client/auth/_authenticators/device_code.py +0 -120
- union/remote/_client/auth/_authenticators/external_command.py +0 -77
- union/remote/_client/auth/_authenticators/factory.py +0 -200
- union/remote/_client/auth/_authenticators/pkce.py +0 -515
- union/remote/_client/auth/_channel.py +0 -184
- union/remote/_client/auth/_client_config.py +0 -83
- union/remote/_client/auth/_default_html.py +0 -32
- union/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- union/remote/_client/auth/_grpc_utils/auth_interceptor.py +0 -204
- union/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +0 -144
- union/remote/_client/auth/_keyring.py +0 -154
- union/remote/_client/auth/_token_client.py +0 -258
- union/remote/_client/auth/errors.py +0 -16
- union/remote/_client/controlplane.py +0 -86
- union/remote/_data.py +0 -149
- union/remote/_logs.py +0 -74
- union/remote/_project.py +0 -86
- union/remote/_run.py +0 -820
- union/remote/_secret.py +0 -132
- union/remote/_task.py +0 -193
- union/report/__init__.py +0 -3
- union/report/_report.py +0 -178
- union/report/_template.html +0 -124
- union/storage/__init__.py +0 -24
- union/storage/_remote_fs.py +0 -34
- union/storage/_storage.py +0 -247
- union/storage/_utils.py +0 -5
- union/types/__init__.py +0 -11
- union/types/_renderer.py +0 -162
- union/types/_string_literals.py +0 -120
- union/types/_type_engine.py +0 -2131
- union/types/_utils.py +0 -80
- /flyte/{_cli → _debug}/__init__.py +0 -0
- /flyte/{_protos → _keyring}/__init__.py +0 -0
- {flyte-0.0.1b0.dist-info → flyte-2.0.0b46.dist-info}/WHEEL +0 -0
- {flyte-0.0.1b0.dist-info → flyte-2.0.0b46.dist-info}/top_level.txt +0 -0
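The expanded diff below covers the largest rewritten file, flyte/io/{structured_dataset/structured_dataset.py → _dataframe/dataframe.py}. The user-facing `StructuredDataset` class becomes `DataFrame`, rebuilt as a Pydantic `BaseModel` instead of a mashumaro dataclass: `file_format` is renamed to `format`, the wrapped dataframe moves to a private `_raw_df` attribute exposed via `.val`, and `from_df()` / `from_existing_remote()` classmethods replace the old `__init__`. As a minimal sketch of the resulting API — the `flyte.io` import path is an assumption (the class is defined in `flyte/io/_dataframe/dataframe.py`), and the task snippet is copied from the `open()` docstring in the diff:

```python
import pandas as pd

from flyte.io import DataFrame  # assumed re-export; the class lives in flyte/io/_dataframe/dataframe.py

# Wrap an in-memory dataframe; from_df() stores it on the private _raw_df attribute (read back via .val).
df = DataFrame.from_df(val=pd.DataFrame({"a": [1, 2, 3]}))

# Reference data that already lives in object storage (example taken from the from_existing_remote docstring).
ref = DataFrame.from_existing_remote("s3://bucket/data.parquet", format="parquet")

# Inside a task, open() records the target dataframe type and all() invokes the registered decoder,
# as shown in the open() docstring:
#
#     @task
#     def t1(df: DataFrame):
#         import pandas as pd
#         df.open(pd.DataFrame).all()
```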
````diff
--- flyte/io/structured_dataset/structured_dataset.py
+++ flyte/io/_dataframe/dataframe.py
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
 import _datetime
-import asyncio
 import collections
 import types
 import typing
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import is_dataclass
 from typing import Any, ClassVar, Coroutine, Dict, Generic, List, Optional, Type, Union
 
-import msgpack
-from flyteidl.core import literals_pb2, types_pb2
+from flyteidl2.core import literals_pb2, types_pb2
 from fsspec.utils import get_protocol
-from mashumaro.mixins.json import DataClassJSONMixin
 from mashumaro.types import SerializableType
-from pydantic import model_serializer, model_validator
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_serializer, model_validator
 from typing_extensions import Annotated, TypeAlias, get_args, get_origin
 
 import flyte.storage as storage
@@ -35,58 +32,66 @@ else:
     pd = lazy_module("pandas")
     pa = lazy_module("pyarrow")
 
-T = typing.TypeVar("T")  # StructuredDataset type or a dataframe type
+T = typing.TypeVar("T")  # DataFrame type or a dataframe type
 DF = typing.TypeVar("DF")  # Dataframe type
 
-# For specifying the storage formats of StructuredDatasets. It's just a string, nothing fancy.
-StructuredDatasetFormat: TypeAlias = str
+# For specifying the storage formats of DataFrames. It's just a string, nothing fancy.
+DataFrameFormat: TypeAlias = str
 
 # Storage formats
-PARQUET: StructuredDatasetFormat = "parquet"
-CSV: StructuredDatasetFormat = "csv"
-GENERIC_FORMAT: StructuredDatasetFormat = ""
+PARQUET: DataFrameFormat = "parquet"
+CSV: DataFrameFormat = "csv"
+GENERIC_FORMAT: DataFrameFormat = ""
 GENERIC_PROTOCOL: str = "generic protocol"
 
 
-@dataclass
-class StructuredDataset(SerializableType, DataClassJSONMixin):
+class DataFrame(BaseModel, SerializableType):
     """
-    This is the user facing StructuredDataset class. Please don't confuse it with the literals.StructuredDataset
+    This is the user facing DataFrame class. Please don't confuse it with the literals.StructuredDataset
     class (that is just a model, a Python class representation of the protobuf).
     """
 
-    uri: typing.Optional[str] = field(default=None)
-    file_format: typing.Optional[str] = field(default=GENERIC_FORMAT)
+    uri: typing.Optional[str] = Field(default=None)
+    format: typing.Optional[str] = Field(default=GENERIC_FORMAT)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # Private attributes that are not part of the Pydantic model schema
+    _raw_df: typing.Optional[typing.Any] = PrivateAttr(default=None)
+    _metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = PrivateAttr(default=None)
+    _literal_sd: Optional[literals_pb2.StructuredDataset] = PrivateAttr(default=None)
+    _dataframe_type: Optional[Type[Any]] = PrivateAttr(default=None)
+    _already_uploaded: bool = PrivateAttr(default=False)
 
     # loop manager is working better than synchronicity for some reason, was getting an error but may be an easy fix
     def _serialize(self) -> Dict[str, Optional[str]]:
         # dataclass case
         lt = TypeEngine.to_literal_type(type(self))
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         lv = loop_manager.run_sync(engine.to_literal, self, type(self), lt)
-        sd = StructuredDataset(uri=lv.scalar.structured_dataset.uri)
-        sd.file_format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
+        sd = DataFrame(uri=lv.scalar.structured_dataset.uri)
+        sd.format = lv.scalar.structured_dataset.metadata.structured_dataset_type.format
         return {
             "uri": sd.uri,
-            "file_format": sd.file_format,
+            "format": sd.format,
         }
 
     @classmethod
-    def _deserialize(cls, value) -> StructuredDataset:
+    def _deserialize(cls, value) -> DataFrame:
         uri = value.get("uri", None)
-        file_format = value.get("file_format", None)
+        format_val = value.get("format", None)
 
         if uri is None:
-            raise ValueError("StructuredDataset's uri and file format should not be None")
+            raise ValueError("DataFrame's uri and file format should not be None")
 
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         return loop_manager.run_sync(
             engine.to_python_value,
             literals_pb2.Literal(
                 scalar=literals_pb2.Scalar(
                     structured_dataset=literals_pb2.StructuredDataset(
                         metadata=literals_pb2.StructuredDatasetMetadata(
-                            structured_dataset_type=types_pb2.StructuredDatasetType(format=file_format)
+                            structured_dataset_type=types_pb2.StructuredDatasetType(format=format_val)
                         ),
                         uri=uri,
                     )
@@ -96,28 +101,28 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         )
 
     @model_serializer
-    def serialize_structured_dataset(self) -> Dict[str, Optional[str]]:
+    def serialize_dataframe(self) -> Dict[str, Optional[str]]:
         lt = TypeEngine.to_literal_type(type(self))
-        sde = StructuredDatasetTransformerEngine()
+        sde = DataFrameTransformerEngine()
         lv = loop_manager.run_sync(sde.to_literal, self, type(self), lt)
         return {
             "uri": lv.scalar.structured_dataset.uri,
-            "file_format": lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
+            "format": lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
         }
 
     @model_validator(mode="after")
-    def deserialize_structured_dataset(self, info) -> StructuredDataset:
+    def deserialize_dataframe(self, info) -> DataFrame:
         if info.context is None or info.context.get("deserialize") is not True:
             return self
 
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         return loop_manager.run_sync(
             engine.to_python_value,
             literals_pb2.Literal(
                 scalar=literals_pb2.Scalar(
                     structured_dataset=literals_pb2.StructuredDataset(
                         metadata=literals_pb2.StructuredDatasetMetadata(
-                            structured_dataset_type=types_pb2.StructuredDatasetType(format=self.file_format)
+                            structured_dataset_type=types_pb2.StructuredDatasetType(format=self.format)
                         ),
                         uri=self.uri,
                     )
@@ -134,30 +139,46 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
     def column_names(cls) -> typing.List[str]:
         return [k for k, v in cls.columns().items()]
 
-    def __init__(
-        self,
-        dataframe: typing.Optional[typing.Any] = None,
+    @classmethod
+    def from_df(
+        cls,
+        val: typing.Optional[typing.Any] = None,
         uri: typing.Optional[str] = None,
-        metadata: typing.Optional[literals_pb2.StructuredDatasetMetadata] = None,
+    ) -> DataFrame:
+        """
+        Wrapper to create a DataFrame from a dataframe.
+        The reason this is implemented as a wrapper instead of a full translation invoking
+        the type engine and the encoders is because there's too much information in the type
+        signature of the task that we don't want the user to have to replicate.
+        """
+        instance = cls(uri=uri)
+        instance._raw_df = val
+        return instance
+
+    @classmethod
+    def from_existing_remote(
+        cls,
+        remote_path: str,
+        format: typing.Optional[str] = None,
         **kwargs,
-    ):
-        [13 lines: old __init__ body, truncated in the original rendering]
+    ) -> "DataFrame":
+        """
+        Create a DataFrame reference from an existing remote dataframe.
+
+        Args:
+            remote_path: The remote path to the existing dataframe
+            format: Format of the stored dataframe
+
+        Example:
+        ```python
+        df = DataFrame.from_existing_remote("s3://bucket/data.parquet", format="parquet")
+        ```
+        """
+        return cls(uri=remote_path, format=format or GENERIC_FORMAT, **kwargs)
 
     @property
-    def dataframe(self) -> Optional[DF]:
-        return self._dataframe
+    def val(self) -> Optional[DF]:
+        return self._raw_df
 
     @property
     def metadata(self) -> Optional[literals_pb2.StructuredDatasetMetadata]:
@@ -168,18 +189,18 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         return self._literal_sd
 
     def open(self, dataframe_type: Type[DF]):
-        from flyte.io.structured_dataset import lazy_import_structured_dataset_handler
-
         """
         Load the handler if needed. For the use case like:
         @task
-        def t1(sd: StructuredDataset):
+        def t1(df: DataFrame):
             import pandas as pd
-            sd.open(pd.DataFrame).all()
+            df.open(pd.DataFrame).all()
 
-        pandas is imported inside the task, so pandas handler won't be loaded during deserialization in type engine.
+        pandas is imported inside the task, so panda handler won't be loaded during deserialization in type engine.
         """
-        lazy_import_structured_dataset_handler()
+        from flyte.io._dataframe import lazy_import_dataframe_handler
+
+        lazy_import_dataframe_handler()
         self._dataframe_type = dataframe_type
         return self
 
@@ -187,22 +208,22 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
         if self._dataframe_type is None:
             raise ValueError("No dataframe type set. Use open() to set the local dataframe type you want to use.")
 
-        if self.uri is not None and self.dataframe is None:
-            expected = TypeEngine.to_literal_type(StructuredDataset)
+        if self.uri is not None and self.val is None:
+            expected = TypeEngine.to_literal_type(DataFrame)
             await self._set_literal(expected)
 
         return await flyte_dataset_transformer.open_as(self.literal, self._dataframe_type, self.metadata)
 
     async def _set_literal(self, expected: types_pb2.LiteralType) -> None:
         """
-        Explicitly set the StructuredDataset Literal to handle the following cases:
+        Explicitly set the DataFrame Literal to handle the following cases:
 
-        1. Read the content from a StructuredDataset with an uri, for example:
+        1. Read the content from a DataFrame with an uri, for example:
 
         @task
-        def return_sd() -> StructuredDataset:
-            sd = StructuredDataset(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", file_format="parquet")
-            df = sd.open(pd.DataFrame).all()
+        def return_df() -> DataFrame:
+            df = DataFrame(uri="s3://my-s3-bucket/s3_flyte_dir/df.parquet", format="parquet")
+            df = df.open(pd.DataFrame).all()
             return df
 
         For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5954.
@@ -212,14 +233,14 @@ class StructuredDataset(SerializableType, DataClassJSONMixin):
 
         For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
         """
-        to_literal = await flyte_dataset_transformer.to_literal(self, StructuredDataset, expected)
+        to_literal = await flyte_dataset_transformer.to_literal(self, DataFrame, expected)
         self._literal_sd = to_literal.scalar.structured_dataset
         if self.metadata is None:
             self._metadata = self._literal_sd.metadata
 
     async def set_literal(self, expected: types_pb2.LiteralType) -> None:
         """
-        A public wrapper method to set the StructuredDataset Literal.
+        A public wrapper method to set the DataFrame Literal.
 
         This method provides external access to the internal _set_literal method.
         """
@@ -244,6 +265,9 @@ def flatten_dict(sub_dict: dict, parent_key: str = "") -> typing.Dict:
             fields = getattr(value, "__dataclass_fields__")
             d = {k: v.type for k, v in fields.items()}
             result.update(flatten_dict(sub_dict=d, parent_key=current_key))
+        elif hasattr(value, "model_fields"):  # Pydantic model
+            d = {k: v.annotation for k, v in value.model_fields.items()}
+            result.update(flatten_dict(sub_dict=d, parent_key=current_key))
         else:
             result[current_key] = value
     return result
@@ -256,7 +280,7 @@ def extract_cols_and_format(
     Helper function, just used to iterate through Annotations and extract out the following information:
       - base type, if not Annotated, it will just be the type that was passed in.
      - column information, as a collections.OrderedDict,
-      - the storage format, as a ``StructuredDatasetFormat`` (str),
+      - the storage format, as a ``DataFrameFormat`` (str),
      - pa.lib.Schema
 
     If more than one of any type of thing is found, an error will be raised.
@@ -286,7 +310,7 @@ def extract_cols_and_format(
                 d = collections.OrderedDict()
                 d.update(aa)
                 ordered_dict_cols = d
-            elif isinstance(aa, StructuredDatasetFormat):
+            elif isinstance(aa, DataFrameFormat):
                 if fmt != "":
                     raise ValueError(f"A format was already specified {fmt}, cannot use {aa}")
                 fmt = aa
@@ -305,7 +329,7 @@ def extract_cols_and_format(
     return t, ordered_dict_cols, fmt, pa_schema
 
 
-class StructuredDatasetEncoder(ABC, Generic[T]):
+class DataFrameEncoder(ABC, Generic[T]):
     def __init__(
         self,
         python_type: Type[T],
@@ -314,10 +338,10 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
     ):
         """
         Extend this abstract class, implement the encode function, and register your concrete class with the
-        StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+        DataFrameTransformerEngine class in order for the core flytekit type engine to handle
         dataframe libraries. This is the encoding interface, meaning it is used when there is a Python value that the
         flytekit type engine is trying to convert into a Flyte Literal. For the other way, see
-        the StructuredDatasetEncoder
+        the DataFrameEncoder
 
         :param python_type: The dataframe class in question that you want to register this encoder with
         :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -347,7 +371,7 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
     @abstractmethod
     async def encode(
         self,
-        structured_dataset: StructuredDataset,
+        dataframe: DataFrame,
         structured_dataset_type: types_pb2.StructuredDatasetType,
     ) -> literals_pb2.StructuredDataset:
         """
@@ -357,20 +381,20 @@ class StructuredDatasetEncoder(ABC, Generic[T]):
        the
         # TODO: Do we need to add a flag to indicate if it was wrapped by the transformer or by the user?
 
-        :param structured_dataset: This is a StructuredDataset wrapper object. See more info above.
-        :param structured_dataset_type: This the StructuredDatasetType, as found in the LiteralType of the interface
+        :param dataframe: This is a DataFrame wrapper object. See more info above.
+        :param structured_dataset_type: This the DataFrameType, as found in the LiteralType of the interface
          of the task that invoked this encoding call. It is passed along to encoders so that authors of encoders
-          can include it in the returned literals.StructuredDataset. See the IDL for more information on why this
+          can include it in the returned literals.DataFrame. See the IDL for more information on why this
          literal in particular carries the type information along with it. If the encoder doesn't supply it, it will
          also be filled in after the encoder runs by the transformer engine.
-        :return: This function should return a StructuredDataset literal object. Do not confuse this with the
-          StructuredDataset wrapper class used as input to this function - that is the user facing Python class.
-          This function needs to return the IDL StructuredDataset.
+        :return: This function should return a DataFrame literal object. Do not confuse this with the
+          DataFrame wrapper class used as input to this function - that is the user facing Python class.
+          This function needs to return the IDL DataFrame.
         """
         raise NotImplementedError
 
 
-class StructuredDatasetDecoder(ABC, Generic[DF]):
+class DataFrameDecoder(ABC, Generic[DF]):
     def __init__(
         self,
         python_type: Type[DF],
@@ -380,9 +404,9 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
    ):
         """
         Extend this abstract class, implement the decode function, and register your concrete class with the
-        StructuredDatasetTransformerEngine class in order for the core flytekit type engine to handle
+        DataFrameTransformerEngine class in order for the core flytekit type engine to handle
         dataframe libraries. This is the decoder interface, meaning it is used when there is a Flyte Literal value,
-        and we have to get a Python value out of it. For the other way, see the StructuredDatasetEncoder
+        and we have to get a Python value out of it. For the other way, see the DataFrameEncoder
 
         :param python_type: The dataframe class in question that you want to register this decoder with
         :param protocol: A prefix representing the storage driver (e.g. 's3, 'gs', 'bq', etc.). You can use either
@@ -419,8 +443,8 @@ class StructuredDatasetDecoder(ABC, Generic[DF]):
         This is code that will be called by the dataset transformer engine to ultimately translate from a Flyte Literal
         value into a Python instance.
 
-        :param flyte_value: This will be a Flyte IDL StructuredDataset Literal - do not confuse this with the
-          StructuredDataset class defined also in this module.
+        :param flyte_value: This will be a Flyte IDL DataFrame Literal - do not confuse this with the
+          DataFrame class defined also in this module.
         :param current_task_metadata: Metadata object containing the type (and columns if any) for the currently
          executing task. This type may have more or less information than the type information bundled
          inside the incoming flyte_value.
@@ -459,19 +483,19 @@ def get_supported_types():
 class DuplicateHandlerError(ValueError): ...
 
 
-class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
+class DataFrameTransformerEngine(TypeTransformer[DataFrame]):
     """
     Think of this transformer as a higher-level meta transformer that is used for all the dataframe types.
     If you are bringing a custom data frame type, or any data frame type, to flytekit, instead of
     registering with the main type engine, you should register with this transformer instead.
     """
 
-    ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetEncoder]]]] = {}
-    DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, StructuredDatasetDecoder]]]] = {}
+    ENCODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameEncoder]]]] = {}
+    DECODERS: ClassVar[Dict[Type, Dict[str, Dict[str, DataFrameDecoder]]]] = {}
     DEFAULT_PROTOCOLS: ClassVar[Dict[Type, str]] = {}
     DEFAULT_FORMATS: ClassVar[Dict[Type, str]] = {}
 
-    Handlers = Union[StructuredDatasetEncoder, StructuredDatasetDecoder]
+    Handlers = Union[DataFrameEncoder, DataFrameDecoder]
     Renderers: ClassVar[Dict[Type, Renderable]] = {}
 
     @classmethod
@@ -527,17 +551,17 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
 
     @classmethod
     def get_encoder(cls, df_type: Type, protocol: str, format: str):
-        return cls._finder(StructuredDatasetTransformerEngine.ENCODERS, df_type, protocol, format)
+        return cls._finder(DataFrameTransformerEngine.ENCODERS, df_type, protocol, format)
 
     @classmethod
-    def get_decoder(cls, df_type: Type, protocol: str, format: str) -> StructuredDatasetDecoder:
-        return cls._finder(StructuredDatasetTransformerEngine.DECODERS, df_type, protocol, format)
+    def get_decoder(cls, df_type: Type, protocol: str, format: str) -> DataFrameDecoder:
+        return cls._finder(DataFrameTransformerEngine.DECODERS, df_type, protocol, format)
 
     @classmethod
     def _handler_finder(cls, h: Handlers, protocol: str) -> Dict[str, Handlers]:
-        if isinstance(h, StructuredDatasetEncoder):
+        if isinstance(h, DataFrameEncoder):
             top_level = cls.ENCODERS
-        elif isinstance(h, StructuredDatasetDecoder):
+        elif isinstance(h, DataFrameDecoder):
             top_level = cls.DECODERS  # type: ignore
         else:
             raise TypeError(f"We don't support this type of handler {h}")
@@ -548,7 +572,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         return top_level[h.python_type][protocol]  # type: ignore
 
     def __init__(self):
-        super().__init__("StructuredDataset Transformer", StructuredDataset)
+        super().__init__("DataFrame Transformer", DataFrame)
         self._type_assertions_enabled = False
 
     @classmethod
@@ -568,7 +592,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         Call this with any Encoder or Decoder to register it with the flytekit type system. If your handler does not
         specify a protocol (e.g. s3, gs, etc.) field, then
 
-        :param h: The StructuredDatasetEncoder or StructuredDatasetDecoder you wish to register with this transformer.
+        :param h: The DataFrameEncoder or DataFrameDecoder you wish to register with this transformer.
         :param default_for_type: If set, when a user returns from a task an instance of the dataframe the handler
          handles, e.g. ``return pd.DataFrame(...)``, not wrapped around the ``StructuredDataset`` object, we will
          use this handler's protocol and format as the default, effectively saying that this handler will be called.
@@ -582,7 +606,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         :param default_storage_for_type: Same as above but only for the storage format. Error if already set,
          unless override is specified.
         """
-        if not (isinstance(h, StructuredDatasetEncoder) or isinstance(h, StructuredDatasetDecoder)):
+        if not (isinstance(h, DataFrameEncoder) or isinstance(h, DataFrameDecoder)):
             raise TypeError(f"We don't support this type of handler {h}")
 
         if h.protocol is None:
@@ -623,17 +647,21 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                 f"Already registered a handler for {(h.python_type, protocol, h.supported_format)}"
             )
         lowest_level[h.supported_format] = h
-        logger.debug(f"Registered {h} as handler for {h.python_type}, protocol {protocol}, fmt {h.supported_format}")
+        logger.debug(
+            f"Registered {h.__class__.__name__} as handler for {h.python_type.__class__.__name__},"
+            f" protocol {protocol}, fmt {h.supported_format}"
+        )
 
         if (default_format_for_type or default_for_type) and h.supported_format != GENERIC_FORMAT:
             if h.python_type in cls.DEFAULT_FORMATS and not override:
                 if cls.DEFAULT_FORMATS[h.python_type] != h.supported_format:
                     logger.info(
-                        f"Not using handler {h} with format {h.supported_format}"
-                        f" as default for {h.python_type}, {cls.DEFAULT_FORMATS[h.python_type]} already specified."
+                        f"Not using handler {h.__class__.__name__} with format {h.supported_format}"
+                        f" as default for {h.python_type.__class__.__name__},"
+                        f" {cls.DEFAULT_FORMATS[h.python_type]} already specified."
                     )
                 else:
-                    logger.debug(f"Use {type(h).__name__} as default handler for {h.python_type}.")
+                    logger.debug(f"Use {type(h).__name__} as default handler for {h.python_type.__class__.__name__}.")
                     cls.DEFAULT_FORMATS[h.python_type] = h.supported_format
         if default_storage_for_type or default_for_type:
             if h.protocol in cls.DEFAULT_PROTOCOLS and not override:
@@ -648,27 +676,27 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         # Register with the type engine as well
         # The semantics as of now are such that it doesn't matter which order these transformers are loaded in, as
         # long as the older Pandas/FlyteSchema transformer do not also specify the override
-        engine = StructuredDatasetTransformerEngine()
+        engine = DataFrameTransformerEngine()
         TypeEngine.register_additional_type(engine, h.python_type, override=True)
 
-    def assert_type(self, t: Type[StructuredDataset], v: typing.Any):
+    def assert_type(self, t: Type[DataFrame], v: typing.Any):
         return
 
     async def to_literal(
         self,
-        python_val: Union[StructuredDataset, typing.Any],
-        python_type: Union[Type[StructuredDataset], Type],
+        python_val: Union[DataFrame, typing.Any],
+        python_type: Union[Type[DataFrame], Type],
         expected: types_pb2.LiteralType,
     ) -> literals_pb2.Literal:
         # Make a copy in case we need to hand off to encoders, since we can't be sure of mutations.
-        python_type, *attrs = extract_cols_and_format(python_type)
+        python_type, *_attrs = extract_cols_and_format(python_type)
         sdt = types_pb2.StructuredDatasetType(format=self.DEFAULT_FORMATS.get(python_type, GENERIC_FORMAT))
 
-        if issubclass(python_type, StructuredDataset) and not isinstance(python_val, StructuredDataset):
+        if issubclass(python_type, DataFrame) and not isinstance(python_val, DataFrame):
             # Catch a common mistake
             raise TypeTransformerFailedError(
-                f"Expected a StructuredDataset instance, but got {type(python_val)} instead."
-                f" Did you forget to wrap your dataframe in a StructuredDataset instance?"
+                f"Expected a DataFrame instance, but got {type(python_val)} instead."
+                f" Did you forget to wrap your dataframe in a DataFrame instance?"
            )
 
         if expected and expected.structured_dataset_type:
@@ -679,60 +707,60 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                 external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
             )
 
-        # If the type signature has the StructuredDataset class, it will, or at least should, also be a
-        # StructuredDataset instance.
-        if isinstance(python_val, StructuredDataset):
+        # If the type signature has the DataFrame class, it will, or at least should, also be a
+        # DataFrame instance.
+        if isinstance(python_val, DataFrame):
             # There are three cases that we need to take care of here.
 
-            # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
-            # then return the original literals.StructuredDataset without invoking any encoder
+            # 1. A task returns a DataFrame that was just a passthrough input. If this happens
+            # then return the original literals.DataFrame without invoking any encoder
             #
             # Ex.
-            # def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
+            # def t1(dataset: Annotated[DataFrame, my_cols]) -> Annotated[DataFrame, my_cols]:
             #     return dataset
             if python_val._literal_sd is not None:
                 if python_val._already_uploaded:
                     return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
-                if python_val.dataframe is not None:
+                if python_val.val is not None:
                     raise ValueError(
-                        f"Shouldn't have specified both literal {python_val._literal_sd}"
-                        f" and dataframe {python_val.dataframe}"
+                        f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.val}"
                    )
                 return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=python_val._literal_sd))
 
-            # 2. A task returns a python StructuredDataset with an uri.
-            # Note: this case is also what happens we start a local execution of a task with a python StructuredDataset.
-            # It gets converted into a literal first, then back into a python StructuredDataset.
+            # 2. A task returns a python DataFrame with an uri.
+            # Note: this case is also what happens we start a local execution of a task with a python DataFrame.
+            # It gets converted into a literal first, then back into a python DataFrame.
             #
             # Ex.
-            # def t2(uri: str) -> Annotated[StructuredDataset, my_cols]
-            #     return StructuredDataset(uri=uri)
-            if python_val.dataframe is None:
+            # def t2(uri: str) -> Annotated[DataFrame, my_cols]
+            #     return DataFrame(uri=uri)
+            if python_val.val is None:
                 uri = python_val.uri
-                file_format = python_val.file_format
+                format_val = python_val.format
 
                 # Check the user-specified uri
                 if not uri:
                     raise ValueError(f"If dataframe is not specified, then the uri should be specified. {python_val}")
                 if not storage.is_remote(uri):
-                    uri = await storage.put(uri)
+                    uri = await storage.put(uri, recursive=True)
 
-                # Check the user-specified file_format
-                # When users specify file_format for a StructuredDataset, the file_format should be retained
+                # Check the user-specified format
+                # When users specify format for a DataFrame, the format should be retained
                 # conditionally. For details, please refer to https://github.com/flyteorg/flyte/issues/6096.
                 # Following illustrates why we can't always copy the user-specified file_format over:
                 #
                 # @task
-                # def modify_format(sd: Annotated[StructuredDataset, {}, "task-format"]) -> StructuredDataset:
-                #     return sd
+                # def modify_format(df: Annotated[DataFrame, {}, "task-format"]) -> DataFrame:
+                #     return df
                 #
-                # sd = StructuredDataset(uri="s3://my-s3-bucket/df.parquet", file_format="user-format")
-                # sd2 = modify_format(sd=sd)
+                # df = DataFrame(uri="s3://my-s3-bucket/df.parquet", format="user-format")
+                # df2 = modify_format(df=df)
                 #
-                # In this case, we expect the sd2.file_format to be task-format (as shown in Annotated),
-                # If we directly copy the user-specified file_format over, the type hint information will be missing.
-                if sdt.format == GENERIC_FORMAT and file_format != GENERIC_FORMAT:
-                    sdt.format = file_format
+                # In this case, we expect the df2.format to be task-format (as shown in Annotated),
+                # not user-format. If we directly copy the user-specified format over,
+                # the type hint information will be missing.
+                if sdt.format == GENERIC_FORMAT and format_val != GENERIC_FORMAT:
+                    sdt.format = format_val
 
                 sd_model = literals_pb2.StructuredDataset(
                     uri=uri,
@@ -740,9 +768,9 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
                )
                 return literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))
 
-            # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
+            # 3. This is the third and probably most common case. The python DataFrame object wraps a dataframe
             # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
-            df_type = type(python_val.dataframe)
+            df_type = type(python_val.val)
             protocol = self._protocol_from_type_or_prefix(df_type, python_val.uri)
 
             return await self.encode(
@@ -760,8 +788,9 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             structured_dataset_type=expected.structured_dataset_type if expected else None
         )
 
-        sd = StructuredDataset(dataframe=python_val, metadata=meta)
-        return await self.encode(sd, python_type, protocol, fmt, sdt)
+        fdf = DataFrame.from_df(val=python_val)
+        fdf._metadata = meta
+        return await self.encode(fdf, python_type, protocol, fmt, sdt)
 
     def _protocol_from_type_or_prefix(self, df_type: Type, uri: Optional[str] = None) -> str:
         """
@@ -782,16 +811,16 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
 
     async def encode(
         self,
-        sd: StructuredDataset,
+        df: DataFrame,
         df_type: Type,
         protocol: str,
         format: str,
         structured_literal_type: types_pb2.StructuredDatasetType,
     ) -> literals_pb2.Literal:
-        handler: StructuredDatasetEncoder
+        handler: DataFrameEncoder
        handler = self.get_encoder(df_type, protocol, format)
 
-        sd_model = await handler.encode(sd, structured_literal_type)
+        sd_model = await handler.encode(df, structured_literal_type)
         # This block is here in case the encoder did not set the type information in the metadata. Since this literal
         # is special in that it carries around the type itself, we want to make sure the type info therein is at
         # least as good as the type of the interface.
@@ -807,75 +836,16 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         lit = literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_model))
 
         # Because the handler.encode may have uploaded something, and because the sd may end up living inside a
-        # dataclass, we need to modify any uploaded flyte:// urls here.
-        modify_literal_uris(lit)
-        sd._literal_sd = sd_model
-        sd._already_uploaded = True
+        # dataclass, we need to modify any uploaded flyte:// urls here. Needed here even though the Type engine
+        # already does this because the DataframeTransformerEngine may be called directly.
+        modify_literal_uris(lit)
+        df._literal_sd = sd_model
+        df._already_uploaded = True
         return lit
 
-    # pr: han-ru: can this be removed if we make StructuredDataset a pydantic model?
-    def dict_to_structured_dataset(
-        self, dict_obj: typing.Dict[str, str], expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
-        uri = dict_obj.get("uri", None)
-        file_format = dict_obj.get("file_format", None)
-
-        if uri is None:
-            raise ValueError("StructuredDataset's uri and file format should not be None")
-
-        # Instead of using python native StructuredDataset, we need to build a literals.StructuredDataset
-        # The reason is that _literal_sd of python sd is accessed when task output LiteralMap is
-        # converted back to flyteidl. Hence, _literal_sd must have to_flyte_idl method
-        # See https://github.com/flyteorg/flytekit/blob/f938661ff8413219d1bea77f6914a58c302d5c6c/flytekit/bin/entrypoint.py#L326
-        # For details, please refer to this issue: https://github.com/flyteorg/flyte/issues/5956.
-        sdt = types_pb2.StructuredDatasetType(format=file_format)
-        metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=sdt)
-        sd_literal = literals_pb2.StructuredDataset(uri=uri, metadata=metad)
-
-        return asyncio.run(
-            StructuredDatasetTransformerEngine().to_python_value(
-                literals_pb2.Literal(scalar=literals_pb2.Scalar(structured_dataset=sd_literal)),
-                expected_python_type,
-            )
-        )
-
-    def from_binary_idl(
-        self, binary_idl_object: literals_pb2.Binary, expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
-        """
-        If the input is from flytekit, the Life Cycle will be as follows:
-
-        Life Cycle:
-        binary IDL                 -> resolved binary       -> bytes                 -> expected Python object
-        (flytekit customized          (propeller processing)   (flytekit binary IDL)    (flytekit customized
-        serialization)                                                                   deserialization)
-
-        Example Code:
-        @dataclass
-        class DC:
-            sd: StructuredDataset
-
-        @workflow
-        def wf(dc: DC):
-            t_sd(dc.sd)
-
-        Note:
-        - The deserialization is the same as put a structured dataset in a dataclass,
-          which will deserialize by the mashumaro's API.
-
-        Related PR:
-        - Title: Override Dataclass Serialization/Deserialization Behavior for FlyteTypes via Mashumaro
-        - Link: https://github.com/flyteorg/flytekit/pull/2554
-        """
-        if binary_idl_object.tag == MESSAGEPACK:
-            python_val = msgpack.loads(binary_idl_object.value)
-            return self.dict_to_structured_dataset(dict_obj=python_val, expected_python_type=expected_python_type)
-        else:
-            raise TypeTransformerFailedError(f"Unsupported binary format: `{binary_idl_object.tag}`")
-
     async def to_python_value(
-        self, lv: literals_pb2.Literal, expected_python_type: Type[T] | StructuredDataset
-    ) -> T | StructuredDataset:
+        self, lv: literals_pb2.Literal, expected_python_type: Type[T] | DataFrame
+    ) -> T | DataFrame:
         """
         The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
         the start of a task execution, is the column subsetting behavior. For example, if you have,
@@ -906,14 +876,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         |                             | the running task's signature.           |                                      |
         +-----------------------------+-----------------------------------------+--------------------------------------+
         """
-        # Handle dataclass attribute access
         if lv.HasField("scalar") and lv.scalar.HasField("binary"):
-            return self.from_binary_idl(lv.scalar.binary, expected_python_type)
+            raise TypeTransformerFailedError("Attribute access unsupported.")
 
         # Detect annotations and extract out all the relevant information that the user might supply
-        expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)
+        expected_python_type, column_dict, _storage_fmt, _pa_schema = extract_cols_and_format(expected_python_type)
 
-        # Start handling for StructuredDataset scalars, first look at the columns
+        # Start handling for DataFrame scalars, first look at the columns
         incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns
 
         # If the incoming literal, also doesn't have columns, then we just have an empty list, so initialize here
@@ -935,30 +904,27 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
        )
         metad = literals_pb2.StructuredDatasetMetadata(structured_dataset_type=new_sdt)
 
-        # A StructuredDataset type, for example
-        # t1(input_a: StructuredDataset)  # or
-        # t1(input_a: Annotated[StructuredDataset, my_cols])
-        if issubclass(expected_python_type, StructuredDataset):
-            [7 lines: old StructuredDataset construction, truncated in the original rendering]
-            return sd
-
-        # If the requested type was not a StructuredDataset, then it means it was a plain dataframe type, which means
+        # A DataFrame type, for example
+        # t1(input_a: DataFrame)  # or
+        # t1(input_a: Annotated[DataFrame, my_cols])
+        if issubclass(expected_python_type, DataFrame):
+            fdf = DataFrame(format=metad.structured_dataset_type.format, uri=lv.scalar.structured_dataset.uri)
+            fdf._already_uploaded = True
+            fdf._literal_sd = lv.scalar.structured_dataset
+            fdf._metadata = metad
+            return fdf
+
+        # If the requested type was not a flyte.DataFrame, then it means it was a raw dataframe type, which means
         # we should do the opening/downloading and whatever else it might entail right now. No iteration option here.
         return await self.open_as(lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)
 
     def to_html(self, python_val: typing.Any, expected_python_type: Type[T]) -> str:
-        if isinstance(python_val, StructuredDataset):
-            if python_val.dataframe is not None:
-                df = python_val.dataframe
+        if isinstance(python_val, DataFrame):
+            if python_val.val is not None:
+                df = python_val.val
             else:
                 # Here we only render column information by default instead of opening the structured dataset.
-                col = typing.cast(StructuredDataset, python_val).columns()
+                col = typing.cast(DataFrame, python_val).columns()
                 dataframe = pd.DataFrame(col, ["column type"])
                 return dataframe.to_html()  # type: ignore
         else:
@@ -1004,11 +970,12 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
     def _get_dataset_column_literal_type(self, t: Type) -> types_pb2.LiteralType:
         if t in get_supported_types():
             return get_supported_types()[t]
-        if hasattr(t, "__origin__") and t.__origin__ == list:
+        origin = getattr(t, "__origin__", None)
+        if origin is list:
             return types_pb2.LiteralType(collection_type=self._get_dataset_column_literal_type(t.__args__[0]))
-        if hasattr(t, "__origin__") and t.__origin__ == dict:
+        if origin is dict:
             return types_pb2.LiteralType(map_value_type=self._get_dataset_column_literal_type(t.__args__[1]))
-        raise AssertionError(f"type {t} is currently not supported by StructuredDataset")
+        raise AssertionError(f"type {t} is currently not supported by DataFrame")
 
     def _convert_ordered_dict_of_columns_to_list(
         self, column_map: typing.Optional[typing.OrderedDict[str, Type]]
@@ -1022,10 +989,8 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             converted_cols.append(types_pb2.StructuredDatasetType.DatasetColumn(name=k, literal_type=lt))
         return converted_cols
 
-    def _get_dataset_type(
-        self, t: typing.Union[Type[StructuredDataset], typing.Any]
-    ) -> types_pb2.StructuredDatasetType:
-        original_python_type, column_map, storage_format, pa_schema = extract_cols_and_format(t)  # type: ignore
+    def _get_dataset_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.StructuredDatasetType:
+        _original_python_type, column_map, storage_format, pa_schema = extract_cols_and_format(t)  # type: ignore
 
         # Get the column information
         converted_cols: typing.List[types_pb2.StructuredDatasetType.DatasetColumn] = (
@@ -1039,7 +1004,7 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
             external_schema_bytes=typing.cast(pa.lib.Schema, pa_schema).to_string().encode() if pa_schema else None,
        )
 
-    def get_literal_type(self, t: typing.Union[Type[StructuredDataset], typing.Any]) -> types_pb2.LiteralType:
+    def get_literal_type(self, t: typing.Union[Type[DataFrame], typing.Any]) -> types_pb2.LiteralType:
         """
         Provide a concrete implementation so that writers of custom dataframe handlers since there's nothing that
         special about the literal type. Any dataframe type will always be associated with the structured dataset type.
@@ -1049,13 +1014,13 @@ class StructuredDatasetTransformerEngine(TypeTransformer[StructuredDataset]):
         """
         return types_pb2.LiteralType(structured_dataset_type=self._get_dataset_type(t))
 
-    def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[StructuredDataset]:
+    def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[DataFrame]:
         # todo: technically we should return the dataframe type specified in the constructor, but to do that,
         # we'd have to store that, which we don't do today. See possibly #1363
         if literal_type.HasField("structured_dataset_type"):
-            return StructuredDataset
-        raise ValueError(f"StructuredDatasetTransformerEngine cannot reverse {literal_type}")
+            return DataFrame
+        raise ValueError(f"DataFrameTransformerEngine cannot reverse {literal_type}")
 
 
-flyte_dataset_transformer = StructuredDatasetTransformerEngine()
+flyte_dataset_transformer = DataFrameTransformerEngine()
 TypeEngine.register(flyte_dataset_transformer)
````