dagster-azure 0.13.19__tar.gz → 0.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/LICENSE +1 -1
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/MANIFEST.in +1 -0
- dagster_azure-0.28.1/PKG-INFO +32 -0
- dagster_azure-0.28.1/README.md +4 -0
- dagster_azure-0.28.1/dagster_azure/__init__.py +5 -0
- dagster_azure-0.28.1/dagster_azure/adls2/__init__.py +19 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure/adls2/file_manager.py +30 -28
- dagster_azure-0.28.1/dagster_azure/adls2/io_manager.py +314 -0
- dagster_azure-0.28.1/dagster_azure/adls2/resources.py +262 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure/adls2/utils.py +12 -9
- dagster_azure-0.28.1/dagster_azure/blob/__init__.py +12 -0
- dagster_azure-0.28.1/dagster_azure/blob/compute_log_manager.py +391 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure/blob/fake_blob_client.py +40 -21
- dagster_azure-0.28.1/dagster_azure/blob/resources.py +126 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure/blob/utils.py +12 -11
- dagster_azure-0.28.1/dagster_azure/fakes/__init__.py +5 -0
- {dagster-azure-0.13.19/dagster_azure/adls2 → dagster_azure-0.28.1/dagster_azure/fakes}/fake_adls2_resource.py +81 -28
- dagster_azure-0.28.1/dagster_azure/pipes/__init__.py +9 -0
- dagster_azure-0.28.1/dagster_azure/pipes/clients/__init__.py +5 -0
- dagster_azure-0.28.1/dagster_azure/pipes/clients/azureml.py +140 -0
- dagster_azure-0.28.1/dagster_azure/pipes/context_injectors.py +47 -0
- dagster_azure-0.28.1/dagster_azure/pipes/message_readers.py +83 -0
- dagster_azure-0.28.1/dagster_azure/py.typed +1 -0
- dagster_azure-0.28.1/dagster_azure/version.py +1 -0
- dagster_azure-0.28.1/dagster_azure.egg-info/PKG-INFO +32 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure.egg-info/SOURCES.txt +9 -9
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure.egg-info/entry_points.txt +0 -1
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure.egg-info/requires.txt +3 -1
- dagster_azure-0.28.1/dagster_azure.egg-info/top_level.txt +1 -0
- dagster_azure-0.28.1/setup.py +47 -0
- dagster-azure-0.13.19/PKG-INFO +0 -15
- dagster-azure-0.13.19/README.md +0 -4
- dagster-azure-0.13.19/dagster_azure/__init__.py +0 -5
- dagster-azure-0.13.19/dagster_azure/adls2/__init__.py +0 -6
- dagster-azure-0.13.19/dagster_azure/adls2/file_cache.py +0 -74
- dagster-azure-0.13.19/dagster_azure/adls2/io_manager.py +0 -133
- dagster-azure-0.13.19/dagster_azure/adls2/resources.py +0 -122
- dagster-azure-0.13.19/dagster_azure/blob/__init__.py +0 -3
- dagster-azure-0.13.19/dagster_azure/blob/compute_log_manager.py +0 -205
- dagster-azure-0.13.19/dagster_azure/version.py +0 -1
- dagster-azure-0.13.19/dagster_azure.egg-info/PKG-INFO +0 -15
- dagster-azure-0.13.19/dagster_azure.egg-info/top_level.txt +0 -2
- dagster-azure-0.13.19/dagster_azure_tests/__init__.py +0 -0
- dagster-azure-0.13.19/dagster_azure_tests/adls2_tests/__init__.py +0 -0
- dagster-azure-0.13.19/dagster_azure_tests/adls2_tests/conftest.py +0 -18
- dagster-azure-0.13.19/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py +0 -60
- dagster-azure-0.13.19/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py +0 -190
- dagster-azure-0.13.19/dagster_azure_tests/adls2_tests/test_io_manager.py +0 -142
- dagster-azure-0.13.19/dagster_azure_tests/test_version.py +0 -5
- dagster-azure-0.13.19/setup.py +0 -43
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure.egg-info/dependency_links.txt +0 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure.egg-info/not-zip-safe +0 -0
- {dagster-azure-0.13.19 → dagster_azure-0.28.1}/setup.cfg +0 -0

{dagster-azure-0.13.19 → dagster_azure-0.28.1}/LICENSE (+1 -1):

```diff
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright
+   Copyright 2025 Dagster Labs, Inc.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
```

dagster_azure-0.28.1/PKG-INFO (new file, +32 lines):

```
Metadata-Version: 2.4
Name: dagster-azure
Version: 0.28.1
Summary: Package for Azure-specific Dagster framework op and resource components.
Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-azure
Author: Dagster Labs
Author-email: hello@dagsterlabs.com
License: Apache-2.0
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9,<3.14
License-File: LICENSE
Requires-Dist: azure-core<2.0.0,>=1.7.0
Requires-Dist: azure-identity<2.0.0,>=1.7.0
Requires-Dist: azure-ai-ml<2.0.0,>=1.28.0
Requires-Dist: azure-storage-blob<13.0.0,>=12.5.0
Requires-Dist: azure-storage-file-datalake<13.0.0,>=12.5
Requires-Dist: dagster==1.12.1
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
```
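
The metadata pins `dagster==1.12.1` exactly, so `dagster-azure` and `dagster` are released in lockstep. A quick sanity check of an installed environment, as a sketch using only the standard library (the expected versions come from the metadata above):

```python
from importlib.metadata import version

# dagster-azure 0.28.1 declares Requires-Dist: dagster==1.12.1, so the two
# installed versions should match the pair shown in this diff.
print(version("dagster-azure"))  # expected: 0.28.1
print(version("dagster"))        # expected: 1.12.1
```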

dagster_azure-0.28.1/dagster_azure/adls2/__init__.py (new file, +19 lines):

```python
from dagster_azure.adls2.file_manager import (
    ADLS2FileHandle as ADLS2FileHandle,
    ADLS2FileManager as ADLS2FileManager,
)
from dagster_azure.adls2.io_manager import (
    ADLS2PickleIOManager as ADLS2PickleIOManager,
    ConfigurablePickledObjectADLS2IOManager as ConfigurablePickledObjectADLS2IOManager,
    PickledObjectADLS2IOManager as PickledObjectADLS2IOManager,
    adls2_pickle_io_manager as adls2_pickle_io_manager,
)
from dagster_azure.adls2.resources import (
    ADLS2DefaultAzureCredential as ADLS2DefaultAzureCredential,
    ADLS2Key as ADLS2Key,
    ADLS2Resource as ADLS2Resource,
    ADLS2SASToken as ADLS2SASToken,
    adls2_file_manager as adls2_file_manager,
    adls2_resource as adls2_resource,
)
from dagster_azure.adls2.utils import create_adls2_client as create_adls2_client
```
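
Every symbol is re-exported in the `name as name` form, which marks it as an explicit public export for type checkers. A minimal sketch of consuming this surface follows; the storage account, key, and token are placeholders, and it assumes `ADLS2Key` takes its secret via a `key` field:

```python
from dagster_azure.adls2 import ADLS2Key, ADLS2Resource, ADLS2SASToken

# Placeholder credentials -- substitute your own values.
with_key = ADLS2Resource(
    storage_account="my-storage-account",
    credential=ADLS2Key(key="my-storage-key"),
)
with_sas = ADLS2Resource(
    storage_account="my-storage-account",
    credential=ADLS2SASToken(token="my-sas-token"),
)
```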

{dagster-azure-0.13.19 → dagster_azure-0.28.1}/dagster_azure/adls2/file_manager.py (+30 -28):

```diff
@@ -1,17 +1,19 @@
 import io
 import uuid
 from contextlib import contextmanager
+from typing import Any, Optional
 
-
-from dagster.
+import dagster._check as check
+from dagster._core.storage.file_manager import (
     FileHandle,
     FileManager,
     TempfileManager,
     check_file_like_obj,
 )
 
+from dagster_azure.adls2.utils import DataLakeServiceClient
+
 
-@usable_as_dagster_type
 class ADLS2FileHandle(FileHandle):
     """A reference to a file on ADLS2."""
 
@@ -21,48 +23,44 @@ class ADLS2FileHandle(FileHandle):
         self._key = check.str_param(key, "key")
 
     @property
-    def account(self):
+    def account(self) -> str:
         """str: The name of the ADLS2 account."""
         return self._account
 
     @property
-    def file_system(self):
+    def file_system(self) -> str:
         """str: The name of the ADLS2 file system."""
         return self._file_system
 
     @property
-    def key(self):
+    def key(self) -> str:
         """str: The ADLS2 key."""
         return self._key
 
     @property
-    def path_desc(self):
+    def path_desc(self) -> str:
         """str: The file's ADLS2 URL."""
         return self.adls2_path
 
     @property
-    def adls2_path(self):
+    def adls2_path(self) -> str:
         """str: The file's ADLS2 URL."""
-        return "adfss://{file_system}@{account}.dfs.core.windows.net/{key}"
-            file_system=self.file_system,
-            account=self.account,
-            key=self.key,
-        )
+        return f"adfss://{self.file_system}@{self.account}.dfs.core.windows.net/{self.key}"
 
 
 class ADLS2FileManager(FileManager):
-    def __init__(self, adls2_client, file_system, prefix):
+    def __init__(self, adls2_client: DataLakeServiceClient, file_system: str, prefix: str):
         self._client = adls2_client
         self._file_system = check.str_param(file_system, "file_system")
         self._prefix = check.str_param(prefix, "prefix")
-        self._local_handle_cache = {}
+        self._local_handle_cache: dict[str, str] = {}
         self._temp_file_manager = TempfileManager()
 
-    def copy_handle_to_local_temp(self, file_handle):
+    def copy_handle_to_local_temp(self, file_handle: ADLS2FileHandle):  # pyright: ignore[reportIncompatibleMethodOverride]
         self._download_if_not_cached(file_handle)
         return self._get_local_path(file_handle)
 
-    def _download_if_not_cached(self, file_handle):
+    def _download_if_not_cached(self, file_handle: ADLS2FileHandle):
         if not self._file_handle_cached(file_handle):
             # instigate download
             temp_file_obj = self._temp_file_manager.tempfile()
@@ -79,41 +77,45 @@ class ADLS2FileManager(FileManager):
         return file_handle
 
     @contextmanager
-    def read(self, file_handle, mode="rb"):
+    def read(self, file_handle: ADLS2FileHandle, mode: str = "rb"):  # pyright: ignore[reportIncompatibleMethodOverride]
         check.inst_param(file_handle, "file_handle", ADLS2FileHandle)
         check.str_param(mode, "mode")
         check.param_invariant(mode in {"r", "rb"}, "mode")
 
         self._download_if_not_cached(file_handle)
 
-
+        encoding = None if "b" in mode else "utf-8"
+        with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:
             yield file_obj
 
-    def _file_handle_cached(self, file_handle):
+    def _file_handle_cached(self, file_handle: ADLS2FileHandle) -> bool:
         return file_handle.adls2_path in self._local_handle_cache
 
-    def _get_local_path(self, file_handle):
+    def _get_local_path(self, file_handle: ADLS2FileHandle) -> str:
         return self._local_handle_cache[file_handle.adls2_path]
 
-    def read_data(self, file_handle):
+    def read_data(self, file_handle: ADLS2FileHandle) -> Any:  # pyright: ignore[reportIncompatibleMethodOverride]
         with self.read(file_handle, mode="rb") as file_obj:
             return file_obj.read()
 
-    def write_data(self, data, ext=None):
+    def write_data(self, data: bytes, ext: Optional[str] = None) -> ADLS2FileHandle:
         check.inst_param(data, "data", bytes)
         return self.write(io.BytesIO(data), mode="wb", ext=ext)
 
-    def write(
+    def write(  # pyright: ignore[reportIncompatibleMethodOverride]
+        self, file_obj: io.BytesIO, mode: str = "wb", ext: Optional[str] = None
+    ) -> ADLS2FileHandle:
         check_file_like_obj(file_obj)
         adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." + ext) if ext is not None else ""))
         adls2_file = self._client.get_file_client(
             file_system=self._file_system, file_path=adls2_key
         )
         adls2_file.upload_data(file_obj, overwrite=True)
-
+        account_name = check.not_none(self._client.account_name, "Expected account name to be set")
+        return ADLS2FileHandle(account_name, self._file_system, adls2_key)
 
-    def get_full_key(self, file_key):
-        return "{
+    def get_full_key(self, file_key: str) -> str:
+        return f"{self._prefix}/{file_key}"
 
-    def delete_local_temp(self):
+    def delete_local_temp(self) -> None:
         self._temp_file_manager.close()
```
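
With the added type hints, the file manager's contract is explicit: `write_data` takes `bytes` and returns an `ADLS2FileHandle`, `read_data` returns the stored bytes, and `delete_local_temp` cleans up local temp copies. A round-trip sketch under assumed credentials follows; it needs real connectivity to the storage account, the account, file system, and token are placeholders, and it assumes the resource's `adls2_client` property can be used directly to obtain the underlying `DataLakeServiceClient`:

```python
from dagster_azure.adls2 import ADLS2FileManager, ADLS2Resource, ADLS2SASToken

# Placeholder credentials -- substitute real values.
adls2 = ADLS2Resource(
    storage_account="my-storage-account",
    credential=ADLS2SASToken(token="my-sas-token"),
)

file_manager = ADLS2FileManager(
    adls2_client=adls2.adls2_client,  # DataLakeServiceClient
    file_system="my-cool-fs",         # ADLS Gen2 file system (container) name
    prefix="my-cool-prefix",          # keys are written as "<prefix>/<uuid>[.<ext>]"
)

handle = file_manager.write_data(b"hello", ext="txt")  # returns ADLS2FileHandle
print(handle.adls2_path)                               # URL built by ADLS2FileHandle.adls2_path
assert file_manager.read_data(handle) == b"hello"      # downloads to a local temp file and reads it
file_manager.delete_local_temp()                       # remove cached local temp files
```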

dagster_azure-0.28.1/dagster_azure/adls2/io_manager.py (new file, +314 lines):

```python
import pickle
from collections.abc import Iterator
from contextlib import contextmanager
from typing import Any, Union

from dagster import (
    InputContext,
    OutputContext,
    ResourceDependency,
    _check as check,
    io_manager,
)
from dagster._annotations import deprecated
from dagster._config.pythonic_config import ConfigurableIOManager
from dagster._core.execution.context.init import InitResourceContext
from dagster._core.storage.io_manager import dagster_maintained_io_manager
from dagster._core.storage.upath_io_manager import UPathIOManager
from dagster._utils import PICKLE_PROTOCOL
from dagster._utils.cached_method import cached_method
from pydantic import Field
from upath import UPath

from dagster_azure.adls2.resources import ADLS2Resource
from dagster_azure.adls2.utils import (
    DataLakeLeaseClient,
    DataLakeServiceClient,
    ResourceNotFoundError,
)
from dagster_azure.blob.utils import BlobLeaseClient, BlobServiceClient


class PickledObjectADLS2IOManager(UPathIOManager):
    def __init__(
        self,
        file_system: str,
        adls2_client: DataLakeServiceClient,
        blob_client: BlobServiceClient,
        lease_client_constructor: Union[type[DataLakeLeaseClient], type[BlobLeaseClient]],
        prefix: str = "dagster",
        lease_duration: int = 60,
    ):
        if lease_duration != -1 and (lease_duration < 15 or lease_duration > 60):
            raise ValueError("lease_duration must be -1 (unlimited) or between 15 and 60")

        self.adls2_client = adls2_client
        self.file_system_client = self.adls2_client.get_file_system_client(file_system)
        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet
        self.blob_client = blob_client
        self.blob_container_client = self.blob_client.get_container_client(file_system)
        self.prefix = check.str_param(prefix, "prefix")

        self.lease_client_constructor = lease_client_constructor
        self.lease_duration = lease_duration
        self.file_system_client.get_file_system_properties()
        super().__init__(base_path=UPath(self.prefix))

    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:
        parts = context.get_identifier()
        run_id = parts[0]
        output_parts = parts[1:]
        return UPath("storage", run_id, "files", *output_parts)

    def get_loading_input_log_message(self, path: UPath) -> str:
        return f"Loading ADLS2 object from: {self._uri_for_path(path)}"

    def get_writing_output_log_message(self, path: UPath) -> str:
        return f"Writing ADLS2 object at: {self._uri_for_path(path)}"

    def unlink(self, path: UPath) -> None:
        file_client = self.file_system_client.get_file_client(path.as_posix())
        with self._acquire_lease(file_client, is_rm=True) as lease:
            file_client.delete_file(lease=lease, recursive=True)

    def make_directory(self, path: UPath) -> None:
        # It is not necessary to create directories in ADLS2
        return None

    def path_exists(self, path: UPath) -> bool:
        try:
            self.file_system_client.get_file_client(path.as_posix()).get_file_properties()
        except ResourceNotFoundError:
            return False
        return True

    def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str:
        return f"{protocol}{self.file_system_client.file_system_name}@{self.file_system_client.account_name}.dfs.core.windows.net/{path.as_posix()}"

    @contextmanager
    def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]:
        lease_client = self.lease_client_constructor(client=client)
        try:
            # Unclear why this needs to be type-ignored
            lease_client.acquire(lease_duration=self.lease_duration)
            yield lease_client.id
        finally:
            # cannot release a lease on a file that no longer exists, so need to check
            if not is_rm:
                lease_client.release()

    def load_from_path(self, context: InputContext, path: UPath) -> Any:
        if context.dagster_type.typing_type == type(None):
            return None
        file = self.file_system_client.get_file_client(path.as_posix())
        stream = file.download_file()
        return pickle.loads(stream.readall())

    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:
        if self.path_exists(path):
            context.log.warning(f"Removing existing ADLS2 key: {path}")
            self.unlink(path)

        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)
        file = self.file_system_client.create_file(path.as_posix())
        with self._acquire_lease(file) as lease:
            file.upload_data(pickled_obj, lease=lease, overwrite=True)


class ADLS2PickleIOManager(ConfigurableIOManager):
    """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

    Serializes objects via pickling. Suitable for objects storage for distributed executors, so long
    as each execution node has network connectivity and credentials for ADLS and the backing
    container.

    Assigns each op output to a unique filepath containing run ID, step key, and output name.
    Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key
    has multiple components, the final component is used as the name of the file, and the preceding
    components as parent directories under the base_dir.

    Subsequent materializations of an asset will overwrite previous materializations of that asset.
    With a base directory of "/my/base/path", an asset with key
    `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory
    with path "/my/base/path/one/two/".

    Example usage:

    1. Attach this IO manager to a set of assets.

    .. code-block:: python

        from dagster import Definitions, asset
        from dagster_azure.adls2 import ADLS2PickleIOManager, ADLS2Resource, ADLS2SASToken

        @asset
        def asset1():
            # create df ...
            return df

        @asset
        def asset2(asset1):
            return df[:5]

        Definitions(
            assets=[asset1, asset2],
            resources={
                "io_manager": ADLS2PickleIOManager(
                    adls2_file_system="my-cool-fs",
                    adls2_prefix="my-cool-prefix",
                    adls2=ADLS2Resource(
                        storage_account="my-storage-account",
                        credential=ADLS2SASToken(token="my-sas-token"),
                    ),
                ),
            },
        )


    2. Attach this IO manager to your job to make it available to your ops.

    .. code-block:: python

        from dagster import job
        from dagster_azure.adls2 import ADLS2PickleIOManager, ADLS2Resource, ADLS2SASToken

        @job(
            resource_defs={
                "io_manager": ADLS2PickleIOManager(
                    adls2_file_system="my-cool-fs",
                    adls2_prefix="my-cool-prefix",
                    adls2=ADLS2Resource(
                        storage_account="my-storage-account",
                        credential=ADLS2SASToken(token="my-sas-token"),
                    ),
                ),
            },
        )
        def my_job():
            ...
    """

    adls2: ResourceDependency[ADLS2Resource]
    adls2_file_system: str = Field(description="ADLS Gen2 file system name.")
    adls2_prefix: str = Field(
        default="dagster", description="ADLS Gen2 file system prefix to write to."
    )
    lease_duration: int = Field(
        default=60,
        description="Lease duration in seconds. Must be between 15 and 60 seconds or -1 for infinite.",
    )

    @classmethod
    def _is_dagster_maintained(cls) -> bool:
        return True

    @property
    @cached_method
    def _internal_io_manager(self) -> PickledObjectADLS2IOManager:
        return PickledObjectADLS2IOManager(
            self.adls2_file_system,
            self.adls2.adls2_client,
            self.adls2.blob_client,
            self.adls2.lease_client_constructor,
            self.adls2_prefix,
            self.lease_duration,
        )

    def load_input(self, context: "InputContext") -> Any:
        return self._internal_io_manager.load_input(context)

    def handle_output(self, context: "OutputContext", obj: Any) -> None:
        self._internal_io_manager.handle_output(context, obj)


@deprecated(
    breaking_version="2.0",
    additional_warn_text="Please use ADLS2PickleIOManager instead.",
)
class ConfigurablePickledObjectADLS2IOManager(ADLS2PickleIOManager):
    """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation."""

    pass


@dagster_maintained_io_manager
@io_manager(
    config_schema=ADLS2PickleIOManager.to_config_schema(),
    required_resource_keys={"adls2"},
)
def adls2_pickle_io_manager(init_context: InitResourceContext) -> PickledObjectADLS2IOManager:
    """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

    Serializes objects via pickling. Suitable for objects storage for distributed executors, so long
    as each execution node has network connectivity and credentials for ADLS and the backing
    container.

    Assigns each op output to a unique filepath containing run ID, step key, and output name.
    Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key
    has multiple components, the final component is used as the name of the file, and the preceding
    components as parent directories under the base_dir.

    Subsequent materializations of an asset will overwrite previous materializations of that asset.
    With a base directory of "/my/base/path", an asset with key
    `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory
    with path "/my/base/path/one/two/".

    Example usage:

    Attach this IO manager to a set of assets.

    .. code-block:: python

        from dagster import Definitions, asset
        from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource

        @asset
        def asset1():
            # create df ...
            return df

        @asset
        def asset2(asset1):
            return df[:5]

        Definitions(
            assets=[asset1, asset2],
            resources={
                "io_manager": adls2_pickle_io_manager.configured(
                    {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}
                ),
                "adls2": adls2_resource,
            },
        )


    Attach this IO manager to your job to make it available to your ops.

    .. code-block:: python

        from dagster import job
        from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource

        @job(
            resource_defs={
                "io_manager": adls2_pickle_io_manager.configured(
                    {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}
                ),
                "adls2": adls2_resource,
            },
        )
        def my_job():
            ...
    """
    adls_resource = init_context.resources.adls2
    adls2_client = adls_resource.adls2_client
    blob_client = adls_resource.blob_client
    lease_client = adls_resource.lease_client_constructor
    return PickledObjectADLS2IOManager(
        init_context.resource_config["adls2_file_system"],
        adls2_client,
        blob_client,
        lease_client,
        init_context.resource_config.get("adls2_prefix"),
        init_context.resource_config.get("lease_duration"),
    )
```
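
For orientation, the pickle IO manager stores op outputs under the configured prefix using the identifier produced by `get_op_output_relative_path` above. A small sketch of the path arithmetic only: no Azure calls are made, the run ID, step key, and output name are hypothetical, and joining with the base path is assumed to behave as in the `UPathIOManager` base class:

```python
from upath import UPath

# Hypothetical identifier: (run_id, step_key, output_name)
parts = ("8b5f0a2e", "my_op", "result")
run_id, *output_parts = parts

# Mirrors get_op_output_relative_path above.
relative = UPath("storage", run_id, "files", *output_parts)

# The base path is UPath(prefix), "dagster" by default, so the pickled object
# ends up at a key like:
print((UPath("dagster") / relative).as_posix())
# dagster/storage/8b5f0a2e/files/my_op/result
```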