runnable-0.25.0-py3-none-any.whl → runnable-0.27.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extensions/catalog/any_path.py +201 -0
- extensions/catalog/file_system.py +29 -230
- extensions/catalog/minio.py +72 -0
- extensions/catalog/s3.py +11 -0
- extensions/pipeline_executor/__init__.py +3 -34
- extensions/run_log_store/any_path.py +104 -0
- extensions/run_log_store/chunked_fs.py +13 -9
- extensions/run_log_store/file_system.py +6 -60
- extensions/run_log_store/generic_chunked.py +17 -11
- extensions/run_log_store/minio.py +111 -0
- runnable/catalog.py +8 -28
- runnable/datastore.py +2 -2
- runnable/executor.py +0 -17
- runnable/tasks.py +1 -3
- runnable/utils.py +21 -18
- {runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/METADATA +4 -1
- {runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/RECORD +20 -15
- {runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/entry_points.txt +3 -0
- {runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/WHEEL +0 -0
- {runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/licenses/LICENSE +0 -0
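Taken together, the release refactors the path-based services behind new any_path abstract bases for both the catalog and the run log store, adds MinIO and S3 backends built on cloudpathlib (presumably registered via the three new entry_points.txt lines), narrows the catalog get/put API to a single name argument, and comments out the old MD5 helper generators in runnable/utils.py in favour of hashing inline in get_data_hash. Selected hunks follow.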
extensions/run_log_store/any_path.py
ADDED
@@ -0,0 +1,104 @@
+import logging
+from abc import abstractmethod
+from typing import Any, Dict
+
+from runnable import defaults, exceptions
+from runnable.datastore import BaseRunLogStore, RunLog
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+class AnyPathRunLogStore(BaseRunLogStore):
+    """
+    In this type of Run Log store, we use a file system to store the JSON run log.
+
+    Every single run is stored as a different file which makes it compatible across other store types.
+
+    When to use:
+        When locally testing a pipeline and have the need to compare across runs.
+        Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+    Do not use:
+        If you need parallelization on local, this run log would not support it.
+
+    Example config:
+
+    run_log:
+      type: file-system
+      config:
+        log_folder: The folder to out the logs. Defaults to .run_log_store
+
+    """
+
+    service_name: str = "file-system"
+    log_folder: str = defaults.LOG_LOCATION_FOLDER
+
+    @property
+    def log_folder_name(self):
+        return self.log_folder
+
+    def get_summary(self) -> Dict[str, Any]:
+        summary = {"Type": self.service_name, "Location": self.log_folder}
+
+        return summary
+
+    @abstractmethod
+    def write_to_path(self, run_log: RunLog): ...
+
+    @abstractmethod
+    def read_from_path(self, run_id: str) -> RunLog: ...
+
+    def create_run_log(
+        self,
+        run_id: str,
+        dag_hash: str = "",
+        use_cached: bool = False,
+        tag: str = "",
+        original_run_id: str = "",
+        status: str = defaults.CREATED,
+    ) -> RunLog:
+        """
+        # Creates a Run log
+        # Adds it to the db
+        """
+
+        try:
+            self.get_run_log_by_id(run_id=run_id, full=False)
+            raise exceptions.RunLogExistsError(run_id=run_id)
+        except exceptions.RunLogNotFoundError:
+            pass
+
+        logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
+        run_log = RunLog(
+            run_id=run_id,
+            dag_hash=dag_hash,
+            tag=tag,
+            status=status,
+        )
+        self.write_to_path(run_log)
+        return run_log
+
+    def get_run_log_by_id(
+        self,
+        run_id: str,
+        full: bool = False,
+    ) -> RunLog:
+        """
+        # Returns the run_log defined by id
+        # Raises Exception if not found
+        """
+        try:
+            logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
+            run_log = self.read_from_path(run_id)
+            return run_log
+        except FileNotFoundError as e:
+            raise exceptions.RunLogNotFoundError(run_id) from e
+
+    def put_run_log(self, run_log: RunLog):
+        """
+        # Puts the run_log into the database
+        """
+        logger.info(
+            f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
+        )
+        self.write_to_path(run_log)
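The new base class is a textbook template method: create_run_log, get_run_log_by_id and put_run_log are written once against two abstract hooks, write_to_path and read_from_path, so a concrete backend only decides where and how the JSON lands. A minimal sketch of what a conforming subclass needs to provide; the in-memory class and module-level dict are hypothetical illustrations, not part of the package:

from extensions.run_log_store.any_path import AnyPathRunLogStore
from runnable.datastore import RunLog

_MEMORY: dict = {}  # hypothetical stand-in for a real path-backed location


class InMemoryRunLogStore(AnyPathRunLogStore):
    """Illustrative only: satisfies the two hooks the base class requires."""

    service_name: str = "in-memory"

    def write_to_path(self, run_log: RunLog):
        # Called by create_run_log and put_run_log in the base class.
        _MEMORY[run_log.run_id] = run_log.model_dump()

    def read_from_path(self, run_id: str) -> RunLog:
        # Raising FileNotFoundError lets get_run_log_by_id translate it
        # into exceptions.RunLogNotFoundError.
        if run_id not in _MEMORY:
            raise FileNotFoundError(run_id)
        return RunLog(**_MEMORY[run_id])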
extensions/run_log_store/chunked_fs.py
CHANGED
@@ -2,14 +2,16 @@ import json
 import logging
 from pathlib import Path
 from string import Template
-from typing import Any, Dict, Optional, …
+from typing import Any, Dict, Optional, Union
+
+from cloudpathlib import CloudPath
 
 from extensions.run_log_store.generic_chunked import ChunkedRunLogStore
 from runnable import defaults, utils
 
 logger = logging.getLogger(defaults.LOGGER_NAME)
 
-
+MixT = Union[CloudPath, Path]
 
 
 class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
@@ -28,7 +30,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
 
     def get_matches(
         self, run_id: str, name: str, multiple_allowed: bool = False
-    ) -> Optional[Union[…
+    ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
         """
         Get contents of files matching the pattern name*
 
@@ -78,7 +80,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
 
         return str(name) + ".json"
 
-    def _store(self, run_id: str, contents: dict, name: …
+    def _store(self, run_id: str, contents: dict, name: MixT, insert=False):
         """
         Store the contents against the name in the folder.
 
@@ -87,15 +89,16 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
             contents (dict): The dict to store
             name (str): The name to store as
         """
+        log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
         if insert:
-            name = …
+            name = log_folder_with_run_id / name
 
-        utils.safe_make_dir(…
+        utils.safe_make_dir(log_folder_with_run_id)
 
-        with open(self.safe_suffix_json(name), "w") as fw:
+        with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "w") as fw:
             json.dump(contents, fw, ensure_ascii=True, indent=4)
 
-    def _retrieve(self, …
+    def _retrieve(self, run_id: str, name: MixT) -> dict:
         """
         Does the job of retrieving from the folder.
 
@@ -106,8 +109,9 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
             dict: The contents
         """
         contents: dict = {}
+        log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
 
-        with open(self.safe_suffix_json(name), "r") as fr:
+        with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "r") as fr:
             contents = json.load(fr)
 
         return contents
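Note how both _store and _retrieve now receive run_id and derive log_folder_with_run_id themselves instead of being handed pre-joined paths, and how name.name strips any directory prefix before safe_suffix_json appends ".json". A small pathlib-only illustration of why that join cannot double up the run folder (the names are made up):

from pathlib import Path

log_folder_with_run_id = Path(".run_log_store/run-1234")

# Whether the caller passes a bare name or a fully qualified path,
# .name reduces it to the last component before the join.
for name in (Path("RunLog"), Path(".run_log_store/run-1234/RunLog")):
    target = log_folder_with_run_id / f"{name.name}.json"
    print(target)  # .run_log_store/run-1234/RunLog.json in both cases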
extensions/run_log_store/file_system.py
CHANGED
@@ -3,13 +3,14 @@ import logging
 from pathlib import Path
 from typing import Any, Dict
 
-from …
-from runnable …
+from extensions.run_log_store.any_path import AnyPathRunLogStore
+from runnable import defaults, utils
+from runnable.datastore import RunLog
 
 logger = logging.getLogger(defaults.LOGGER_NAME)
 
 
-class FileSystemRunLogstore(…
+class FileSystemRunLogstore(AnyPathRunLogStore):
     """
     In this type of Run Log store, we use a file system to store the JSON run log.
 
@@ -43,7 +44,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
 
         return summary
 
-    def …
+    def write_to_path(self, run_log: RunLog):
         """
         Write the run log to the folder
 
@@ -60,7 +61,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
         with json_file_path.open("w") as fw:
             json.dump(run_log.model_dump(), fw, ensure_ascii=True, indent=4)  # pylint: disable=no-member
 
-    def …
+    def read_from_path(self, run_id: str) -> RunLog:
         """
         Look into the run log folder for the run log for the run id.
 
@@ -88,58 +89,3 @@ class FileSystemRunLogstore(BaseRunLogStore):
         json_str = json.load(fr)
         run_log = RunLog(**json_str)  # pylint: disable=no-member
         return run_log
-
-    def create_run_log(
-        self,
-        run_id: str,
-        dag_hash: str = "",
-        use_cached: bool = False,
-        tag: str = "",
-        original_run_id: str = "",
-        status: str = defaults.CREATED,
-    ) -> RunLog:
-        """
-        # Creates a Run log
-        # Adds it to the db
-        """
-
-        try:
-            self.get_run_log_by_id(run_id=run_id, full=False)
-            raise exceptions.RunLogExistsError(run_id=run_id)
-        except exceptions.RunLogNotFoundError:
-            pass
-
-        logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
-        run_log = RunLog(
-            run_id=run_id,
-            dag_hash=dag_hash,
-            tag=tag,
-            status=status,
-        )
-        self.write_to_folder(run_log)
-        return run_log
-
-    def get_run_log_by_id(
-        self,
-        run_id: str,
-        full: bool = False,
-    ) -> RunLog:
-        """
-        # Returns the run_log defined by id
-        # Raises Exception if not found
-        """
-        try:
-            logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
-            run_log = self.get_from_folder(run_id)
-            return run_log
-        except FileNotFoundError as e:
-            raise exceptions.RunLogNotFoundError(run_id) from e
-
-    def put_run_log(self, run_log: RunLog):
-        """
-        # Puts the run_log into the database
-        """
-        logger.info(
-            f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
-        )
-        self.write_to_folder(run_log)
extensions/run_log_store/generic_chunked.py
CHANGED
@@ -4,7 +4,9 @@ from abc import abstractmethod
 from enum import Enum
 from pathlib import Path
 from string import Template
-from typing import Any, Dict, Optional, …
+from typing import Any, Dict, Optional, Union
+
+from cloudpathlib import CloudPath
 
 from runnable import defaults, exceptions
 from runnable.datastore import (
@@ -21,7 +23,7 @@ from runnable.datastore import (
 logger = logging.getLogger(defaults.LOGGER_NAME)
 
 
-
+MixT = Union[CloudPath, Path]  # Holds str, path
 
 
 class EntityNotFoundError(Exception):
@@ -87,7 +89,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
     @abstractmethod
     def get_matches(
         self, run_id: str, name: str, multiple_allowed: bool = False
-    ) -> Optional[Union[…
+    ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
         """
         Get contents of persistence layer matching the pattern name*
 
@@ -98,7 +100,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
         ...
 
     @abstractmethod
-    def _store(self, run_id: str, contents: dict, name: …
+    def _store(self, run_id: str, contents: dict, name: MixT, insert: bool = False):
         """
         Store the contents against the name in the persistence layer.
 
@@ -110,7 +112,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
         ...
 
     @abstractmethod
-    def _retrieve(self, name: …
+    def _retrieve(self, run_id: str, name: MixT) -> dict:
         """
         Does the job of retrieving from the persistent layer.
 
@@ -140,7 +142,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
         insert = False
 
         if match:
-            existing_contents = self._retrieve(name=match)  # type: ignore
+            existing_contents = self._retrieve(run_id=run_id, name=match)  # type: ignore
             contents = dict(existing_contents, **contents)
             name_to_give = match  # type: ignore
         else:
@@ -149,7 +151,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
             )
             insert = True
 
-        self._store(…
+        self._store(
+            run_id=run_id, contents=contents, name=Path(name_to_give), insert=insert
+        )
 
     def retrieve(
         self, run_id: str, log_type: LogTypes, name: str = "", multiple_allowed=False
@@ -190,13 +194,13 @@ class ChunkedRunLogStore(BaseRunLogStore):
 
         if matches:
             if not multiple_allowed:
-                contents = self._retrieve(name=matches)  # type: ignore
+                contents = self._retrieve(run_id=run_id, name=matches)  # type: ignore
                 model = self.ModelTypes[log_type.name].value
                 return model(**contents)
 
             models = []
             for match in matches:  # type: ignore
-                contents = self._retrieve(name=match)
+                contents = self._retrieve(run_id=run_id, name=match)
                 model = self.ModelTypes[log_type.name].value
                 models.append(model(**contents))
             return models
@@ -225,7 +229,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
             # No branch logs are found
            return {}
         # Forcing get_matches to always return a list is a better design
-
+
+        assert isinstance(matches, list)
+        epoch_created = [str(match).split("-")[-1] for match in matches]
 
         # sort matches by epoch created
         epoch_created, matches = zip(*sorted(zip(epoch_created, matches)))  # type: ignore
@@ -234,7 +240,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
 
         for match in matches:
             model = self.ModelTypes[log_type.name].value
-            log_model = model(**self._retrieve(match))
+            log_model = model(**self._retrieve(run_id=run_id, name=match))
             logs[log_model.internal_name] = log_model  # type: ignore
 
         return logs
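Chunk names in this store end in the creation epoch, which is what str(match).split("-")[-1] peels off; the added assert isinstance(matches, list) also narrows the Optional[Union[...]] return of get_matches for the type checker before the sort. A toy reproduction of the ordering logic, with invented chunk names:

from pathlib import Path

matches = [
    Path("BranchLog-train-1700000200"),
    Path("BranchLog-train-1700000100"),
]

# The segment after the last "-" is the creation epoch.
epoch_created = [str(match).split("-")[-1] for match in matches]
epoch_created, ordered = zip(*sorted(zip(epoch_created, matches)))
print([m.name for m in ordered])
# ['BranchLog-train-1700000100', 'BranchLog-train-1700000200']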
extensions/run_log_store/minio.py
ADDED
@@ -0,0 +1,111 @@
+import json
+import logging
+from functools import lru_cache
+from typing import Any, Dict
+
+from cloudpathlib import S3Client, S3Path
+from pydantic import Field, SecretStr
+
+from extensions.run_log_store.any_path import AnyPathRunLogStore
+from runnable import defaults
+from runnable.datastore import RunLog
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+@lru_cache
+def get_minio_client(
+    endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+) -> S3Client:
+    return S3Client(
+        endpoint_url=endpoint_url,
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+    )
+
+
+class MinioRunLogStore(AnyPathRunLogStore):
+    """
+    In this type of Run Log store, we use a file system to store the JSON run log.
+
+    Every single run is stored as a different file which makes it compatible across other store types.
+
+    When to use:
+        When locally testing a pipeline and have the need to compare across runs.
+        Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+    Do not use:
+        If you need parallelization on local, this run log would not support it.
+
+    Example config:
+
+    run_log:
+      type: file-system
+      config:
+        log_folder: The folder to out the logs. Defaults to .run_log_store
+
+    """
+
+    service_name: str = "file-system"
+
+    endpoint_url: str = Field(default="http://localhost:9002")
+    aws_access_key_id: SecretStr = SecretStr(secret_value="minioadmin")
+    aws_secret_access_key: SecretStr = SecretStr(secret_value="minioadmin")
+    bucket: str = Field(default="runnable/run-logs")
+
+    def get_summary(self) -> Dict[str, Any]:
+        summary = {"Type": self.service_name, "Location": self.log_folder}
+
+        return summary
+
+    def get_run_log_bucket(self) -> S3Path:
+        run_id = self._context.run_id
+
+        return S3Path(
+            f"s3://{self.bucket}/{run_id}/",
+            client=get_minio_client(
+                self.endpoint_url,
+                self.aws_access_key_id.get_secret_value(),
+                self.aws_secret_access_key.get_secret_value(),
+            ),
+        )
+
+    def write_to_path(self, run_log: RunLog):
+        """
+        Write the run log to the folder
+
+        Args:
+            run_log (RunLog): The run log to be added to the database
+        """
+        run_log_bucket = self.get_run_log_bucket()
+        run_log_bucket.mkdir(parents=True, exist_ok=True)
+
+        run_log_object = run_log_bucket / f"{run_log.run_id}.json"
+        run_log_object.write_text(
+            json.dumps(run_log.model_dump_json(), ensure_ascii=True, indent=4)
+        )
+
+    def read_from_path(self, run_id: str) -> RunLog:
+        """
+        Look into the run log folder for the run log for the run id.
+
+        If the run log does not exist, raise an exception. If it does, decode it
+        as a RunLog and return it
+
+        Args:
+            run_id (str): The requested run id to retrieve the run log store
+
+        Raises:
+            FileNotFoundError: If the Run Log has not been found.
+
+        Returns:
+            RunLog: The decoded Run log
+        """
+        run_log_bucket = self.get_run_log_bucket()
+
+        run_log_object = run_log_bucket / f"{run_id}.json"
+
+        run_log_text = json.loads(run_log_object.read_text())
+        run_log = RunLog(**json.loads(run_log_text))
+
+        return run_log
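Two quirks in the new store are worth noting. First, the docstring and service_name still read "file-system", apparently carried over from the store this was adapted from; the effective type name presumably comes from the new entry_points.txt entries. Second, write_to_path double-encodes: model_dump_json() already returns a JSON string, and json.dumps(...) serialises that string again, which is exactly why read_from_path calls json.loads twice. A minimal round trip with a toy pydantic model standing in for the real RunLog:

import json

from pydantic import BaseModel


class Toy(BaseModel):  # illustrative stand-in for RunLog
    run_id: str


stored = json.dumps(Toy(run_id="abc").model_dump_json())
# stored is a JSON string that *contains* JSON: '"{\\"run_id\\":\\"abc\\"}"'

inner = json.loads(stored)       # first decode: the inner JSON string
toy = Toy(**json.loads(inner))   # second decode: the actual dict
assert toy.run_id == "abc"

Separately, the @lru_cache on get_minio_client means one S3Client is shared per unique (endpoint, key, secret) tuple rather than re-created for every path.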
runnable/catalog.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 import runnable.context as context
 from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):
 
     service_name: str = ""
     service_type: str = "catalog"
+
+    compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
     model_config = ConfigDict(extra="forbid")
 
     @abstractmethod
@@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel):
     def _context(self):
         return context.run_context
 
-    @property
-    def compute_data_folder(self) -> str:
-        return defaults.COMPUTE_DATA_FOLDER
-
     @abstractmethod
-    def get(
-        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-    ) -> List[DataCatalog]:
+    def get(self, name: str) -> List[DataCatalog]:
         """
         Get the catalog item by 'name' for the 'run id' and store it in compute data folder.
 
@@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel):
         raise NotImplementedError
 
     @abstractmethod
-    def put(
-        self,
-        name: str,
-        run_id: str,
-        compute_data_folder: str = "",
-        synced_catalogs: Optional[List[DataCatalog]] = None,
-        **kwargs,
-    ) -> List[DataCatalog]:
+    def put(self, name: str) -> List[DataCatalog]:
         """
         Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.
 
@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
     def get_summary(self) -> Dict[str, Any]:
         return {}
 
-    def get(
-        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-    ) -> List[DataCatalog]:
+    def get(self, name: str) -> List[DataCatalog]:
        """
        Does nothing
        """
        logger.info("Using a do-nothing catalog, doing nothing in get")
        return []
 
-    def put(
-        self,
-        name: str,
-        run_id: str,
-        compute_data_folder: str = "",
-        synced_catalogs: Optional[List[DataCatalog]] = None,
-        **kwargs,
-    ) -> List[DataCatalog]:
+    def put(self, name: str) -> List[DataCatalog]:
         """
         Does nothing
         """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
         Does nothing
         """
         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-        logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
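The catalog contract shrinks to name-only get/put: the run id now comes from the active run context, and compute_data_folder becomes a configurable pydantic field instead of a hard-coded property (note also the duplicated logger.info line removed from the sync method). A hedged sketch of a custom catalog against the 0.27.0 base class; the class is hypothetical, and any further abstract hooks BaseCatalog may require (such as sync_between_runs) are elided here:

from typing import Any, Dict, List

from runnable.catalog import BaseCatalog
from runnable.datastore import DataCatalog


class PrintingCatalog(BaseCatalog):  # hypothetical, not shipped with runnable
    service_name: str = "printing"

    def get_summary(self) -> Dict[str, Any]:
        return {"Type": self.service_name}

    def get(self, name: str) -> List[DataCatalog]:
        # run_id and compute_data_folder are no longer parameters:
        # they come from the run context and the configured field.
        print(f"get {name} from {self.compute_data_folder}")
        return []

    def put(self, name: str) -> List[DataCatalog]:
        print(f"put {name} for run {self._context.run_id}")
        return []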
runnable/datastore.py
CHANGED
@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):
 
         # If the object was serialised, get it from the catalog
         catalog_handler = context.run_context.catalog_handler
-        catalog_handler.get(name=self.file_name…
+        catalog_handler.get(name=self.file_name)
         obj = context.run_context.pickler.load(path=self.file_name)
         os.remove(self.file_name)  # Remove after loading
         return obj
@@ -128,7 +128,7 @@ class ObjectParameter(BaseModel):
         context.run_context.pickler.dump(data=data, path=self.file_name)
 
         catalog_handler = context.run_context.catalog_handler
-        catalog_handler.put(name=self.file_name…
+        catalog_handler.put(name=self.file_name)
         os.remove(self.file_name)  # Remove after loading
 
 
runnable/executor.py
CHANGED
@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
         """
         ...
 
-    @abstractmethod
-    def get_effective_compute_data_folder(self) -> Optional[str]:
-        """
-        Get the effective compute data folder for the given stage.
-        If there is nothing to catalog, we return None.
-
-        The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-        Args:
-            stage (str): The stage we are in the process of cataloging
-
-
-        Returns:
-            Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-        """
-        ...
-
     @abstractmethod
     def _sync_catalog(
         self, stage: str, synced_catalogs=None
runnable/tasks.py
CHANGED
@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
             pm.execute_notebook(**kwds)
             task_console.print(out_file.getvalue())
 
-            context.run_context.catalog_handler.put(
-                name=notebook_output_path, run_id=context.run_context.run_id
-            )
+            context.run_context.catalog_handler.put(name=notebook_output_path)
 
             client = PloomberClient.from_path(path=notebook_output_path)
             namespace = client.get_namespace()
runnable/utils.py
CHANGED
@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
     return diff
 
 
-def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
-    """Hashes the given bytesiter using the given hasher."""
-    for block in bytesiter:  # pragma: no cover
-        hasher.update(block)
-    return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
+# def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
+#     """Hashes the given bytesiter using the given hasher."""
+#     for block in bytesiter:  # pragma: no cover
+#         hasher.update(block)
+#     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
 
 
-def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
-    """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-    # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+# def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
+# """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+# # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
 
-    """
-    with afile:  # pragma: no cover
-        block = afile.read(blocksize)
-        while len(block) > 0:
-            yield block
-            block = afile.read(blocksize)
+#     """
+#     with afile:  # pragma: no cover
+#         block = afile.read(blocksize)
+#         while len(block) > 0:
+#             yield block
+#             block = afile.read(blocksize)
 
 
-def get_data_hash(file_name: str):
+def get_data_hash(file_name: str) -> str:
     """Returns the hash of the data file.
 
     Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
     """
     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
    # TODO: For a big file, we should only hash the first few bytes
-
-
-
+    with open(file_name, "rb") as f:
+        file_hash = hashlib.md5()
+        for chunk in iter(lambda: f.read(4096), b""):
+            file_hash.update(chunk)
+
+        return file_hash.hexdigest()
 
 
 # TODO: This is not the right place for this.
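The rewritten get_data_hash streams the file through MD5 in 4 KiB chunks (relying on the hashlib import already present in utils.py), so memory stays flat regardless of file size and the commented-out generator helpers above are no longer needed. Hedged usage, with an illustrative path:

from runnable.utils import get_data_hash

digest = get_data_hash("data/output.csv")  # path is illustrative
print(digest)  # 32-character hex MD5, e.g. 'd41d8cd98f00b204e9800998ecf8427e' for an empty file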