runnable-0.25.0-py3-none-any.whl → runnable-0.26.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ from abc import abstractmethod
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List
7
+
8
+ from cloudpathlib import CloudPath
9
+
10
+ from runnable import defaults, utils
11
+ from runnable.catalog import BaseCatalog
12
+ from runnable.datastore import DataCatalog
13
+
14
+ logger = logging.getLogger(defaults.LOGGER_NAME)
15
+
16
+
17
+ class AnyPathCatalog(BaseCatalog):
18
+ """
19
+ A base catalog handler that works over any local or cloud path (via cloudpathlib).
20
+
21
+ Note: Do not use the file-system implementation if the steps of the pipeline run on different compute environments.
22
+
23
+ Example config:
24
+
25
+ catalog:
26
+ type: file-system
27
+ config:
28
+ catalog_location: The location to store the catalog.
29
+ compute_data_folder: The folder to source the data from.
30
+
31
+ """
32
+
33
+ @abstractmethod
34
+ def get_summary(self) -> Dict[str, Any]: ...
35
+
36
+ @abstractmethod
37
+ def upload_to_catalog(self, file: Path) -> None: ...
38
+
39
+ @abstractmethod
40
+ def download_from_catalog(self, file: Path | CloudPath) -> None: ...
41
+
42
+ @abstractmethod
43
+ def get_catalog_location(self) -> Path | CloudPath:
44
+ """
45
+ For local file systems, this is the .catalog/run_id/compute_data_folder
46
+ For cloud systems, this is s3://bucket/run_id/compute_data_folder
47
+ """
48
+ ...
49
+
50
+ def get(self, name: str) -> List[DataCatalog]:
51
+ """
52
+ Get the file by matching glob pattern to the name
53
+
54
+ Args:
55
+ name (str): A glob pattern matching the file name
57
+
58
+ Raises:
59
+ Exception: If the catalog location does not exist
60
+
61
+ Returns:
62
+ List(object) : A list of catalog objects
63
+ """
64
+ run_catalog = self.get_catalog_location()
65
+
66
+ # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
67
+ # We should also return a list of data hashes
68
+ glob_files = run_catalog.glob(name)
69
+ logger.debug(
70
+ f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
71
+ )
72
+
73
+ data_catalogs = []
74
+ run_log_store = self._context.run_log_store
75
+ for file in glob_files:
76
+ if file.is_dir():
77
+ # Need not add a data catalog for the folder
78
+ continue
79
+
80
+ if str(file).endswith(".execution.log"):
81
+ continue
82
+
83
+ self.download_from_catalog(file)
84
+ relative_file_path = file.relative_to(run_catalog) # type: ignore
85
+
86
+ data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
87
+ data_catalog.catalog_relative_path = str(relative_file_path)
88
+ data_catalog.data_hash = utils.get_data_hash(str(relative_file_path))
89
+ data_catalog.stage = "get"
90
+ data_catalogs.append(data_catalog)
91
+
92
+ if not data_catalogs:
93
+ raise Exception(f"Did not find any files matching {name} in {run_catalog}")
94
+
95
+ return data_catalogs
96
+
97
+ def put(self, name: str) -> List[DataCatalog]:
98
+ """
99
+ Put the files matching the glob pattern into the catalog.
100
+
101
+ All matching files are currently uploaded on every call; skipping unchanged files is left as a TODO.
102
+
103
+ Args:
104
+ name (str): The glob pattern of the files to catalog
108
+
109
+ Raises:
110
+ Exception: If the compute data folder does not exist.
111
+
112
+ Returns:
113
+ List(object) : A list of catalog objects
114
+ """
115
+ run_id = self._context.run_id
116
+ logger.info(
117
+ f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
118
+ )
119
+
120
+ copy_from = Path(self.compute_data_folder)
121
+
122
+ if not copy_from.is_dir():
123
+ msg = (
124
+ f"Expected compute data folder to be present at: {copy_from} but not found. \n"
125
+ "Note: runnable does not create the compute data folder for you. Please ensure that the "
126
+ "folder exists.\n"
127
+ )
128
+ raise Exception(msg)
129
+
130
+ # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
131
+ # We should also return a list of datastore.DataCatalog items
132
+ glob_files = copy_from.glob(name)
133
+ logger.debug(
134
+ f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
135
+ )
136
+
137
+ data_catalogs = []
138
+ run_log_store = self._context.run_log_store
139
+ for file in glob_files:
140
+ if file.is_dir():
141
+ # Need not add a data catalog for the folder
142
+ continue
143
+
144
+ relative_file_path = file.relative_to(copy_from)
145
+
146
+ data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
147
+ data_catalog.catalog_relative_path = (
148
+ run_id + os.sep + str(relative_file_path)
149
+ )
150
+ data_catalog.data_hash = utils.get_data_hash(str(file))
151
+ data_catalog.stage = "put"
152
+ data_catalogs.append(data_catalog)
153
+
154
+ # TODO: Think about syncing only if the file is changed
155
+ self.upload_to_catalog(file)
156
+
157
+ if not data_catalogs:
158
+ raise Exception(f"Did not find any files matching {name} in {copy_from}")
159
+
160
+ return data_catalogs
161
+
162
+ def sync_between_runs(self, previous_run_id: str, run_id: str):
163
+ """
164
+ Given the previous run id, sync the catalogs between the current one and previous
165
+
166
+ Args:
167
+ previous_run_id (str): The previous run id to sync the catalogs from
168
+ run_id (str): The run_id to which the data catalogs should be synced to.
169
+
170
+ Raises:
171
+ Exception: If the previous run log does not exist in the catalog
172
+
173
+ """
174
+ logger.info(
175
+ f"Using the {self.service_name} catalog and syncing catalogs"
176
+ "between old: {previous_run_id} to new: {run_id}"
177
+ )
178
+
179
+ catalog_location = Path(self.get_catalog_location())
180
+ run_catalog = catalog_location / run_id
181
+ utils.safe_make_dir(run_catalog)
182
+
183
+ if not utils.does_dir_exist(catalog_location / previous_run_id):
184
+ msg = (
185
+ f"Catalogs from previous run : {previous_run_id} are not found.\n"
186
+ "Note: Please provision the catalog objects generated by previous run in the same catalog location"
187
+ " as the current run, even if the catalog handler for the previous run was different"
188
+ )
189
+ raise Exception(msg)
190
+
191
+ cataloged_files = list((catalog_location / previous_run_id).glob("*"))
192
+
193
+ for cataloged_file in cataloged_files:
194
+ if str(cataloged_file).endswith("execution.log"):
195
+ continue
196
+
197
+ if cataloged_file.is_file():
198
+ shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
199
+ else:
200
+ shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
201
+ logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
@@ -1,253 +1,52 @@
1
1
  import logging
2
- import os
3
2
  import shutil
4
3
  from pathlib import Path
5
- from typing import Any, Dict, List, Optional
4
+ from typing import Any
6
5
 
7
- from runnable import defaults, utils
8
- from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
9
- from runnable.datastore import DataCatalog
6
+ from cloudpathlib import CloudPath
7
+ from pydantic import Field
10
8
 
11
- logger = logging.getLogger(defaults.LOGGER_NAME)
12
-
13
-
14
- class FileSystemCatalog(BaseCatalog):
15
- """
16
- A Catalog handler that uses the local file system for cataloging.
17
-
18
- Note: Do not use this if the steps of the pipeline run on different compute environments.
9
+ from extensions.catalog.any_path import AnyPathCatalog
10
+ from runnable import defaults
19
11
 
20
- Example config:
21
-
22
- catalog:
23
- type: file-system
24
- config:
25
- catalog_location: The location to store the catalog.
26
- compute_data_folder: The folder to source the data from.
12
+ logger = logging.getLogger(defaults.LOGGER_NAME)
27
13
 
28
- """
29
14
 
15
+ class FileSystemCatalog(AnyPathCatalog):
30
16
  service_name: str = "file-system"
31
- catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
32
17
 
33
- def get_catalog_location(self):
34
- return self.catalog_location
18
+ catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
35
19
 
36
- def get_summary(self) -> Dict[str, Any]:
37
- summary = {
38
- "Catalog Location": self.get_catalog_location(),
20
+ def get_summary(self) -> dict[str, Any]:
21
+ return {
22
+ "compute_data_folder": self.compute_data_folder,
23
+ "catalog_location": self.catalog_location,
39
24
  }
40
25
 
41
- return summary
42
-
43
- def get(
44
- self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
45
- ) -> List[DataCatalog]:
46
- """
47
- Get the file by matching glob pattern to the name
26
+ def get_catalog_location(self) -> Path:
27
+ run_id = self._context.run_id
28
+ return Path(self.catalog_location) / run_id / self.compute_data_folder
48
29
 
49
- Args:
50
- name ([str]): A glob matching the file name
51
- run_id ([str]): The run id
30
+ def download_from_catalog(self, file: Path | CloudPath) -> None:
31
+ assert isinstance(file, Path)
52
32
 
53
- Raises:
54
- Exception: If the catalog location does not exist
55
-
56
- Returns:
57
- List(object) : A list of catalog objects
58
- """
59
- logger.info(
60
- f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
61
- )
33
+ run_catalog = self.get_catalog_location()
34
+ relative_file_path = file.relative_to(run_catalog)
62
35
 
63
36
  copy_to = self.compute_data_folder
64
- if compute_data_folder:
65
- copy_to = compute_data_folder
66
-
67
- copy_to = Path(copy_to) # type: ignore
68
-
69
- catalog_location = self.get_catalog_location()
70
- run_catalog = Path(catalog_location) / run_id / copy_to
71
-
72
- logger.debug(
73
- f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
74
- )
37
+ # Make the directory in the data folder if required
38
+ Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
39
+ shutil.copy(file, copy_to / relative_file_path)
75
40
 
76
- if not utils.does_dir_exist(run_catalog):
77
- msg = (
78
- f"Expected Catalog to be present at: {run_catalog} but not found.\n"
79
- "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
80
- )
81
- raise Exception(msg)
41
+ def upload_to_catalog(self, file: Path) -> None:
42
+ run_catalog = self.get_catalog_location()
43
+ run_catalog.mkdir(parents=True, exist_ok=True)
82
44
 
83
- # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
84
- # We should also return a list of data hashes
85
- glob_files = run_catalog.glob(name)
86
45
  logger.debug(
87
- f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
46
+ f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
88
47
  )
89
48
 
90
- data_catalogs = []
91
- run_log_store = self._context.run_log_store
92
- for file in glob_files:
93
- if file.is_dir():
94
- # Need not add a data catalog for the folder
95
- continue
96
-
97
- if str(file).endswith(".execution.log"):
98
- continue
99
-
100
- relative_file_path = file.relative_to(run_catalog)
101
-
102
- data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
103
- data_catalog.catalog_handler_location = catalog_location
104
- data_catalog.catalog_relative_path = str(relative_file_path)
105
- data_catalog.data_hash = utils.get_data_hash(str(file))
106
- data_catalog.stage = "get"
107
- data_catalogs.append(data_catalog)
108
-
109
- # Make the directory in the data folder if required
110
- Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
111
- shutil.copy(file, copy_to / relative_file_path)
112
-
113
- logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
114
-
115
- if not data_catalogs:
116
- raise Exception(f"Did not find any files matching {name} in {run_catalog}")
117
-
118
- return data_catalogs
119
-
120
- def put(
121
- self,
122
- name: str,
123
- run_id: str,
124
- compute_data_folder: str = "",
125
- synced_catalogs: Optional[List[DataCatalog]] = None,
126
- **kwargs,
127
- ) -> List[DataCatalog]:
128
- """
129
- Put the files matching the glob pattern into the catalog.
130
-
131
- If previously synced catalogs are provided, and no changes were observed, we do not sync them.
132
-
133
- Args:
134
- name (str): The glob pattern of the files to catalog
135
- run_id (str): The run id of the run
136
- compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
137
- synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
138
-
139
- Raises:
140
- Exception: If the compute data folder does not exist.
141
-
142
- Returns:
143
- List(object) : A list of catalog objects
144
- """
145
- logger.info(
146
- f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
147
- )
148
-
149
- copy_from = self.compute_data_folder
150
- if compute_data_folder:
151
- copy_from = compute_data_folder
152
- copy_from = Path(copy_from) # type: ignore
153
-
154
- catalog_location = self.get_catalog_location()
155
- run_catalog = Path(catalog_location) / run_id
156
- utils.safe_make_dir(run_catalog)
157
-
158
- logger.debug(
159
- f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
160
- )
161
-
162
- if not utils.does_dir_exist(copy_from):
163
- msg = (
164
- f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
165
- "Note: runnable does not create the compute data folder for you. Please ensure that the "
166
- "folder exists.\n"
167
- )
168
- raise Exception(msg)
169
-
170
- # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
171
- # We should also return a list of datastore.DataCatalog items
172
-
173
- glob_files = copy_from.glob(name) # type: ignore
174
- logger.debug(
175
- f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
176
- )
177
-
178
- data_catalogs = []
179
- run_log_store = self._context.run_log_store
180
- for file in glob_files:
181
- if file.is_dir():
182
- # Need not add a data catalog for the folder
183
- continue
184
-
185
- relative_file_path = file.relative_to(".")
186
-
187
- data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
188
- data_catalog.catalog_handler_location = catalog_location
189
- data_catalog.catalog_relative_path = (
190
- run_id + os.sep + str(relative_file_path)
191
- )
192
- data_catalog.data_hash = utils.get_data_hash(str(file))
193
- data_catalog.stage = "put"
194
- data_catalogs.append(data_catalog)
195
-
196
- if is_catalog_out_of_sync(data_catalog, synced_catalogs):
197
- logger.info(f"{data_catalog.name} was found to be changed, syncing")
198
-
199
- # Make the directory in the catalog if required
200
- Path(run_catalog / relative_file_path.parent).mkdir(
201
- parents=True, exist_ok=True
202
- )
203
- shutil.copy(file, run_catalog / relative_file_path)
204
- else:
205
- logger.info(
206
- f"{data_catalog.name} was found to be unchanged, ignoring syncing"
207
- )
208
-
209
- if not data_catalogs:
210
- raise Exception(f"Did not find any files matching {name} in {copy_from}")
211
-
212
- return data_catalogs
213
-
214
- def sync_between_runs(self, previous_run_id: str, run_id: str):
215
- """
216
- Given the previous run id, sync the catalogs between the current one and previous
217
-
218
- Args:
219
- previous_run_id (str): The previous run id to sync the catalogs from
220
- run_id (str): The run_id to which the data catalogs should be synced to.
221
-
222
- Raises:
223
- Exception: If the previous run log does not exist in the catalog
224
-
225
- """
226
- logger.info(
227
- f"Using the {self.service_name} catalog and syncing catalogs"
228
- "between old: {previous_run_id} to new: {run_id}"
229
- )
230
-
231
- catalog_location = Path(self.get_catalog_location())
232
- run_catalog = catalog_location / run_id
233
- utils.safe_make_dir(run_catalog)
234
-
235
- if not utils.does_dir_exist(catalog_location / previous_run_id):
236
- msg = (
237
- f"Catalogs from previous run : {previous_run_id} are not found.\n"
238
- "Note: Please provision the catalog objects generated by previous run in the same catalog location"
239
- " as the current run, even if the catalog handler for the previous run was different"
240
- )
241
- raise Exception(msg)
242
-
243
- cataloged_files = list((catalog_location / previous_run_id).glob("*"))
244
-
245
- for cataloged_file in cataloged_files:
246
- if str(cataloged_file).endswith("execution.log"):
247
- continue
49
+ relative_file_path = file.relative_to(self.compute_data_folder)
248
50
 
249
- if cataloged_file.is_file():
250
- shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
251
- else:
252
- shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
253
- logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
51
+ (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
52
+ shutil.copy(file, run_catalog / relative_file_path)
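FileSystemCatalog now only declares its service name and catalog_location; the per-run folder is derived in get_catalog_location() from the run id and the inherited compute_data_folder. A hedged sketch of how the location resolves (the paths and run id below are illustrative):

```python
# Illustrative only: constructing the catalog directly instead of via the
# usual YAML config, just to show how the location is derived.
from extensions.catalog.file_system import FileSystemCatalog

catalog = FileSystemCatalog(
    catalog_location=".catalog",   # default comes from defaults.CATALOG_LOCATION_FOLDER
    compute_data_folder="data",    # field inherited from BaseCatalog
)

# With a run id of "my-run-1234", get_catalog_location() would return
# Path(".catalog") / "my-run-1234" / "data", i.e. .catalog/my-run-1234/data.
```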
@@ -0,0 +1,69 @@
1
+ import logging
2
+ from functools import lru_cache
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from cloudpathlib import CloudPath, S3Client, S3Path
7
+
8
+ from extensions.catalog.any_path import AnyPathCatalog
9
+ from runnable import defaults
10
+
11
+ logger = logging.getLogger(defaults.LOGGER_NAME)
12
+
13
+
14
+ @lru_cache
15
+ def get_minio_client(
16
+ endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
17
+ ) -> S3Client:
18
+ return S3Client(
19
+ endpoint_url=endpoint_url,
20
+ aws_access_key_id=aws_access_key_id,
21
+ aws_secret_access_key=aws_secret_access_key,
22
+ )
23
+
24
+
25
+ class MinioCatalog(AnyPathCatalog):
26
+ service_name: str = "minio"
27
+
28
+ endpoint_url: str = "http://localhost:9002"
29
+ aws_access_key_id: str = "minioadmin"
30
+ aws_secret_access_key: str = "minioadmin"
31
+ bucket: str = "runnable"
32
+
33
+ def get_summary(self) -> dict[str, Any]:
34
+ return {
35
+ "service_name": self.service_name,
36
+ "compute_data_folder": self.compute_data_folder,
37
+ "endpoint_url": self.endpoint_url,
38
+ "bucket": self.bucket,
39
+ }
40
+
41
+ def get_catalog_location(self) -> S3Path:
42
+ run_id = self._context.run_id
43
+
44
+ return S3Path(
45
+ f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
46
+ client=get_minio_client(
47
+ self.endpoint_url, self.aws_access_key_id, self.aws_secret_access_key
48
+ ),
49
+ )
50
+
51
+ def download_from_catalog(self, file: Path | CloudPath) -> None:
52
+ assert isinstance(file, S3Path)
53
+
54
+ relative_file_path = file.relative_to(self.get_catalog_location())
55
+
56
+ file_to_download = Path(self.compute_data_folder) / relative_file_path
57
+ file_to_download.parent.mkdir(parents=True, exist_ok=True)
58
+
59
+ file.download_to(file_to_download)
60
+
61
+ def upload_to_catalog(self, file: Path) -> None:
62
+ run_catalog = self.get_catalog_location()
63
+
64
+ relative_file_path = file.relative_to(self.compute_data_folder)
65
+ (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
66
+
67
+ file_in_cloud = run_catalog / file
68
+ assert isinstance(file_in_cloud, S3Path)
69
+ file_in_cloud.upload_from(file)
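The Minio catalog points cloudpathlib's S3Client at a Minio endpoint and stores objects under s3://&lt;bucket&gt;/&lt;run_id&gt;/&lt;compute_data_folder&gt;. A sketch with placeholder values follows; in practice these fields come from the pipeline's catalog config, and real credentials should not be hard-coded:

```python
# Placeholder endpoint/credentials; the values below match the defaults in the diff.
from extensions.catalog.minio import MinioCatalog

catalog = MinioCatalog(
    endpoint_url="http://localhost:9002",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
    bucket="runnable",
    compute_data_folder="data",
)

# For a run id of "my-run-1234" the catalog location resolves to
#   s3://runnable/my-run-1234/data
# and upload_to_catalog()/download_from_catalog() copy files to and from it.
```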
@@ -0,0 +1,11 @@
1
+ from cloudpathlib import S3Path
2
+
3
+ from extensions.catalog.any_path import AnyPathCatalog
4
+
5
+
6
+ class S3Catalog(AnyPathCatalog):
7
+ service_name: str = "s3"
8
+
9
+ def get_path(self, path: str) -> S3Path:
10
+ # TODO: Might need to assert the credentials are set
11
+ return S3Path(path)
@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
151
151
  # Nothing to get/put from the catalog
152
152
  return None
153
153
 
154
- compute_data_folder = self.get_effective_compute_data_folder()
155
-
156
154
  data_catalogs = []
157
155
  for name_pattern in node_catalog_settings.get(stage) or []:
158
156
  if stage == "get":
159
157
  data_catalog = self._context.catalog_handler.get(
160
158
  name=name_pattern,
161
- run_id=self._context.run_id,
162
- compute_data_folder=compute_data_folder,
163
159
  )
164
160
 
165
161
  elif stage == "put":
166
162
  data_catalog = self._context.catalog_handler.put(
167
163
  name=name_pattern,
168
- run_id=self._context.run_id,
169
- compute_data_folder=compute_data_folder,
170
- synced_catalogs=synced_catalogs,
171
164
  )
165
+ else:
166
+ raise Exception(f"Stage {stage} not supported")
172
167
 
173
168
  logger.debug(f"Added data catalog: {data_catalog} to step log")
174
169
  data_catalogs.extend(data_catalog)
175
170
 
176
171
  return data_catalogs
177
172
 
178
- def get_effective_compute_data_folder(self) -> str:
179
- """
180
- Get the effective compute data folder for the given stage.
181
- If there is nothing to catalog, we return None.
182
-
183
- The default is the compute data folder of the catalog but this can be over-ridden by the node.
184
-
185
- Args:
186
- stage (str): The stage we are in the process of cataloging
187
-
188
-
189
- Returns:
190
- str: The compute data folder as defined by the node defaulting to catalog handler
191
- """
192
- assert isinstance(self._context_node, BaseNode)
193
- compute_data_folder = self._context.catalog_handler.compute_data_folder
194
-
195
- catalog_settings = self._context_node._get_catalog_settings()
196
- effective_compute_data_folder = (
197
- catalog_settings.get("compute_data_folder", "") or compute_data_folder
198
- )
199
-
200
- return effective_compute_data_folder
201
-
202
173
  @property
203
174
  def step_attempt_number(self) -> int:
204
175
  """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
219
190
  )
220
191
  task_console.save_text(log_file_name)
221
192
  # Put the log file in the catalog
222
- self._context.catalog_handler.put(
223
- name=log_file_name, run_id=self._context.run_id
224
- )
193
+ self._context.catalog_handler.put(name=log_file_name)
225
194
  os.remove(log_file_name)
226
195
 
227
196
  def _execute_node(
runnable/catalog.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
2
  from abc import ABC, abstractmethod
3
3
  from typing import Any, Dict, List, Optional
4
4
 
5
- from pydantic import BaseModel, ConfigDict
5
+ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
  import runnable.context as context
8
8
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):
43
43
 
44
44
  service_name: str = ""
45
45
  service_type: str = "catalog"
46
+
47
+ compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
48
+
46
49
  model_config = ConfigDict(extra="forbid")
47
50
 
48
51
  @abstractmethod
@@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel):
52
55
  def _context(self):
53
56
  return context.run_context
54
57
 
55
- @property
56
- def compute_data_folder(self) -> str:
57
- return defaults.COMPUTE_DATA_FOLDER
58
-
59
58
  @abstractmethod
60
- def get(
61
- self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
62
- ) -> List[DataCatalog]:
59
+ def get(self, name: str) -> List[DataCatalog]:
63
60
  """
64
61
  Get the catalog item by 'name' for the 'run id' and store it in compute data folder.
65
62
 
@@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel):
79
76
  raise NotImplementedError
80
77
 
81
78
  @abstractmethod
82
- def put(
83
- self,
84
- name: str,
85
- run_id: str,
86
- compute_data_folder: str = "",
87
- synced_catalogs: Optional[List[DataCatalog]] = None,
88
- **kwargs,
89
- ) -> List[DataCatalog]:
79
+ def put(self, name: str) -> List[DataCatalog]:
90
80
  """
91
81
  Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.
92
82
 
@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
140
130
  def get_summary(self) -> Dict[str, Any]:
141
131
  return {}
142
132
 
143
- def get(
144
- self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
145
- ) -> List[DataCatalog]:
133
+ def get(self, name: str) -> List[DataCatalog]:
146
134
  """
147
135
  Does nothing
148
136
  """
149
137
  logger.info("Using a do-nothing catalog, doing nothing in get")
150
138
  return []
151
139
 
152
- def put(
153
- self,
154
- name: str,
155
- run_id: str,
156
- compute_data_folder: str = "",
157
- synced_catalogs: Optional[List[DataCatalog]] = None,
158
- **kwargs,
159
- ) -> List[DataCatalog]:
140
+ def put(self, name: str) -> List[DataCatalog]:
160
141
  """
161
142
  Does nothing
162
143
  """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
168
149
  Does nothing
169
150
  """
170
151
  logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
171
- logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
runnable/datastore.py CHANGED
@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):
114
114
 
115
115
  # If the object was serialised, get it from the catalog
116
116
  catalog_handler = context.run_context.catalog_handler
117
- catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
117
+ catalog_handler.get(name=self.file_name)
118
118
  obj = context.run_context.pickler.load(path=self.file_name)
119
119
  os.remove(self.file_name) # Remove after loading
120
120
  return obj
@@ -128,7 +128,7 @@ class ObjectParameter(BaseModel):
128
128
  context.run_context.pickler.dump(data=data, path=self.file_name)
129
129
 
130
130
  catalog_handler = context.run_context.catalog_handler
131
- catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
131
+ catalog_handler.put(name=self.file_name)
132
132
  os.remove(self.file_name) # Remove after loading
133
133
 
134
134
 
runnable/executor.py CHANGED
@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
173
173
  """
174
174
  ...
175
175
 
176
- @abstractmethod
177
- def get_effective_compute_data_folder(self) -> Optional[str]:
178
- """
179
- Get the effective compute data folder for the given stage.
180
- If there is nothing to catalog, we return None.
181
-
182
- The default is the compute data folder of the catalog but this can be over-ridden by the node.
183
-
184
- Args:
185
- stage (str): The stage we are in the process of cataloging
186
-
187
-
188
- Returns:
189
- Optional[str]: The compute data folder as defined by catalog handler or the node or None.
190
- """
191
- ...
192
-
193
176
  @abstractmethod
194
177
  def _sync_catalog(
195
178
  self, stage: str, synced_catalogs=None
runnable/tasks.py CHANGED
@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
501
501
  pm.execute_notebook(**kwds)
502
502
  task_console.print(out_file.getvalue())
503
503
 
504
- context.run_context.catalog_handler.put(
505
- name=notebook_output_path, run_id=context.run_context.run_id
506
- )
504
+ context.run_context.catalog_handler.put(name=notebook_output_path)
507
505
 
508
506
  client = PloomberClient.from_path(path=notebook_output_path)
509
507
  namespace = client.get_namespace()
runnable/utils.py CHANGED
@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
359
359
  return diff
360
360
 
361
361
 
362
- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True): # pylint: disable=C0116
363
- """Hashes the given bytesiter using the given hasher."""
364
- for block in bytesiter: # pragma: no cover
365
- hasher.update(block)
366
- return hasher.hexdigest() if ashexstr else hasher.digest() # pragma: no cover
362
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True): # pylint: disable=C0116
363
+ # """Hashes the given bytesiter using the given hasher."""
364
+ # for block in bytesiter: # pragma: no cover
365
+ # hasher.update(block)
366
+ # return hasher.hexdigest() if ashexstr else hasher.digest() # pragma: no cover
367
367
 
368
368
 
369
- def file_as_blockiter(afile, blocksize=65536): # pylint: disable=C0116
370
- """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
371
- # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
369
+ # def file_as_blockiter(afile, blocksize=65536): # pylint: disable=C0116
370
+ # """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
371
+ # # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
372
372
 
373
- """
374
- with afile: # pragma: no cover
375
- block = afile.read(blocksize)
376
- while len(block) > 0:
377
- yield block
378
- block = afile.read(blocksize)
373
+ # """
374
+ # with afile: # pragma: no cover
375
+ # block = afile.read(blocksize)
376
+ # while len(block) > 0:
377
+ # yield block
378
+ # block = afile.read(blocksize)
379
379
 
380
380
 
381
- def get_data_hash(file_name: str):
381
+ def get_data_hash(file_name: str) -> str:
382
382
  """Returns the hash of the data file.
383
383
 
384
384
  Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
389
389
  """
390
390
  # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
391
391
  # TODO: For a big file, we should only hash the first few bytes
392
- return hash_bytestr_iter(
393
- file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
394
- ) # pragma: no cover
392
+ with open(file_name, "rb") as f:
393
+ file_hash = hashlib.md5()
394
+ for chunk in iter(lambda: f.read(4096), b""):
395
+ file_hash.update(chunk)
396
+
397
+ return file_hash.hexdigest()
395
398
 
396
399
 
397
400
  # TODO: This is not the right place for this.
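get_data_hash now reads the file in 4 KB chunks and returns an MD5 hex digest, replacing the commented-out sha256 block-iterator helpers; it serves as a change-detection fingerprint for catalog entries rather than a security hash. A small usage sketch, with a placeholder file path:

```python
# Placeholder path; the digest ends up on DataCatalog.data_hash.
from runnable import utils

digest = utils.get_data_hash("data/output.csv")
print(digest)  # 32-character hexadecimal MD5 digest
```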
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: runnable
3
- Version: 0.25.0
3
+ Version: 0.26.0
4
4
  Summary: Add your description here
5
5
  Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
8
8
  Requires-Dist: click-plugins>=1.1.1
9
9
  Requires-Dist: click<=8.1.3
10
+ Requires-Dist: cloudpathlib>=0.20.0
10
11
  Requires-Dist: dill>=0.3.9
11
12
  Requires-Dist: pydantic>=2.10.3
12
13
  Requires-Dist: python-dotenv>=1.0.1
@@ -23,6 +24,8 @@ Provides-Extra: k8s
23
24
  Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
24
25
  Provides-Extra: notebook
25
26
  Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
27
+ Provides-Extra: s3
28
+ Requires-Dist: cloudpathlib[s3]; extra == 's3'
26
29
  Description-Content-Type: text/markdown
27
30
 
28
31
 
@@ -1,8 +1,11 @@
1
1
  extensions/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  extensions/catalog/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- extensions/catalog/file_system.py,sha256=VZEUx4X-GDSM8rJ_2kiCOyw1eek3roN0CiSB8wdUcOA,9307
4
+ extensions/catalog/any_path.py,sha256=aNjphoPIyllUfY2uNDFWD1ErM3Px6izSGr0-oGowN8k,7263
5
+ extensions/catalog/file_system.py,sha256=T_qFPFfrmykoAMc1rjNi_DBb437me8WPRcFglwAK744,1767
6
+ extensions/catalog/minio.py,sha256=D5ofitU75OJGZdPM8s-ALCHrSR6jawIe6blDo8ebiXM,2179
5
7
  extensions/catalog/pyproject.toml,sha256=lLNxY6v04c8I5QK_zKw_E6sJTArSJRA_V-79ktaA3Hk,279
8
+ extensions/catalog/s3.py,sha256=Sw5t8_kVRprn3uGGJCiHn7M9zw1CLaCOFj6YErtfG0o,287
6
9
  extensions/job_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
10
  extensions/job_executor/__init__.py,sha256=3zS2m6dg-L6SkKfL0kr4AxVUVmVJcepV6eipyMvQR6s,6006
8
11
  extensions/job_executor/k8s.py,sha256=V5k6Rnf_sAFqptVbCrWs_x5sl3x3fSHwO96IZoiJxKU,15342
@@ -14,7 +17,7 @@ extensions/nodes/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
17
  extensions/nodes/nodes.py,sha256=WdOmep4uxmY2mTOtsuVZ5QhYl96jqJprkG6jkIg7BVg,34774
15
18
  extensions/nodes/pyproject.toml,sha256=YTu-ETN3JNFSkMzzWeOwn4m-O2nbRH-PmiPBALDCUw4,278
16
19
  extensions/pipeline_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- extensions/pipeline_executor/__init__.py,sha256=bobyC4BWmDKCnMQsuyj9buQX7tZOFxuwU3Coq9-QgR0,25568
20
+ extensions/pipeline_executor/__init__.py,sha256=lk_QmbfzXNrgpF_KvMPuPpzxp0B8SJobDHWrK_0Q5FE,24359
18
21
  extensions/pipeline_executor/argo.py,sha256=nnlR_D6arQMUSgAevnW1RXeN48SoB1wVcEfQ4TBireY,34543
19
22
  extensions/pipeline_executor/local.py,sha256=H8s6AdML_9_f-vdGG_6k0y9FbLqAqvA1S_7xMNyARzY,1946
20
23
  extensions/pipeline_executor/local_container.py,sha256=HOT9I-cPDCvgy6_bzNEtl4jPhTyeYSn1GK7lplH3vDA,12515
@@ -33,14 +36,14 @@ extensions/secrets/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
33
36
  extensions/secrets/dotenv.py,sha256=FbYYd_pVuJuVuIDIvXbzKuSSQ9GPq7xJXTDbJMTQbhM,1583
34
37
  extensions/secrets/pyproject.toml,sha256=mLJNImNcBlbLKHh-0ugVWT9V83R4RibyyYDtBCSqVF4,282
35
38
  runnable/__init__.py,sha256=n14AnTUUEYxXlTJ6-YLT0tMmeFb7Co_3kNldV6pgKSs,662
36
- runnable/catalog.py,sha256=b9N40kTv1IBidzlWjkHcBGyYhq6qIDHZfBuFenzjsMI,4924
39
+ runnable/catalog.py,sha256=W_erYbLZ-ffuA9RQuWVqz1DUJOuWayf32ne32IDbAbc,4358
37
40
  runnable/cli.py,sha256=3BiKSj95h2Drn__YlchMPZ5rBMafuRb2OGIsVpbsO5Y,8788
38
41
  runnable/context.py,sha256=by5uepmuCP0dmM9BmsliXihSes5QEFejwAsmekcqylE,1388
39
- runnable/datastore.py,sha256=9y5enzn6AXLHLdwvgkdjGPrBkVlrcjfbaAHsst-lJzg,32466
42
+ runnable/datastore.py,sha256=ZobM1aVkgeUJ2fZYt63IFDsoNzObwc93hdByegS5YKQ,32396
40
43
  runnable/defaults.py,sha256=3o9IVGryyCE6PoQTOoaIaHHTbJGEzmdXMcwzOhwAYoI,3518
41
44
  runnable/entrypoints.py,sha256=xkUa568-7x9xALz13qW14DxS1nnLDKwLwdIBJZG-vM0,18982
42
45
  runnable/exceptions.py,sha256=LFbp0-Qxg2PAMLEVt7w2whhBxSG-5pzUEv5qN-Rc4_c,3003
43
- runnable/executor.py,sha256=ne-iRQqGuEmmuApnkBDz1_hokVcjFrbe7BvWqXCG1Ys,15684
46
+ runnable/executor.py,sha256=UCBBtyD0khl9QjT4SRTFMQDHDLWfJUC2U4_b3KQzaBE,15127
44
47
  runnable/graph.py,sha256=poQz5zcvq89ju_u5sYlunQLPbHnXTaUmjcvstPwvT4U,16536
45
48
  runnable/names.py,sha256=vn92Kv9ANROYSZX6Z4z1v_WA3WiEdIYmG6KEStBFZug,8134
46
49
  runnable/nodes.py,sha256=YU9u7r1ESzui1uVtJ1dgwdv1ozyJnF2k-MCFieT8CLI,17519
@@ -48,10 +51,10 @@ runnable/parameters.py,sha256=LyQb1d0SaFeI4PJ_yDYt9wArm9ThSPASWb36TwIdDUs,5213
48
51
  runnable/pickler.py,sha256=ydJ_eti_U1F4l-YacFp7BWm6g5vTn04UXye25S1HVok,2684
49
52
  runnable/sdk.py,sha256=T1nqDpLN9fULvvU9L-oY0EHqYdKUI9qk7oekLynm02Y,33568
50
53
  runnable/secrets.py,sha256=PXcEJw-4WPzeWRLfsatcPPyr1zkqgHzdRWRcS9vvpvM,2354
51
- runnable/tasks.py,sha256=SYy9eZOs1iCwu1IX5O9WyXk6DMpVsqaruQtMX-YX0bY,29207
52
- runnable/utils.py,sha256=hJUfRmIgU20weWPmBOHF22F6svBU0A_0nqifRMuXKs0,19822
53
- runnable-0.25.0.dist-info/METADATA,sha256=bpDSeecHPHb9qCHycgxbAtPFpuEx73t1bO_OAal8dN8,9945
54
- runnable-0.25.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
- runnable-0.25.0.dist-info/entry_points.txt,sha256=seek5WVGvwYALm8lZ0TfPXwG5NaCeUKjU8urF8k3gvY,1621
56
- runnable-0.25.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
57
- runnable-0.25.0.dist-info/RECORD,,
54
+ runnable/tasks.py,sha256=X6xijut7ffwpfYDcXoN6y0AcRVd7fWHs676DJ00Kma4,29134
55
+ runnable/utils.py,sha256=hBr7oGwGL2VgfITlQCTz-a1iwvvf7Mfl-HY8UdENZac,19929
56
+ runnable-0.26.0.dist-info/METADATA,sha256=IiPhsPo9Vws83V72pYoPNG7cdexyVi7Ctf49lsgv1bY,10047
57
+ runnable-0.26.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
58
+ runnable-0.26.0.dist-info/entry_points.txt,sha256=UCXvfBsVLpBjQY6znXNVzF6hof3Lro7oxtUD0t7kUp4,1704
59
+ runnable-0.26.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
60
+ runnable-0.26.0.dist-info/RECORD,,
@@ -4,6 +4,8 @@ runnable = runnable.cli:app
4
4
  [catalog]
5
5
  do-nothing = runnable.catalog:DoNothingCatalog
6
6
  file-system = extensions.catalog.file_system:FileSystemCatalog
7
+ minio = extensions.catalog.minio:MinioCatalog
8
+ s3 = extensions.catalog.s3:S3Catalog
7
9
 
8
10
  [job_executor]
9
11
  k8s-job = extensions.job_executor.k8s:K8sJobExecutor