runnable 0.25.0.tar.gz → 0.26.0.tar.gz

Files changed (60)
  1. {runnable-0.25.0 → runnable-0.26.0}/PKG-INFO +4 -1
  2. runnable-0.25.0/extensions/catalog/file_system.py → runnable-0.26.0/extensions/catalog/any_path.py +33 -85
  3. runnable-0.26.0/extensions/catalog/file_system.py +52 -0
  4. runnable-0.26.0/extensions/catalog/minio.py +69 -0
  5. runnable-0.26.0/extensions/catalog/s3.py +11 -0
  6. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/__init__.py +3 -34
  7. {runnable-0.25.0 → runnable-0.26.0}/pyproject.toml +7 -1
  8. {runnable-0.25.0 → runnable-0.26.0}/runnable/catalog.py +8 -28
  9. {runnable-0.25.0 → runnable-0.26.0}/runnable/datastore.py +2 -2
  10. {runnable-0.25.0 → runnable-0.26.0}/runnable/executor.py +0 -17
  11. {runnable-0.25.0 → runnable-0.26.0}/runnable/tasks.py +1 -3
  12. {runnable-0.25.0 → runnable-0.26.0}/runnable/utils.py +21 -18
  13. {runnable-0.25.0 → runnable-0.26.0}/.gitignore +0 -0
  14. {runnable-0.25.0 → runnable-0.26.0}/LICENSE +0 -0
  15. {runnable-0.25.0 → runnable-0.26.0}/README.md +0 -0
  16. {runnable-0.25.0 → runnable-0.26.0}/extensions/README.md +0 -0
  17. {runnable-0.25.0 → runnable-0.26.0}/extensions/__init__.py +0 -0
  18. {runnable-0.25.0 → runnable-0.26.0}/extensions/catalog/README.md +0 -0
  19. {runnable-0.25.0 → runnable-0.26.0}/extensions/catalog/pyproject.toml +0 -0
  20. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/README.md +0 -0
  21. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/__init__.py +0 -0
  22. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/k8s.py +0 -0
  23. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/k8s_job_spec.yaml +0 -0
  24. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/local.py +0 -0
  25. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/local_container.py +0 -0
  26. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/pyproject.toml +0 -0
  27. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/README.md +0 -0
  28. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/nodes.py +0 -0
  29. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/pyproject.toml +0 -0
  30. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/README.md +0 -0
  31. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/argo.py +0 -0
  32. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/local.py +0 -0
  33. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/local_container.py +0 -0
  34. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/mocked.py +0 -0
  35. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/pyproject.toml +0 -0
  36. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/retry.py +0 -0
  37. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/README.md +0 -0
  38. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/__init__.py +0 -0
  39. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/chunked_fs.py +0 -0
  40. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/db/implementation_FF.py +0 -0
  41. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/db/integration_FF.py +0 -0
  42. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/file_system.py +0 -0
  43. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/generic_chunked.py +0 -0
  44. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/pyproject.toml +0 -0
  45. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/README.md +0 -0
  46. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/dotenv.py +0 -0
  47. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/pyproject.toml +0 -0
  48. {runnable-0.25.0 → runnable-0.26.0}/runnable/__init__.py +0 -0
  49. {runnable-0.25.0 → runnable-0.26.0}/runnable/cli.py +0 -0
  50. {runnable-0.25.0 → runnable-0.26.0}/runnable/context.py +0 -0
  51. {runnable-0.25.0 → runnable-0.26.0}/runnable/defaults.py +0 -0
  52. {runnable-0.25.0 → runnable-0.26.0}/runnable/entrypoints.py +0 -0
  53. {runnable-0.25.0 → runnable-0.26.0}/runnable/exceptions.py +0 -0
  54. {runnable-0.25.0 → runnable-0.26.0}/runnable/graph.py +0 -0
  55. {runnable-0.25.0 → runnable-0.26.0}/runnable/names.py +0 -0
  56. {runnable-0.25.0 → runnable-0.26.0}/runnable/nodes.py +0 -0
  57. {runnable-0.25.0 → runnable-0.26.0}/runnable/parameters.py +0 -0
  58. {runnable-0.25.0 → runnable-0.26.0}/runnable/pickler.py +0 -0
  59. {runnable-0.25.0 → runnable-0.26.0}/runnable/sdk.py +0 -0
  60. {runnable-0.25.0 → runnable-0.26.0}/runnable/secrets.py +0 -0
{runnable-0.25.0 → runnable-0.26.0}/PKG-INFO

@@ -1,12 +1,13 @@
  Metadata-Version: 2.4
  Name: runnable
- Version: 0.25.0
+ Version: 0.26.0
  Summary: Add your description here
  Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
  License-File: LICENSE
  Requires-Python: >=3.10
  Requires-Dist: click-plugins>=1.1.1
  Requires-Dist: click<=8.1.3
+ Requires-Dist: cloudpathlib>=0.20.0
  Requires-Dist: dill>=0.3.9
  Requires-Dist: pydantic>=2.10.3
  Requires-Dist: python-dotenv>=1.0.1
@@ -23,6 +24,8 @@ Provides-Extra: k8s
  Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
  Provides-Extra: notebook
  Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
+ Provides-Extra: s3
+ Requires-Dist: cloudpathlib[s3]; extra == 's3'
  Description-Content-Type: text/markdown

runnable-0.25.0/extensions/catalog/file_system.py → runnable-0.26.0/extensions/catalog/any_path.py

@@ -1,17 +1,20 @@
  import logging
  import os
  import shutil
+ from abc import abstractmethod
  from pathlib import Path
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List
+
+ from cloudpathlib import CloudPath

  from runnable import defaults, utils
- from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
+ from runnable.catalog import BaseCatalog
  from runnable.datastore import DataCatalog

  logger = logging.getLogger(defaults.LOGGER_NAME)


- class FileSystemCatalog(BaseCatalog):
+ class AnyPathCatalog(BaseCatalog):
      """
      A Catalog handler that uses the local file system for cataloging.

@@ -27,22 +30,24 @@ class FileSystemCatalog(BaseCatalog):

      """

-     service_name: str = "file-system"
-     catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
+     @abstractmethod
+     def get_summary(self) -> Dict[str, Any]: ...

-     def get_catalog_location(self):
-         return self.catalog_location
+     @abstractmethod
+     def upload_to_catalog(self, file: Path) -> None: ...

-     def get_summary(self) -> Dict[str, Any]:
-         summary = {
-             "Catalog Location": self.get_catalog_location(),
-         }
+     @abstractmethod
+     def download_from_catalog(self, file: Path | CloudPath) -> None: ...

-         return summary
+     @abstractmethod
+     def get_catalog_location(self) -> Path | CloudPath:
+         """
+         For local file systems, this is the .catalog/run_id/compute_data_folder
+         For cloud systems, this is s3://bucket/run_id/compute_data_folder
+         """
+         ...

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the file by matching glob pattern to the name

@@ -56,29 +61,7 @@ class FileSystemCatalog(BaseCatalog):
          Returns:
              List(object) : A list of catalog objects
          """
-         logger.info(
-             f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
-         )
-
-         copy_to = self.compute_data_folder
-         if compute_data_folder:
-             copy_to = compute_data_folder
-
-         copy_to = Path(copy_to)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id / copy_to
-
-         logger.debug(
-             f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
-         )
-
-         if not utils.does_dir_exist(run_catalog):
-             msg = (
-                 f"Expected Catalog to be present at: {run_catalog} but not found.\n"
-                 "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
-             )
-             raise Exception(msg)
+         run_catalog = self.get_catalog_location()

          # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
          # We should also return a list of data hashes
@@ -97,34 +80,21 @@ class FileSystemCatalog(BaseCatalog):
              if str(file).endswith(".execution.log"):
                  continue

-             relative_file_path = file.relative_to(run_catalog)
+             self.download_from_catalog(file)
+             relative_file_path = file.relative_to(run_catalog)  # type: ignore

              data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
              data_catalog.catalog_relative_path = str(relative_file_path)
-             data_catalog.data_hash = utils.get_data_hash(str(file))
+             data_catalog.data_hash = utils.get_data_hash(str(relative_file_path))
              data_catalog.stage = "get"
              data_catalogs.append(data_catalog)

-             # Make the directory in the data folder if required
-             Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
-             shutil.copy(file, copy_to / relative_file_path)
-
-             logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
-
          if not data_catalogs:
              raise Exception(f"Did not find any files matching {name} in {run_catalog}")

          return data_catalogs

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the files matching the glob pattern into the catalog.

@@ -142,26 +112,16 @@ class FileSystemCatalog(BaseCatalog):
          Returns:
              List(object) : A list of catalog objects
          """
+         run_id = self._context.run_id
          logger.info(
              f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
          )

-         copy_from = self.compute_data_folder
-         if compute_data_folder:
-             copy_from = compute_data_folder
-         copy_from = Path(copy_from)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id
-         utils.safe_make_dir(run_catalog)
+         copy_from = Path(self.compute_data_folder)

-         logger.debug(
-             f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
-         )
-
-         if not utils.does_dir_exist(copy_from):
+         if not copy_from.is_dir():
              msg = (
-                 f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
+                 f"Expected compute data folder to be present at: {copy_from} but not found. \n"
                  "Note: runnable does not create the compute data folder for you. Please ensure that the "
                  "folder exists.\n"
              )
@@ -169,8 +129,7 @@ class FileSystemCatalog(BaseCatalog):

          # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
          # We should also return a list of datastore.DataCatalog items
-
-         glob_files = copy_from.glob(name)  # type: ignore
+         glob_files = copy_from.glob(name)
          logger.debug(
              f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
          )
@@ -182,10 +141,9 @@ class FileSystemCatalog(BaseCatalog):
                  # Need not add a data catalog for the folder
                  continue

-             relative_file_path = file.relative_to(".")
+             relative_file_path = file.relative_to(copy_from)

              data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
              data_catalog.catalog_relative_path = (
                  run_id + os.sep + str(relative_file_path)
              )
@@ -193,18 +151,8 @@ class FileSystemCatalog(BaseCatalog):
              data_catalog.stage = "put"
              data_catalogs.append(data_catalog)

-             if is_catalog_out_of_sync(data_catalog, synced_catalogs):
-                 logger.info(f"{data_catalog.name} was found to be changed, syncing")
-
-                 # Make the directory in the catalog if required
-                 Path(run_catalog / relative_file_path.parent).mkdir(
-                     parents=True, exist_ok=True
-                 )
-                 shutil.copy(file, run_catalog / relative_file_path)
-             else:
-                 logger.info(
-                     f"{data_catalog.name} was found to be unchanged, ignoring syncing"
-                 )
+             # TODO: Think about syncing only if the file is changed
+             self.upload_to_catalog(file)

          if not data_catalogs:
              raise Exception(f"Did not find any files matching {name} in {copy_from}")
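The refactor above turns the old FileSystemCatalog into a template-method base class: get() and put() keep the glob matching and DataCatalog bookkeeping, while the four abstract methods isolate everything storage-specific. A minimal sketch of what a concrete subclass has to supply (illustrative only; the class name and paths here are hypothetical, not part of the release):

from pathlib import Path
from typing import Any, Dict

from cloudpathlib import CloudPath

from extensions.catalog.any_path import AnyPathCatalog


class ScratchCatalog(AnyPathCatalog):
    """Toy catalog rooted at /tmp/scratch-catalog, for illustration."""

    service_name: str = "scratch"

    def get_summary(self) -> Dict[str, Any]:
        return {"catalog_root": "/tmp/scratch-catalog"}

    def get_catalog_location(self) -> Path:
        # Mirrors the documented shape: <root>/<run_id>/<compute_data_folder>
        return Path("/tmp/scratch-catalog") / self._context.run_id / self.compute_data_folder

    def download_from_catalog(self, file: Path | CloudPath) -> None:
        # Copy one file from the catalog tree into compute_data_folder
        ...

    def upload_to_catalog(self, file: Path) -> None:
        # Copy one file from compute_data_folder into the catalog tree
        ...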
runnable-0.26.0/extensions/catalog/file_system.py (new file)

@@ -0,0 +1,52 @@
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Any
+
+ from cloudpathlib import CloudPath
+ from pydantic import Field
+
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ class FileSystemCatalog(AnyPathCatalog):
+     service_name: str = "file-system"
+
+     catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
+
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "compute_data_folder": self.compute_data_folder,
+             "catalog_location": self.catalog_location,
+         }
+
+     def get_catalog_location(self) -> Path:
+         run_id = self._context.run_id
+         return Path(self.catalog_location) / run_id / self.compute_data_folder
+
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, Path)
+
+         run_catalog = self.get_catalog_location()
+         relative_file_path = file.relative_to(run_catalog)
+
+         copy_to = self.compute_data_folder
+         # Make the directory in the data folder if required
+         Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, copy_to / relative_file_path)
+
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+         run_catalog.mkdir(parents=True, exist_ok=True)
+
+         logger.debug(
+             f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
+         )
+
+         relative_file_path = file.relative_to(self.compute_data_folder)
+
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, run_catalog / relative_file_path)
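One behavioral detail worth noting: paths are now made relative to the compute data folder rather than ".", so nested files keep their layout under the run catalog. The path arithmetic in isolation (all paths here are made up for illustration):

from pathlib import Path

file = Path("data/raw/input.csv")              # a match inside compute_data_folder
relative = file.relative_to("data")            # raw/input.csv
run_catalog = Path(".catalog") / "run-123" / "data"
print(run_catalog / relative)                  # .catalog/run-123/data/raw/input.csv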
runnable-0.26.0/extensions/catalog/minio.py (new file)

@@ -0,0 +1,69 @@
+ import logging
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ from cloudpathlib import CloudPath, S3Client, S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ @lru_cache
+ def get_minio_client(
+     endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+ ) -> S3Client:
+     return S3Client(
+         endpoint_url=endpoint_url,
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+     )
+
+
+ class MinioCatalog(AnyPathCatalog):
+     service_name: str = "minio"
+
+     endpoint_url: str = "http://localhost:9002"
+     aws_access_key_id: str = "minioadmin"
+     aws_secret_access_key: str = "minioadmin"
+     bucket: str = "runnable"
+
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "service_name": self.service_name,
+             "compute_data_folder": self.compute_data_folder,
+             "endpoint_url": self.endpoint_url,
+             "bucket": self.bucket,
+         }
+
+     def get_catalog_location(self) -> S3Path:
+         run_id = self._context.run_id
+
+         return S3Path(
+             f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
+             client=get_minio_client(
+                 self.endpoint_url, self.aws_access_key_id, self.aws_secret_access_key
+             ),
+         )
+
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, S3Path)
+
+         relative_file_path = file.relative_to(self.get_catalog_location())
+
+         file_to_download = Path(self.compute_data_folder) / relative_file_path
+         file_to_download.parent.mkdir(parents=True, exist_ok=True)
+
+         file.download_to(file_to_download)
+
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+
+         relative_file_path = file.relative_to(self.compute_data_folder)
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+
+         file_in_cloud = run_catalog / file
+         assert isinstance(file_in_cloud, S3Path)
+         file_in_cloud.upload_from(file)
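The @lru_cache on get_minio_client means every call with the same (endpoint, key, secret) tuple returns the same S3Client, so all S3Path objects for a run share one underlying client instead of creating one per file transfer. A quick sketch of that caching behavior:

from extensions.catalog.minio import get_minio_client

a = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
b = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
assert a is b  # identical arguments yield the identical cached S3Client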
runnable-0.26.0/extensions/catalog/s3.py (new file)

@@ -0,0 +1,11 @@
+ from cloudpathlib import S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+
+
+ class S3Catalog(AnyPathCatalog):
+     service_name: str = "s3"
+
+     def get_path(self, path: str) -> S3Path:
+         # TODO: Might need to assert the credentials are set
+         return S3Path(path)
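Both new catalogs are registered as plugins under the 'catalog' entry-point group (see the pyproject.toml diff below). Once runnable 0.26.0 is installed, the registrations can be enumerated with the standard library; a sketch, valid on the package's Python >=3.10 floor:

from importlib.metadata import entry_points

for ep in entry_points(group="catalog"):
    print(f"{ep.name} -> {ep.value}")
# Expected entries (order not guaranteed):
# do-nothing -> runnable.catalog:DoNothingCatalog
# file-system -> extensions.catalog.file_system:FileSystemCatalog
# s3 -> extensions.catalog.s3:S3Catalog
# minio -> extensions.catalog.minio:MinioCatalog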
{runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/__init__.py

@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
          # Nothing to get/put from the catalog
          return None

-         compute_data_folder = self.get_effective_compute_data_folder()
-
          data_catalogs = []
          for name_pattern in node_catalog_settings.get(stage) or []:
              if stage == "get":
                  data_catalog = self._context.catalog_handler.get(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
                  )

              elif stage == "put":
                  data_catalog = self._context.catalog_handler.put(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
-                     synced_catalogs=synced_catalogs,
                  )
+             else:
+                 raise Exception(f"Stage {stage} not supported")

              logger.debug(f"Added data catalog: {data_catalog} to step log")
              data_catalogs.extend(data_catalog)

          return data_catalogs

-     def get_effective_compute_data_folder(self) -> str:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             str: The compute data folder as defined by the node defaulting to catalog handler
-         """
-         assert isinstance(self._context_node, BaseNode)
-         compute_data_folder = self._context.catalog_handler.compute_data_folder
-
-         catalog_settings = self._context_node._get_catalog_settings()
-         effective_compute_data_folder = (
-             catalog_settings.get("compute_data_folder", "") or compute_data_folder
-         )
-
-         return effective_compute_data_folder
-
      @property
      def step_attempt_number(self) -> int:
          """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
          )
          task_console.save_text(log_file_name)
          # Put the log file in the catalog
-         self._context.catalog_handler.put(
-             name=log_file_name, run_id=self._context.run_id
-         )
+         self._context.catalog_handler.put(name=log_file_name)
          os.remove(log_file_name)

      def _execute_node(
{runnable-0.25.0 → runnable-0.26.0}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "runnable"
- version = "0.25.0"
+ version = "0.26.0"
  description = "Add your description here"
  readme = "README.md"
  authors = [
@@ -18,6 +18,7 @@ dependencies = [
      "setuptools>=75.6.0",
      "python-dotenv>=1.0.1",
      "typer>=0.15.1",
+     "cloudpathlib>=0.20.0",
  ]

  [project.optional-dependencies]
@@ -33,6 +34,9 @@ examples = [
  k8s = [
      "kubernetes>=31.0.0",
  ]
+ s3 = [
+     "cloudpathlib[s3]"
+ ]

  [dependency-groups]
  dev = [
@@ -112,6 +116,8 @@ include = [
  [project.entry-points.'catalog']
  "do-nothing" = "runnable.catalog:DoNothingCatalog"
  "file-system" = "extensions.catalog.file_system:FileSystemCatalog"
+ "s3" = "extensions.catalog.s3:S3Catalog"
+ "minio" = "extensions.catalog.minio:MinioCatalog"

  [project.entry-points.'run_log_store']
  "buffered" = "runnable.datastore:BufferRunLogstore"
{runnable-0.25.0 → runnable-0.26.0}/runnable/catalog.py

@@ -2,7 +2,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List, Optional

- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field

  import runnable.context as context
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):

      service_name: str = ""
      service_type: str = "catalog"
+
+     compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
      model_config = ConfigDict(extra="forbid")

      @abstractmethod
@@ -52,14 +55,8 @@
      def _context(self):
          return context.run_context

-     @property
-     def compute_data_folder(self) -> str:
-         return defaults.COMPUTE_DATA_FOLDER
-
      @abstractmethod
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the catalog item by 'name' for the 'run id' and store it in compute data folder.

@@ -79,14 +76,7 @@
          raise NotImplementedError

      @abstractmethod
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.

@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
      def get_summary(self) -> Dict[str, Any]:
          return {}

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing in get")
          return []

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
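Besides narrowing get/put to a single name argument, moving compute_data_folder from a read-only property to a pydantic Field makes it configurable per catalog instance rather than always being defaults.COMPUTE_DATA_FOLDER. A sketch of what that enables (the folder name is illustrative):

from extensions.catalog.file_system import FileSystemCatalog

# As a declared Field, compute_data_folder participates in pydantic
# validation and can be overridden at construction time.
catalog = FileSystemCatalog(compute_data_folder="artifacts")
print(catalog.compute_data_folder)  # artifacts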
{runnable-0.25.0 → runnable-0.26.0}/runnable/datastore.py

@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):

          # If the object was serialised, get it from the catalog
          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.get(name=self.file_name)
          obj = context.run_context.pickler.load(path=self.file_name)
          os.remove(self.file_name)  # Remove after loading
          return obj
@@ -128,7 +128,7 @@
          context.run_context.pickler.dump(data=data, path=self.file_name)

          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.put(name=self.file_name)
          os.remove(self.file_name)  # Remove after loading

{runnable-0.25.0 → runnable-0.26.0}/runnable/executor.py

@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
          """
          ...

-     @abstractmethod
-     def get_effective_compute_data_folder(self) -> Optional[str]:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-         """
-         ...
-
      @abstractmethod
      def _sync_catalog(
          self, stage: str, synced_catalogs=None
{runnable-0.25.0 → runnable-0.26.0}/runnable/tasks.py

@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
          pm.execute_notebook(**kwds)
          task_console.print(out_file.getvalue())

-         context.run_context.catalog_handler.put(
-             name=notebook_output_path, run_id=context.run_context.run_id
-         )
+         context.run_context.catalog_handler.put(name=notebook_output_path)

          client = PloomberClient.from_path(path=notebook_output_path)
          namespace = client.get_namespace()
{runnable-0.25.0 → runnable-0.26.0}/runnable/utils.py

@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
      return diff


- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
-     """Hashes the given bytesiter using the given hasher."""
-     for block in bytesiter:  # pragma: no cover
-         hasher.update(block)
-     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
+ #     """Hashes the given bytesiter using the given hasher."""
+ #     for block in bytesiter:  # pragma: no cover
+ #         hasher.update(block)
+ #     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover


- def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
-     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+ # def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
+ #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+ #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.

-     """
-     with afile:  # pragma: no cover
-         block = afile.read(blocksize)
-         while len(block) > 0:
-             yield block
-             block = afile.read(blocksize)
+ #     """
+ #     with afile:  # pragma: no cover
+ #         block = afile.read(blocksize)
+ #         while len(block) > 0:
+ #             yield block
+ #             block = afile.read(blocksize)


- def get_data_hash(file_name: str):
+ def get_data_hash(file_name: str) -> str:
      """Returns the hash of the data file.

      Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
      """
      # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
      # TODO: For a big file, we should only hash the first few bytes
-     return hash_bytestr_iter(
-         file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
-     )  # pragma: no cover
+     with open(file_name, "rb") as f:
+         file_hash = hashlib.md5()
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+
+     return file_hash.hexdigest()


  # TODO: This is not the right place for this.
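Note that this rewrite also switches the digest from SHA-256 to MD5, so hashes recorded by 0.25.0 and 0.26.0 runs are not comparable. On Python 3.11+ the same chunked read can be expressed with hashlib.file_digest; a sketch for comparison only (the package supports 3.10, so it does not use this):

import hashlib

def get_data_hash_alt(file_name: str) -> str:
    # hashlib.file_digest (Python 3.11+) reads the file in blocks internally
    # and produces the same MD5 hex digest as the explicit loop above.
    with open(file_name, "rb") as f:
        return hashlib.file_digest(f, "md5").hexdigest()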