runnable-0.17.1-py3-none-any.whl → runnable-0.19.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. extensions/README.md +0 -0
  2. extensions/__init__.py +0 -0
  3. extensions/catalog/README.md +0 -0
  4. extensions/catalog/file_system.py +253 -0
  5. extensions/catalog/pyproject.toml +14 -0
  6. extensions/job_executor/README.md +0 -0
  7. extensions/job_executor/__init__.py +160 -0
  8. extensions/job_executor/k8s.py +484 -0
  9. extensions/job_executor/k8s_job_spec.yaml +37 -0
  10. extensions/job_executor/local.py +61 -0
  11. extensions/job_executor/local_container.py +192 -0
  12. extensions/job_executor/pyproject.toml +16 -0
  13. extensions/nodes/README.md +0 -0
  14. extensions/nodes/nodes.py +954 -0
  15. extensions/nodes/pyproject.toml +15 -0
  16. extensions/pipeline_executor/README.md +0 -0
  17. extensions/pipeline_executor/__init__.py +644 -0
  18. extensions/pipeline_executor/argo.py +1307 -0
  19. extensions/pipeline_executor/argo_specification.yaml +51 -0
  20. extensions/pipeline_executor/local.py +62 -0
  21. extensions/pipeline_executor/local_container.py +362 -0
  22. extensions/pipeline_executor/mocked.py +161 -0
  23. extensions/pipeline_executor/pyproject.toml +16 -0
  24. extensions/pipeline_executor/retry.py +180 -0
  25. extensions/run_log_store/README.md +0 -0
  26. extensions/run_log_store/__init__.py +0 -0
  27. extensions/run_log_store/chunked_fs.py +113 -0
  28. extensions/run_log_store/db/implementation_FF.py +163 -0
  29. extensions/run_log_store/db/integration_FF.py +0 -0
  30. extensions/run_log_store/file_system.py +145 -0
  31. extensions/run_log_store/generic_chunked.py +599 -0
  32. extensions/run_log_store/pyproject.toml +15 -0
  33. extensions/secrets/README.md +0 -0
  34. extensions/secrets/dotenv.py +62 -0
  35. extensions/secrets/pyproject.toml +15 -0
  36. runnable/__init__.py +1 -0
  37. runnable/catalog.py +1 -2
  38. runnable/entrypoints.py +1 -5
  39. runnable/executor.py +1 -1
  40. runnable/parameters.py +0 -9
  41. runnable/utils.py +5 -25
  42. {runnable-0.17.1.dist-info → runnable-0.19.0.dist-info}/METADATA +1 -7
  43. runnable-0.19.0.dist-info/RECORD +58 -0
  44. {runnable-0.17.1.dist-info → runnable-0.19.0.dist-info}/entry_points.txt +1 -0
  45. runnable-0.17.1.dist-info/RECORD +0 -23
  46. {runnable-0.17.1.dist-info → runnable-0.19.0.dist-info}/WHEEL +0 -0
  47. {runnable-0.17.1.dist-info → runnable-0.19.0.dist-info}/licenses/LICENSE +0 -0
extensions/README.md ADDED
File without changes
extensions/__init__.py ADDED
File without changes
extensions/catalog/README.md ADDED
File without changes
extensions/catalog/file_system.py ADDED
@@ -0,0 +1,253 @@
+import logging
+import os
+import shutil
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from runnable import defaults, utils
+from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
+from runnable.datastore import DataCatalog
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+class FileSystemCatalog(BaseCatalog):
+    """
+    A Catalog handler that uses the local file system for cataloging.
+
+    Note: Do not use this if the steps of the pipeline run on different compute environments.
+
+    Example config:
+
+    catalog:
+      type: file-system
+      config:
+        catalog_location: The location to store the catalog.
+        compute_data_folder: The folder to source the data from.
+
+    """
+
+    service_name: str = "file-system"
+    catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
+
+    def get_catalog_location(self):
+        return self.catalog_location
+
+    def get_summary(self) -> Dict[str, Any]:
+        summary = {
+            "Catalog Location": self.get_catalog_location(),
+        }
+
+        return summary
+
+    def get(
+        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
+    ) -> List[DataCatalog]:
+        """
+        Get the file by matching glob pattern to the name
+
+        Args:
+            name ([str]): A glob matching the file name
+            run_id ([str]): The run id
+
+        Raises:
+            Exception: If the catalog location does not exist
+
+        Returns:
+            List(object) : A list of catalog objects
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
+        )
+
+        copy_to = self.compute_data_folder
+        if compute_data_folder:
+            copy_to = compute_data_folder
+
+        copy_to = Path(copy_to)  # type: ignore
+
+        catalog_location = self.get_catalog_location()
+        run_catalog = Path(catalog_location) / run_id / copy_to
+
+        logger.debug(
+            f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
+        )
+
+        if not utils.does_dir_exist(run_catalog):
+            msg = (
+                f"Expected Catalog to be present at: {run_catalog} but not found.\n"
+                "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
+            )
+            raise Exception(msg)
+
+        # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
+        # We should also return a list of data hashes
+        glob_files = run_catalog.glob(name)
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
+        )
+
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+
+            if str(file).endswith(".execution.log"):
+                continue
+
+            relative_file_path = file.relative_to(run_catalog)
+
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_handler_location = catalog_location
+            data_catalog.catalog_relative_path = str(relative_file_path)
+            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.stage = "get"
+            data_catalogs.append(data_catalog)
+
+            # Make the directory in the data folder if required
+            Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+            shutil.copy(file, copy_to / relative_file_path)
+
+            logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
+
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {run_catalog}")
+
+        return data_catalogs
+
+    def put(
+        self,
+        name: str,
+        run_id: str,
+        compute_data_folder: str = "",
+        synced_catalogs: Optional[List[DataCatalog]] = None,
+        **kwargs,
+    ) -> List[DataCatalog]:
+        """
+        Put the files matching the glob pattern into the catalog.
+
+        If previously synced catalogs are provided, and no changes were observed, we do not sync them.
+
+        Args:
+            name (str): The glob pattern of the files to catalog
+            run_id (str): The run id of the run
+            compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
+            synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
+
+        Raises:
+            Exception: If the compute data folder does not exist.
+
+        Returns:
+            List(object) : A list of catalog objects
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
+        )
+
+        copy_from = self.compute_data_folder
+        if compute_data_folder:
+            copy_from = compute_data_folder
+        copy_from = Path(copy_from)  # type: ignore
+
+        catalog_location = self.get_catalog_location()
+        run_catalog = Path(catalog_location) / run_id
+        utils.safe_make_dir(run_catalog)
+
+        logger.debug(
+            f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
+        )
+
+        if not utils.does_dir_exist(copy_from):
+            msg = (
+                f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
+                "Note: runnable does not create the compute data folder for you. Please ensure that the "
+                "folder exists.\n"
+            )
+            raise Exception(msg)
+
+        # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
+        # We should also return a list of datastore.DataCatalog items
+
+        glob_files = copy_from.glob(name)  # type: ignore
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
+        )
+
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+
+            relative_file_path = file.relative_to(".")
+
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_handler_location = catalog_location
+            data_catalog.catalog_relative_path = (
+                run_id + os.sep + str(relative_file_path)
+            )
+            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.stage = "put"
+            data_catalogs.append(data_catalog)
+
+            if is_catalog_out_of_sync(data_catalog, synced_catalogs):
+                logger.info(f"{data_catalog.name} was found to be changed, syncing")
+
+                # Make the directory in the catalog if required
+                Path(run_catalog / relative_file_path.parent).mkdir(
+                    parents=True, exist_ok=True
+                )
+                shutil.copy(file, run_catalog / relative_file_path)
+            else:
+                logger.info(
+                    f"{data_catalog.name} was found to be unchanged, ignoring syncing"
+                )
+
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {copy_from}")
+
+        return data_catalogs
+
+    def sync_between_runs(self, previous_run_id: str, run_id: str):
+        """
+        Given the previous run id, sync the catalogs between the current one and previous
+
+        Args:
+            previous_run_id (str): The previous run id to sync the catalogs from
+            run_id (str): The run_id to which the data catalogs should be synced to.
+
+        Raises:
+            Exception: If the previous run log does not exist in the catalog
+
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and syncing catalogs"
+            "between old: {previous_run_id} to new: {run_id}"
+        )
+
+        catalog_location = Path(self.get_catalog_location())
+        run_catalog = catalog_location / run_id
+        utils.safe_make_dir(run_catalog)
+
+        if not utils.does_dir_exist(catalog_location / previous_run_id):
+            msg = (
+                f"Catalogs from previous run : {previous_run_id} are not found.\n"
+                "Note: Please provision the catalog objects generated by previous run in the same catalog location"
+                " as the current run, even if the catalog handler for the previous run was different"
+            )
+            raise Exception(msg)
+
+        cataloged_files = list((catalog_location / previous_run_id).glob("*"))
+
+        for cataloged_file in cataloged_files:
+            if str(cataloged_file).endswith("execution.log"):
+                continue
+
+            if cataloged_file.is_file():
+                shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
+            else:
+                shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
+            logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
extensions/catalog/pyproject.toml ADDED
@@ -0,0 +1,14 @@
+[project]
+name = "catalog"
+version = "0.0.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = []
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
extensions/job_executor/README.md ADDED
File without changes
extensions/job_executor/__init__.py ADDED
@@ -0,0 +1,160 @@
+import logging
+import os
+from typing import Dict, List, Optional
+
+from runnable import context, defaults, exceptions, parameters, utils
+from runnable.datastore import DataCatalog, JobLog, JsonParameter
+from runnable.executor import BaseJobExecutor
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+class GenericJobExecutor(BaseJobExecutor):
+    """
+    The skeleton of an executor class.
+    Any implementation of an executor should inherit this class and over-ride accordingly.
+
+    This is a loaded base class which has a lot of methods already implemented for "typical" executions.
+    Look at the function docs to understand how to use them appropriately.
+
+    For any implementation:
+    1). Who/when should the run log be set up?
+    2). Who/When should the step log be set up?
+
+    """
+
+    service_name: str = ""
+    service_type: str = "job_executor"
+
+    @property
+    def _context(self):
+        assert context.run_context
+        return context.run_context
+
+    def _get_parameters(self) -> Dict[str, JsonParameter]:
+        """
+        Consolidate the parameters from the environment variables
+        and the parameters file.
+
+        The parameters defined in the environment variables take precedence over the parameters file.
+
+        Returns:
+            _type_: _description_
+        """
+        params: Dict[str, JsonParameter] = {}
+        if self._context.parameters_file:
+            user_defined = utils.load_yaml(self._context.parameters_file) or {}
+
+            for key, value in user_defined.items():
+                params[key] = JsonParameter(value=value, kind="json")
+
+        # Update these with some from the environment variables
+        params.update(parameters.get_user_set_parameters())
+        logger.debug(f"parameters as seen by executor: {params}")
+        return params
+
+    def _set_up_run_log(self, exists_ok=False):
+        """
+        Create a run log and put that in the run log store
+
+        If exists_ok, we allow the run log to be already present in the run log store.
+        """
+        try:
+            attempt_run_log = self._context.run_log_store.get_run_log_by_id(
+                run_id=self._context.run_id, full=False
+            )
+
+            logger.warning(
+                f"The run log by id: {self._context.run_id} already exists, is this designed?"
+            )
+            raise exceptions.RunLogExistsError(
+                f"The run log by id: {self._context.run_id} already exists and is {attempt_run_log.status}"
+            )
+        except exceptions.RunLogNotFoundError:
+            pass
+        except exceptions.RunLogExistsError:
+            if exists_ok:
+                return
+            raise
+
+        # Consolidate and get the parameters
+        params = self._get_parameters()
+
+        self._context.run_log_store.create_run_log(
+            run_id=self._context.run_id,
+            tag=self._context.tag,
+            status=defaults.PROCESSING,
+            dag_hash=self._context.dag_hash,
+        )
+        # Any interaction with run log store attributes should happen via API if available.
+        self._context.run_log_store.set_parameters(
+            run_id=self._context.run_id, parameters=params
+        )
+
+        # Update run_config
+        run_config = utils.get_run_config()
+        logger.debug(f"run_config as seen by executor: {run_config}")
+        self._context.run_log_store.set_run_config(
+            run_id=self._context.run_id, run_config=run_config
+        )
+
+    @property
+    def step_attempt_number(self) -> int:
+        """
+        The attempt number of the current step.
+        Orchestrators should use this step to submit multiple attempts of the job.
+
+        Returns:
+            int: The attempt number of the current step. Defaults to 1.
+        """
+        return int(os.environ.get(defaults.ATTEMPT_NUMBER, 1))
+
+    def add_code_identities(self, job_log: JobLog, **kwargs):
+        """
+        Add code identities specific to the implementation.
+
+        The Base class has an implementation of adding git code identities.
+
+        Args:
+            step_log (object): The step log object
+            node (BaseNode): The node we are adding the step log for
+        """
+        job_log.code_identities.append(utils.get_git_code_identity())
+
+    def send_return_code(self, stage="traversal"):
+        """
+        Convenience function used by pipeline to send return code to the caller of the cli
+
+        Raises:
+            Exception: If the pipeline execution failed
+        """
+        run_id = self._context.run_id
+
+        run_log = self._context.run_log_store.get_run_log_by_id(
+            run_id=run_id, full=False
+        )
+        if run_log.status == defaults.FAIL:
+            raise exceptions.ExecutionFailedError(run_id=run_id)
+
+    def _sync_catalog(
+        self,
+        catalog_settings=Optional[List[str]],
+    ) -> List[DataCatalog] | None:
+        if not catalog_settings:
+            logger.info("No catalog settings found")
+            return None
+
+        compute_data_folder = self._context.catalog_handler.compute_data_folder
+
+        data_catalogs = []
+        for name_pattern in catalog_settings:
+            data_catalog = self._context.catalog_handler.put(
+                name=name_pattern,
+                run_id=self._context.run_id,
+                compute_data_folder=compute_data_folder,
+            )
+
+            logger.debug(f"Added data catalog: {data_catalog} to job log")
+            data_catalogs.extend(data_catalog)
+
+        return data_catalogs
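Aside: _get_parameters above layers parameters from two sources, with environment variables taking precedence over the parameters file, as its docstring states. A minimal sketch of that precedence rule follows; the RUNNABLE_PRM_ prefix and JSON-encoded values are assumptions for illustration, and consolidate is a hypothetical stand-in, not the runnable API:

    import json
    import os

    def consolidate(file_params: dict, prefix: str = "RUNNABLE_PRM_") -> dict:
        # Values from the parameters file form the base layer.
        params = dict(file_params)
        for key, value in os.environ.items():
            if key.startswith(prefix):
                # Environment variables overwrite file values: they take precedence.
                # (Prefix and JSON decoding are assumptions, not confirmed behavior.)
                params[key[len(prefix):].lower()] = json.loads(value)
        return params

    os.environ["RUNNABLE_PRM_RETRIES"] = "3"
    print(consolidate({"retries": 1, "name": "job"}))
    # {'retries': 3, 'name': 'job'}

In the diff itself, the environment-variable side is handled by parameters.get_user_set_parameters(); the sketch only shows why params.update(...) runs after the parameters file is loaded.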