runnable 0.50.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. extensions/README.md +0 -0
  2. extensions/__init__.py +0 -0
  3. extensions/catalog/README.md +0 -0
  4. extensions/catalog/any_path.py +214 -0
  5. extensions/catalog/file_system.py +52 -0
  6. extensions/catalog/minio.py +72 -0
  7. extensions/catalog/pyproject.toml +14 -0
  8. extensions/catalog/s3.py +11 -0
  9. extensions/job_executor/README.md +0 -0
  10. extensions/job_executor/__init__.py +236 -0
  11. extensions/job_executor/emulate.py +70 -0
  12. extensions/job_executor/k8s.py +553 -0
  13. extensions/job_executor/k8s_job_spec.yaml +37 -0
  14. extensions/job_executor/local.py +35 -0
  15. extensions/job_executor/local_container.py +161 -0
  16. extensions/job_executor/pyproject.toml +16 -0
  17. extensions/nodes/README.md +0 -0
  18. extensions/nodes/__init__.py +0 -0
  19. extensions/nodes/conditional.py +301 -0
  20. extensions/nodes/fail.py +78 -0
  21. extensions/nodes/loop.py +394 -0
  22. extensions/nodes/map.py +477 -0
  23. extensions/nodes/parallel.py +281 -0
  24. extensions/nodes/pyproject.toml +15 -0
  25. extensions/nodes/stub.py +93 -0
  26. extensions/nodes/success.py +78 -0
  27. extensions/nodes/task.py +156 -0
  28. extensions/pipeline_executor/README.md +0 -0
  29. extensions/pipeline_executor/__init__.py +871 -0
  30. extensions/pipeline_executor/argo.py +1266 -0
  31. extensions/pipeline_executor/emulate.py +119 -0
  32. extensions/pipeline_executor/local.py +226 -0
  33. extensions/pipeline_executor/local_container.py +369 -0
  34. extensions/pipeline_executor/mocked.py +159 -0
  35. extensions/pipeline_executor/pyproject.toml +16 -0
  36. extensions/run_log_store/README.md +0 -0
  37. extensions/run_log_store/__init__.py +0 -0
  38. extensions/run_log_store/any_path.py +100 -0
  39. extensions/run_log_store/chunked_fs.py +122 -0
  40. extensions/run_log_store/chunked_minio.py +141 -0
  41. extensions/run_log_store/file_system.py +91 -0
  42. extensions/run_log_store/generic_chunked.py +549 -0
  43. extensions/run_log_store/minio.py +114 -0
  44. extensions/run_log_store/pyproject.toml +15 -0
  45. extensions/secrets/README.md +0 -0
  46. extensions/secrets/dotenv.py +62 -0
  47. extensions/secrets/pyproject.toml +15 -0
  48. runnable/__init__.py +108 -0
  49. runnable/catalog.py +141 -0
  50. runnable/cli.py +484 -0
  51. runnable/context.py +730 -0
  52. runnable/datastore.py +1058 -0
  53. runnable/defaults.py +159 -0
  54. runnable/entrypoints.py +390 -0
  55. runnable/exceptions.py +137 -0
  56. runnable/executor.py +561 -0
  57. runnable/gantt.py +1646 -0
  58. runnable/graph.py +501 -0
  59. runnable/names.py +546 -0
  60. runnable/nodes.py +593 -0
  61. runnable/parameters.py +217 -0
  62. runnable/pickler.py +96 -0
  63. runnable/sdk.py +1277 -0
  64. runnable/secrets.py +92 -0
  65. runnable/tasks.py +1268 -0
  66. runnable/telemetry.py +142 -0
  67. runnable/utils.py +423 -0
  68. runnable-0.50.0.dist-info/METADATA +189 -0
  69. runnable-0.50.0.dist-info/RECORD +72 -0
  70. runnable-0.50.0.dist-info/WHEEL +4 -0
  71. runnable-0.50.0.dist-info/entry_points.txt +53 -0
  72. runnable-0.50.0.dist-info/licenses/LICENSE +201 -0
runnable/datastore.py ADDED
@@ -0,0 +1,1058 @@
1
from __future__ import annotations

import logging
import os
from abc import ABC, abstractmethod
from datetime import datetime
from typing import (
    Annotated,
    Any,
    Dict,
    List,
    Literal,
    Optional,
    OrderedDict,
    Tuple,
    Union,
)

from pydantic import BaseModel, Field, computed_field

# NOTE(review): `context` is referenced throughout this module
# (ObjectParameter, RunLog.get_summary, BaseRunLogStore._context) but was
# never imported, so every `context.get_run_context()` call would raise
# NameError. Verify this import does not create a cycle — runnable.context
# may itself import this module.
from runnable import context, defaults, exceptions
22
+
23
# Module-level logger, named via the project-wide constant.
logger = logging.getLogger(defaults.LOGGER_NAME)


# Any JSON-serialisable value: the payload type for Json/Metric parameters.
JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
27
+
28
+
29
class DataCatalog(BaseModel, extra="allow"):
    """
    The captured attributes of a single cataloged dataset.

    Identity for hashing and equality is the ``name`` alone, so two entries
    with the same name collapse to one inside a ``set``.
    """

    name: str  # the name of the dataset
    data_hash: str = ""  # sha1 hash of the file
    catalog_relative_path: str = ""  # file path relative to the catalog location
    catalog_handler_location: str = ""  # where the catalog itself lives
    stage: str = ""  # the stage at which it was recorded: get, put etc.

    # Hash/eq on name so set operations de-duplicate DataCatalog objects.
    def __hash__(self):
        """Hash on the dataset name so set de-duplication works."""
        return hash(self.name)

    def __eq__(self, other):
        """Two DataCatalog entries are equal when their names match."""
        return isinstance(other, DataCatalog) and other.name == self.name
54
+
55
+
56
class JsonParameter(BaseModel):
    """A parameter whose value is a plain JSON-serialisable payload."""

    kind: Literal["json"]
    value: JSONType

    @computed_field  # type: ignore
    @property
    def description(self) -> JSONType:
        """A preview of the value; strings longer than 10 chars are truncated."""
        if isinstance(self.value, str) and len(self.value) > 10:
            return f"{self.value[:10]}..."
        return self.value

    def get_value(self) -> JSONType:
        """Return the raw value."""
        return self.value
72
+
73
+
74
class MetricParameter(BaseModel):
    """A parameter recorded as a user-defined metric."""

    kind: Literal["metric"]
    value: JSONType

    @computed_field  # type: ignore
    @property
    def description(self) -> JSONType:
        """A preview of the value; strings longer than 10 chars are truncated."""
        if isinstance(self.value, str) and len(self.value) > 10:
            return f"{self.value[:10]}..."
        return self.value

    def get_value(self) -> JSONType:
        """Return the raw value."""
        return self.value
90
+
91
+
92
class ObjectParameter(BaseModel):
    """A parameter whose value is a Python object, tracked by name.

    Depending on ``object_serialisation`` on the active run context, the
    object either lives in memory (``return_objects``) or is pickled and
    round-tripped through the catalog.
    """

    kind: Literal["object"]
    value: str  # the name under which the pickled object is stored

    @computed_field  # type: ignore
    @property
    def description(self) -> str:
        """Describe where the object lives: catalog (pickled) or memory."""
        active = context.get_run_context()
        if active and active.object_serialisation:
            return f"Pickled object stored in catalog as: {self.value}"
        return f"Object stored in memory as: {self.value}"

    @property
    def file_name(self) -> str:
        """Pickle file name: the object name plus the pickler's extension."""
        active = context.get_run_context()
        if active is None:
            raise RuntimeError("No run context available")
        return f"{self.value}{active.pickler.extension}"

    def get_value(self) -> Any:
        """Materialise the object, either from memory or from the catalog."""
        active = context.get_run_context()
        if active is None:
            raise RuntimeError("No run context available")

        # No serialisation: the live object is kept on the context itself.
        if not active.object_serialisation:
            return active.return_objects[self.value]

        # Serialised: fetch the pickle from the catalog and load it.
        active.catalog.get(name=self.file_name)
        loaded = active.pickler.load(path=self.file_name)
        os.remove(self.file_name)  # remove the local copy after loading
        return loaded

    def put_object(self, data: Any) -> None:
        """Store the object, either in memory or through the catalog."""
        active = context.get_run_context()
        if active is None:
            raise RuntimeError("No run context available")

        if not active.object_serialisation:
            active.return_objects[self.value] = data
            return

        # Serialised: pickle to a local file, push it to the catalog.
        active.pickler.dump(data=data, path=self.file_name)
        active.catalog.put(name=self.file_name)
        os.remove(self.file_name)  # remove the local copy after uploading
143
+
144
+
145
# Discriminated union: pydantic selects the concrete parameter model from
# the `kind` field ("json" | "object" | "metric") during validation.
Parameter = Annotated[
    Union[JsonParameter, ObjectParameter, MetricParameter], Field(discriminator="kind")
]
148
+
149
+
150
class StepAttempt(BaseModel):
    """
    The captured attributes of one attempt of a step.
    """

    attempt_number: int = 1
    start_time: str = ""
    end_time: str = ""
    status: str = "FAIL"
    message: str = ""
    input_parameters: Dict[str, Parameter] = Field(default_factory=dict)
    output_parameters: Dict[str, Parameter] = Field(default_factory=dict)
    user_defined_metrics: Dict[str, Parameter] = Field(default_factory=dict)
    retry_indicator: str = Field(
        default="",
        description="Indicator for retry executions to distinguish attempt logs",
    )
    code_identities: List["CodeIdentity"] = Field(
        default_factory=list, description="Code identities for this specific attempt"
    )

    @property
    def duration(self):
        """Wall-clock duration of the attempt as ``str(end - start)``.

        NOTE(review): assumes both timestamps are populated ISO strings;
        ``fromisoformat`` raises ValueError on the empty-string defaults —
        confirm callers only read this after both times are set.
        """
        begun = datetime.fromisoformat(self.start_time)
        ended = datetime.fromisoformat(self.end_time)
        return str(ended - begun)
177
+
178
+
179
class CodeIdentity(BaseModel, extra="allow"):
    """
    Captured provenance of the code behind a step (e.g. git sha, docker image).
    """

    code_identifier: Optional[str] = ""  # GIT sha code or docker image id
    code_identifier_type: Optional[str] = ""  # git or docker
    # For git, records whether the working tree was clean.
    code_identifier_dependable: Optional[bool] = False
    # The git remote url or docker repository url.
    code_identifier_url: Optional[str] = ""
    code_identifier_message: Optional[str] = ""  # any optional message
193
+
194
+
195
class StepLog(BaseModel):
    """
    The captured data of a single step of a run.
    """

    name: str
    internal_name: str  # dot-notation path of the step within the dag
    status: str = "FAIL"  # pessimistic default
    step_type: str = "task"
    message: str = ""
    mock: bool = False
    attempts: List[StepAttempt] = Field(default_factory=list)
    branches: Dict[str, BranchLog] = Field(default_factory=dict)
    data_catalog: List[DataCatalog] = Field(default_factory=list)

    def get_summary(self) -> Dict[str, Any]:
        """Build a loggable summary of this step."""
        report: Dict[str, Any] = {}

        report["Name"] = self.internal_name
        report["Input catalog content"] = [
            entry.name for entry in self.data_catalog if entry.stage == "get"
        ]
        report["Available parameters"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.input_parameters.items()
        ]
        report["Output catalog content"] = [
            entry.name for entry in self.data_catalog if entry.stage == "put"
        ]
        report["Output parameters"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.output_parameters.items()
        ]
        report["Metrics"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.user_defined_metrics.items()
        ]
        # One line per code identity, flagged when it is not dependable.
        report["Code identities"] = [
            f"{ci.code_identifier_type}:{ci.code_identifier}"
            + ("" if ci.code_identifier_dependable else " but is not dependable")
            for attempt in self.attempts
            for ci in attempt.code_identities
        ]
        report["status"] = self.status

        return report

    def get_data_catalogs_by_stage(self, stage="put") -> List[DataCatalog]:
        """
        Return this step's catalog entries for ``stage``, plus any from
        nested branches.

        Args:
            stage (str, optional): The stage at which the data was cataloged. Defaults to 'put'.

        Raises:
            Exception: If the stage was not in get or put.

        Returns:
            List[DataCatalog]: The list of data catalogs as per the stage.
        """
        if stage not in ["get", "put"]:
            raise Exception("Stage should be in get or put")

        from_branches: List[DataCatalog] = []
        for branch in self.branches.values():
            from_branches.extend(branch.get_data_catalogs_by_stage(stage=stage))

        own = [entry for entry in self.data_catalog if entry.stage == stage]
        return own + from_branches

    def add_data_catalogs(self, data_catalogs: List[DataCatalog]):
        """
        Append the given catalog items to this step's record.

        Args:
            data_catalogs ([DataCatalog]): A list of data catalog items
        """
        if not self.data_catalog:
            self.data_catalog = []
        self.data_catalog.extend(data_catalogs)
290
+
291
+
292
class BranchLog(BaseModel):
    """
    Captured data about one branch of a composite node.
    """

    internal_name: str
    status: str = "FAIL"
    steps: OrderedDict[str, StepLog] = Field(default_factory=OrderedDict)
    parameters: Dict[str, Parameter] = Field(default_factory=dict)

    def get_data_catalogs_by_stage(self, stage="put") -> List[DataCatalog]:
        """
        Collect catalog entries for ``stage`` from every step in the branch.

        Args:
            stage (str, optional): The stage at which the data was cataloged. Defaults to 'put'.

        Raises:
            Exception: If the stage was not in get or put.

        Returns:
            List[DataCatalog]: The list of data catalogs as per the stage.
        """
        if stage not in ["get", "put"]:
            raise Exception("Stage should be in get or put")

        collected: List[DataCatalog] = []
        for step in self.steps.values():
            collected.extend(step.get_data_catalogs_by_stage(stage=stage))
        return collected
326
+
327
+
328
# StepLog.branches refers to BranchLog, which is defined after StepLog;
# rebuilding resolves that forward reference now that both models exist.
StepLog.model_rebuild()
330
+
331
+
332
class JobLog(BaseModel):
    """
    The captured data of a job execution.
    This should be treated as a step log.
    """

    status: str = defaults.FAIL
    message: str = ""
    mock: bool = False
    code_identities: List[CodeIdentity] = Field(default_factory=list)
    attempts: List[StepAttempt] = Field(default_factory=list)
    data_catalog: List[DataCatalog] = Field(default_factory=list)

    def add_data_catalogs(self, data_catalogs: List[DataCatalog]):
        """
        Append the given catalog items to the job's record.

        Args:
            data_catalogs ([DataCatalog]): A list of data catalog items
        """
        if not self.data_catalog:
            self.data_catalog = []
        self.data_catalog.extend(data_catalogs)

    def get_summary(self) -> Dict[str, Any]:
        """Build a loggable summary of the job."""
        report: Dict[str, Any] = {}

        report["Available parameters"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.input_parameters.items()
        ]
        report["Output catalog content"] = [
            entry.name for entry in self.data_catalog if entry.stage == "put"
        ]
        report["Output parameters"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.output_parameters.items()
        ]
        report["Metrics"] = [
            (key, param.description)
            for attempt in self.attempts
            for key, param in attempt.user_defined_metrics.items()
        ]
        # One line per code identity, flagged when it is not dependable.
        report["Code identities"] = [
            f"{ci.code_identifier_type}:{ci.code_identifier}"
            + ("" if ci.code_identifier_dependable else " but is not dependable")
            for ci in self.code_identities
        ]
        report["status"] = self.status

        return report
397
+
398
+
399
class RunLog(BaseModel):
    """
    The data captured as part of Run Log.

    Holds the overall run status, the ordered top-level steps, an optional
    job log, the final parameters and the run configuration.
    """

    run_id: str
    dag_hash: Optional[str] = None  # hash of the dag definition, if any
    tag: Optional[str] = ""
    status: str = defaults.FAIL
    steps: OrderedDict[str, StepLog] = Field(default_factory=OrderedDict)
    job: Optional[JobLog] = None
    parameters: Dict[str, Parameter] = Field(default_factory=dict)
    run_config: Dict[str, Any] = Field(default_factory=dict)

    def get_summary(self) -> Dict[str, Any]:
        """Summarize the run using the active run context's services.

        Raises:
            RuntimeError: If no run context is active.
        """
        summary: Dict[str, Any] = {}

        current_context = context.get_run_context()
        if current_context is None:
            raise RuntimeError("No run context available")

        summary["Unique execution id"] = self.run_id
        summary["status"] = self.status

        summary["Catalog Location"] = current_context.catalog.get_summary()
        summary["Full Run log present at: "] = (
            current_context.run_log_store.get_summary()
        )

        # Re-fetch the full run log so the reported parameters are final.
        run_log = current_context.run_log_store.get_run_log_by_id(
            run_id=current_context.run_id, full=True
        )

        summary["Final Parameters"] = {
            p: v.description for p, v in run_log.parameters.items()
        }
        summary["Collected metrics"] = {
            p: v.description
            for p, v in run_log.parameters.items()
            if v.kind == "metric"
        }

        return summary

    def get_data_catalogs_by_stage(self, stage: str = "put") -> List[DataCatalog]:
        """
        Return all the cataloged data by the stage at which they were cataloged.

        Duplicates (same dataset name) are collapsed via set() — see
        DataCatalog.__hash__/__eq__.

        Raises:
            Exception: If stage was not either put or get.

        Args:
            stage (str, optional): 'get' or 'put'. Defaults to 'put'.
        """
        if stage not in ["get", "put"]:
            raise Exception("Only get or put are allowed in stage")

        data_catalogs = []
        for _, step in self.steps.items():
            data_catalogs.extend(step.get_data_catalogs_by_stage(stage=stage))

        return list(set(data_catalogs))

    def search_branch_by_internal_name(
        self, i_name: str
    ) -> Tuple[Union[BranchLog, RunLog], Union[StepLog, None]]:
        """
        Given a branch internal name, search for it in the run log.

        If the branch internal name is empty, the run log itself is the
        "branch" and is returned with no owning step.

        Args:
            i_name (str): Dot-notation internal name of the branch.

        Raises:
            exceptions.BranchLogNotFoundError: If no branch by that name exists.

        Returns:
            Tuple[Union[BranchLog, RunLog], Union[StepLog, None]]: The branch
            and the step that owns it (None for the run log itself).
        """
        # internal name is null for base dag
        if not i_name:
            return self, None

        dot_path = i_name.split(".")

        # Any internal name of a branch, when split against ".",
        # alternates step.branch.step.branch...
        # Odd positions are branches, even positions are steps.
        current_steps = self.steps
        current_step = None
        current_branch = None

        try:
            for i in range(len(dot_path)):
                if i % 2:
                    # Odd position: descend into the branch held by the
                    # previously found step.
                    current_branch = current_step.branches[".".join(dot_path[: i + 1])]  # type: ignore
                    current_steps = current_branch.steps
                    logger.debug(f"Finding branch {i_name} in branch: {current_branch}")
                else:
                    # Even position: find the step (within the current
                    # branch's steps) that holds the next branch.
                    current_step = current_steps[".".join(dot_path[: i + 1])]
                    logger.debug(f"Finding branch for {i_name} in step: {current_step}")

            logger.debug(
                f"current branch : {current_branch}, current step {current_step}"
            )
            if current_branch and current_step:
                return current_branch, current_step
        except KeyError as _e:
            # A missing key anywhere along the dot path means no such branch.
            raise exceptions.BranchLogNotFoundError(self.run_id, i_name) from _e

        # If we are here, we have not found the branch
        raise exceptions.BranchLogNotFoundError(self.run_id, i_name)

    def search_step_by_internal_name(
        self, i_name: str
    ) -> Tuple[StepLog, Union[BranchLog, None]]:
        """
        Given a step's internal name, search for the step in the run log.

        A single-segment name (no ".") is a top-level step of the run log
        and is returned with no owning branch.

        Args:
            i_name (str): Dot-notation internal name of the step.

        Raises:
            exceptions.StepLogNotFoundError: If no step by that name exists.

        Returns:
            Tuple[StepLog, Union[BranchLog, None]]: The step and the branch
            that owns it (None for top-level steps).
        """
        dot_path = i_name.split(".")
        if len(dot_path) == 1:
            try:
                return self.steps[i_name], None
            except KeyError as e:
                raise exceptions.StepLogNotFoundError(self.run_id, i_name) from e

        # Same odd/even traversal as search_branch_by_internal_name:
        # even positions are steps, odd positions are branches.
        current_steps = self.steps
        current_step = None
        current_branch = None
        try:
            for i in range(len(dot_path)):
                if i % 2:
                    # Odd position: descend into the branch of the last step.
                    current_branch = current_step.branches[".".join(dot_path[: i + 1])]  # type: ignore
                    current_steps = current_branch.steps
                    logger.debug(
                        f"Finding step log for {i_name} in branch: {current_branch}"
                    )
                else:
                    # Even position: find the step within the current steps.
                    current_step = current_steps[".".join(dot_path[: i + 1])]
                    logger.debug(
                        f"Finding step log for {i_name} in step: {current_step}"
                    )

            logger.debug(
                f"current branch : {current_branch}, current step {current_step}"
            )
            if current_branch and current_step:
                return current_step, current_branch
        except KeyError as _e:
            # A missing key anywhere along the dot path means no such step.
            raise exceptions.StepLogNotFoundError(self.run_id, i_name) from _e

        # If we are here, we have not found the step
        raise exceptions.StepLogNotFoundError(self.run_id, i_name)
570
+
571
+
572
class BaseRunLogStore(ABC, BaseModel):
    """
    The base class of a Run Log Store with many common methods implemented.

    Concrete stores implement create/get/put of whole run logs; the helper
    methods defined here build finer-grained operations on top of those.
    """

    service_name: str = ""  # name of the concrete implementation
    service_type: str = "run_log_store"
    supports_parallel_writes: bool = False  # whether concurrent writers are safe

    @abstractmethod
    def get_summary(self) -> Dict[str, Any]: ...

    @property
    def _context(self):
        # The active run context; raises instead of returning None so
        # callers can rely on always getting a usable context.
        current_context = context.get_run_context()
        if current_context is None:
            raise RuntimeError("No run context available")
        return current_context
590
+
591
    @abstractmethod
    def create_run_log(
        self,
        run_id: str,
        dag_hash: str = "",
        use_cached: bool = False,
        tag: str = "",
        original_run_id: str = "",
        status: str = defaults.CREATED,
    ):
        """
        Creates a Run Log object by using the config

        Logically the method should do the following:
        * Creates a Run log
        * Adds it to the db
        * Return the log

        Args:
            run_id (str): Unique id of the run
            dag_hash (str): Hash of the dag definition, if any
            use_cached (bool): Whether this run reuses a previous run
            tag (str): Optional tag for the run
            original_run_id (str): The run being reused, if any
            status (str): Initial status; defaults to CREATED

        Raises:
            NotImplementedError: This is a base class and therefore has no default implementation
        """

        raise NotImplementedError
613
+
614
    @abstractmethod
    def get_run_log_by_id(self, run_id: str, full: bool = False) -> RunLog:
        """
        Retrieves a Run log from the database using the config and the run_id

        Args:
            run_id (str): The run_id of the run
            full (bool): return the full run log store or only the RunLog object

        Returns:
            RunLog: The RunLog object identified by the run_id

        Logically the method should:
        * Return the run_log defined by id from the data store defined by the config

        Raises:
            NotImplementedError: This is a base class and therefore has no default implementation
            RunLogNotFoundError: If the run log for run_id is not found in the datastore
        """

        raise NotImplementedError
635
+
636
    @abstractmethod
    def put_run_log(self, run_log: RunLog):
        """
        Puts the Run Log in the database as defined by the config

        Args:
            run_log (RunLog): The Run log of the run

        Logically the method should:
        * Write the run_log into the database, replacing any existing entry
          for the same run_id

        Raises:
            NotImplementedError: This is a base class and therefore has no default implementation
        """
        raise NotImplementedError
651
+
652
+ def update_run_log_status(self, run_id: str, status: str):
653
+ """
654
+ Updates the status of the Run Log defined by the run_id
655
+
656
+ Args:
657
+ run_id (str): The run_id of the run
658
+ status (str): The new status of the run
659
+ """
660
+ logger.info(f"Updating status of run_id {run_id} to {status}")
661
+ run_log = self.get_run_log_by_id(run_id, full=False)
662
+ run_log.status = status
663
+ self.put_run_log(run_log)
664
+
665
+ def get_parameters(
666
+ self, run_id: str, internal_branch_name: str = ""
667
+ ) -> Dict[str, Parameter]:
668
+ """
669
+ Get the parameters from the Run log defined by the run_id.
670
+
671
+ If internal_branch_name is provided, returns parameters scoped to that branch.
672
+ Otherwise returns root-level parameters.
673
+
674
+ Args:
675
+ run_id (str): The run_id of the run
676
+ internal_branch_name (str): Optional branch name for scoped parameters
677
+
678
+ Returns:
679
+ dict: A dictionary of the run_log parameters
680
+ Raises:
681
+ RunLogNotFoundError: If the run log for run_id is not found in the datastore
682
+ """
683
+ run_log = self.get_run_log_by_id(run_id=run_id)
684
+
685
+ if not internal_branch_name:
686
+ return run_log.parameters
687
+
688
+ branch, _ = run_log.search_branch_by_internal_name(internal_branch_name)
689
+ assert isinstance(branch, BranchLog)
690
+ return branch.parameters
691
+
692
+ def set_parameters(
693
+ self,
694
+ run_id: str,
695
+ parameters: Dict[str, Parameter],
696
+ internal_branch_name: str = "",
697
+ ):
698
+ """
699
+ Update the parameters of the Run log with the new parameters
700
+
701
+ This method would over-write the parameters, if the parameter exists in the run log already
702
+
703
+ If internal_branch_name is provided, sets parameters on that branch.
704
+ Otherwise sets root-level parameters.
705
+
706
+ The method should:
707
+ * Call get_run_log_by_id(run_id) to retrieve the run_log
708
+ * Update the parameters of the run_log
709
+ * Call put_run_log(run_log) to put the run_log in the datastore
710
+
711
+ Args:
712
+ run_id (str): The run_id of the run
713
+ parameters (dict): The parameters to update in the run log
714
+ internal_branch_name (str): Optional branch name for scoped parameters
715
+ Raises:
716
+ RunLogNotFoundError: If the run log for run_id is not found in the datastore
717
+ """
718
+ run_log = self.get_run_log_by_id(run_id=run_id)
719
+
720
+ if not internal_branch_name:
721
+ run_log.parameters.update(parameters)
722
+ self.put_run_log(run_log=run_log)
723
+ else:
724
+ branch, _ = run_log.search_branch_by_internal_name(internal_branch_name)
725
+ assert isinstance(branch, BranchLog)
726
+ branch.parameters.update(parameters)
727
+ # Update the branch back in the run log for file-based stores
728
+ self.add_branch_log(branch, run_id)
729
+
730
+ def get_run_config(self, run_id: str) -> dict:
731
+ """
732
+ Given a run_id, return the run_config used to perform the run.
733
+
734
+ Args:
735
+ run_id (str): The run_id of the run
736
+
737
+ Returns:
738
+ dict: The run config used for the run
739
+ """
740
+
741
+ run_log = self.get_run_log_by_id(run_id=run_id)
742
+ return run_log.run_config
743
+
744
+ def set_run_config(self, run_id: str, run_config: dict):
745
+ """Set the run config used to run the run_id
746
+
747
+ Args:
748
+ run_id (str): The run_id of the run
749
+ run_config (dict): The run_config of the run
750
+ """
751
+
752
+ run_log = self.get_run_log_by_id(run_id=run_id)
753
+ run_log.run_config.update(run_config)
754
+ self.put_run_log(run_log=run_log)
755
+
756
+ def create_step_log(self, name: str, internal_name: str):
757
+ """
758
+ Create a step log by the name and internal name
759
+
760
+ The method does not update the Run Log with the step log at this point in time.
761
+ This method is just an interface for external modules to create a step log
762
+
763
+
764
+ Args:
765
+ name (str): The friendly name of the step log
766
+ internal_name (str): The internal naming of the step log. The internal naming is a dot path convention
767
+
768
+ Returns:
769
+ StepLog: A uncommitted step log object
770
+ """
771
+ logger.info(f"{self.service_name} Creating a Step Log: {internal_name}")
772
+ return StepLog(name=name, internal_name=internal_name, status=defaults.CREATED)
773
+
774
+ def get_step_log(self, internal_name: str, run_id: str) -> StepLog:
775
+ """
776
+ Get a step log from the datastore for run_id and the internal naming of the step log
777
+
778
+ The internal naming of the step log is a dot path convention.
779
+
780
+ The method should:
781
+ * Call get_run_log_by_id(run_id) to retrieve the run_log
782
+ * Identify the step location by decoding the internal naming
783
+ * Return the step log
784
+
785
+ Args:
786
+ internal_name (str): The internal name of the step log
787
+ run_id (str): The run_id of the run
788
+
789
+ Returns:
790
+ StepLog: The step log object for the step defined by the internal naming and run_id
791
+
792
+ Raises:
793
+ RunLogNotFoundError: If the run log for run_id is not found in the datastore
794
+ StepLogNotFoundError: If the step log for internal_name is not found in the datastore for run_id
795
+ """
796
+ logger.info(
797
+ f"{self.service_name} Getting the step log: {internal_name} of {run_id}"
798
+ )
799
+ run_log = self.get_run_log_by_id(run_id=run_id)
800
+ step_log, _ = run_log.search_step_by_internal_name(internal_name)
801
+ return step_log
802
+
803
+ def add_step_log(self, step_log: StepLog, run_id: str):
804
+ """
805
+ Add the step log in the run log as identified by the run_id in the datastore
806
+
807
+ The method should:
808
+ * Call get_run_log_by_id(run_id) to retrieve the run_log
809
+ * Identify the branch to add the step by decoding the step_logs internal name
810
+ * Add the step log to the identified branch log
811
+ * Call put_run_log(run_log) to put the run_log in the datastore
812
+
813
+ Args:
814
+ step_log (StepLog): The Step log to add to the database
815
+ run_id (str): The run id of the run
816
+
817
+ Raises:
818
+ RunLogNotFoundError: If the run log for run_id is not found in the datastore
819
+ BranchLogNotFoundError: If the branch of the step log for internal_name is not found in the datastore
820
+ for run_id
821
+ """
822
+ logger.info(f"{self.service_name} Adding the step log to DB: {step_log.name}")
823
+ run_log = self.get_run_log_by_id(run_id=run_id)
824
+
825
+ branch_to_add = ".".join(step_log.internal_name.split(".")[:-1])
826
+ branch, _ = run_log.search_branch_by_internal_name(branch_to_add)
827
+
828
+ if branch is None:
829
+ branch = run_log
830
+ branch.steps[step_log.internal_name] = step_log
831
+ self.put_run_log(run_log=run_log)
832
+
833
+ def create_branch_log(
834
+ self,
835
+ internal_branch_name: str,
836
+ parameters: Optional[Dict[str, Parameter]] = None,
837
+ ) -> BranchLog:
838
+ """
839
+ Creates a uncommitted branch log object by the internal name given
840
+
841
+ Args:
842
+ internal_branch_name (str): Creates a branch log by name internal_branch_name
843
+ parameters (dict, optional): Initial parameters for the branch
844
+
845
+ Returns:
846
+ BranchLog: Uncommitted and initialized with defaults BranchLog object
847
+ """
848
+ # Create a new BranchLog
849
+ logger.info(
850
+ f"{self.service_name} Creating a Branch Log : {internal_branch_name}"
851
+ )
852
+ branch_log = BranchLog(
853
+ internal_name=internal_branch_name, status=defaults.CREATED
854
+ )
855
+ if parameters:
856
+ branch_log.parameters.update(parameters)
857
+ return branch_log
858
+
859
+ def get_branch_log(
860
+ self, internal_branch_name: str, run_id: str
861
+ ) -> Union[BranchLog, RunLog]:
862
+ """
863
+ Returns the branch log by the internal branch name for the run id
864
+
865
+ If the internal branch name is none, returns the run log
866
+
867
+ Args:
868
+ internal_branch_name (str): The internal branch name to retrieve.
869
+ run_id (str): The run id of interest
870
+
871
+ Returns:
872
+ BranchLog: The branch log or the run log as requested.
873
+ """
874
+ run_log = self.get_run_log_by_id(run_id=run_id)
875
+ if not internal_branch_name:
876
+ return run_log
877
+ branch, _ = run_log.search_branch_by_internal_name(internal_branch_name)
878
+ return branch
879
+
880
def add_branch_log(self, branch_log: Union[BranchLog, RunLog], run_id: str):
    """
    Write a branch log (or a run log) into the stored run log of run_id.

    Steps performed:
        * Persist directly when handed a RunLog (no internal name).
        * Otherwise, locate the step that owns the branch and attach it.
        * Write the updated run log back to the store.

    Args:
        branch_log (Union[BranchLog, RunLog]): The branch/run log to add.
        run_id (str): The run id to which the branch/run log is added.
    """
    branch_name = (
        branch_log.internal_name if isinstance(branch_log, BranchLog) else None
    )

    if not branch_name:
        # A RunLog has no internal branch name: it IS the database record.
        self.put_run_log(branch_log)  # type: ignore # We are dealing with base dag here
        return

    run_log = self.get_run_log_by_id(run_id=run_id)

    # The owning step name is everything before the last dot of the branch name.
    parent_step_name = ".".join(branch_name.split(".")[:-1])
    parent_step, _ = run_log.search_step_by_internal_name(parent_step_name)

    parent_step.branches[branch_name] = branch_log  # type: ignore
    self.put_run_log(run_log)
911
+
912
def create_code_identity(self) -> CodeIdentity:
    """
    Build and return a fresh, uncommitted CodeIdentity.

    Returns:
        CodeIdentity: A CodeIdentity populated with default values only.
    """
    logger.info(f"{self.service_name} Creating Code identity")
    return CodeIdentity()
921
+
922
def create_data_catalog(self, name: str) -> DataCatalog:
    """
    Build an uncommitted DataCatalog entry for the given name.

    Args:
        name (str): The name of the data catalog item to put.

    Returns:
        DataCatalog: The DataCatalog object (not yet persisted).
    """
    logger.info(f"{self.service_name} Creating Data Catalog for {name}")
    return DataCatalog(name=name)
934
+
935
def create_job_log(self) -> JobLog:
    """
    Build a JobLog in the CREATED state.

    Refer to BaseRunLogStore.create_job_log
    """
    logger.info(f"{self.service_name} Creating a Job Log and adding it to DB")
    return JobLog(status=defaults.CREATED)
943
+
944
def get_job_log(self, run_id: str) -> JobLog:
    """
    Return the job log attached to the run identified by run_id.

    Args:
        run_id (str): The run id of interest.

    Returns:
        JobLog: The job log stored on the run log.

    Raises:
        exceptions.JobLogNotFoundError: If the run log has no job log.
    """
    logger.info(f"{self.service_name} Getting the run log from DB for {run_id}")
    run_log = self.get_run_log_by_id(run_id)

    # Bug fix: the previous `assert run_log.job` is stripped when Python
    # runs with -O, which would silently return None instead of raising.
    # Validate explicitly so the contract holds in optimized mode too.
    if not run_log.job:
        raise exceptions.JobLogNotFoundError(run_id)

    return run_log.job
959
+
960
def add_job_log(self, run_id: str, job_log: JobLog):
    """
    Attach the given job log to the run log and persist it.

    The run log's overall status is mirrored from the job log's status.

    Args:
        run_id (str): The run_id of the run.
        job_log (JobLog): The job log to add to the run log.
    """
    logger.info(f"{self.service_name} Adding the job log to DB for: {run_id}")

    run_log = self.get_run_log_by_id(run_id=run_id)
    run_log.job = job_log
    run_log.status = job_log.status

    self.put_run_log(run_log=run_log)
973
+
974
+
975
class BufferRunLogstore(BaseRunLogStore):
    """
    An ephemeral, in-memory run log store.

    The single run log (and job log) live on the instance itself, so no
    results survive the process: nothing is persisted anywhere.

    When to use:
        When testing some part of the pipeline.

    Do not use:
        When you need to compare between runs or in production set up

    This Run Log Store is concurrent write safe as it is in memory

    Example config:
    run_log:
      type: buffered

    """

    service_name: str = "buffered"

    # The in-memory "database": at most one run log / job log at a time.
    run_log: Optional[RunLog] = Field(default=None, exclude=True)
    job_log: Optional[JobLog] = Field(default=None, exclude=True)

    def get_summary(self) -> Dict[str, Any]:
        """Describe this store for display purposes."""
        return {"Type": self.service_name, "Location": "Not persisted"}

    def create_run_log(
        self,
        run_id: str,
        dag_hash: str = "",
        use_cached: bool = False,
        tag: str = "",
        original_run_id: str = "",
        status: str = defaults.CREATED,
    ) -> RunLog:
        """
        Create a RunLog, hold it in memory and return it.

        Refer to BaseRunLogStore.create_run_log
        """
        logger.info(f"{self.service_name} Creating a Run Log and adding it to DB")
        self.run_log = RunLog(
            run_id=run_id,
            dag_hash=dag_hash,
            tag=tag,
            status=status,
        )
        return self.run_log

    def get_run_log_by_id(self, run_id: str, full: bool = False):
        """
        Return the buffered run log.

        Raises RunLogNotFoundError when nothing has been created yet.
        """
        logger.info(f"{self.service_name} Getting the run log from DB for {run_id}")
        if not self.run_log:
            raise exceptions.RunLogNotFoundError(run_id)
        return self.run_log

    def put_run_log(self, run_log: RunLog):
        """Overwrite the buffered run log with the given one."""
        logger.info(
            f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
        )
        self.run_log = run_log
1056
+
1057
+
1058
+ import runnable.context as context # noqa: F401, E402