ddeutil-workflow 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1186 +0,0 @@
1
- # ------------------------------------------------------------------------------
2
- # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
3
- # Licensed under the MIT License. See LICENSE in the project root for
4
- # license information.
5
- # ------------------------------------------------------------------------------
6
- from __future__ import annotations
7
-
8
- import copy
9
- import os
10
- import time
11
- from concurrent.futures import (
12
- FIRST_EXCEPTION,
13
- Future,
14
- ThreadPoolExecutor,
15
- as_completed,
16
- wait,
17
- )
18
- from datetime import datetime, timedelta
19
- from heapq import heappush
20
- from pickle import PickleError
21
- from queue import Queue
22
- from textwrap import dedent
23
- from threading import Event
24
- from typing import Optional
25
- from zoneinfo import ZoneInfo
26
-
27
- from pydantic import BaseModel, Field
28
- from pydantic.functional_validators import field_validator, model_validator
29
- from typing_extensions import Self
30
-
31
- from .__types import (
32
- DictData,
33
- DictStr,
34
- Matrix,
35
- MatrixExclude,
36
- MatrixInclude,
37
- TupleStr,
38
- )
39
- from .cron import CronRunner
40
- from .exceptions import (
41
- JobException,
42
- PipelineException,
43
- StageException,
44
- UtilException,
45
- )
46
- from .log import FileLog, Log, get_logger
47
- from .on import On
48
- from .stage import Stage
49
- from .utils import (
50
- Loader,
51
- Param,
52
- Result,
53
- cross_product,
54
- dash2underscore,
55
- delay,
56
- filter_func,
57
- gen_id,
58
- get_diff_sec,
59
- has_template,
60
- param2template,
61
- )
62
-
63
- logger = get_logger("ddeutil.workflow")
64
-
65
-
66
- __all__: TupleStr = (
67
- "Strategy",
68
- "Job",
69
- "Pipeline",
70
- )
71
-
72
-
73
- class Strategy(BaseModel):
74
- """Strategy Model that will combine a matrix together for running the
75
- special job.
76
-
77
- Data Validate:
78
- >>> strategy = {
79
- ... 'max-parallel': 1,
80
- ... 'fail-fast': False,
81
- ... 'matrix': {
82
- ... 'first': [1, 2, 3],
83
- ... 'second': ['foo', 'bar'],
84
- ... },
85
- ... 'include': [{'first': 4, 'second': 'foo'}],
86
- ... 'exclude': [{'first': 1, 'second': 'bar'}],
87
- ... }
88
- """
89
-
90
- fail_fast: bool = Field(
91
- default=False,
92
- serialization_alias="fail-fast",
93
- )
94
- max_parallel: int = Field(
95
- default=1,
96
- gt=0,
97
- description=(
98
- "The maximum number of executor thread pool that want to run "
99
- "parallel"
100
- ),
101
- serialization_alias="max-parallel",
102
- )
103
- matrix: Matrix = Field(
104
- default_factory=dict,
105
- description=(
106
- "A matrix values that want to cross product to possible strategies."
107
- ),
108
- )
109
- include: MatrixInclude = Field(
110
- default_factory=list,
111
- description="A list of additional matrix that want to adds-in.",
112
- )
113
- exclude: MatrixExclude = Field(
114
- default_factory=list,
115
- description="A list of exclude matrix that want to filter-out.",
116
- )
117
-
118
- @model_validator(mode="before")
119
- def __prepare_keys(cls, values: DictData) -> DictData:
120
- """Rename key that use dash to underscore because Python does not
121
- support this character exist in any variable name.
122
- """
123
- dash2underscore("max-parallel", values)
124
- dash2underscore("fail-fast", values)
125
- return values
126
-
127
- def is_set(self) -> bool:
128
- """Return True if this strategy was set from yaml template."""
129
- return len(self.matrix) > 0
130
-
131
- def make(self) -> list[DictStr]:
132
- """Return List of product of matrix values that already filter with
133
- exclude and add include.
134
-
135
- :rtype: list[DictStr]
136
- """
137
- # NOTE: If the matrix is not set, return a list with one empty dict.
138
- if not (mt := self.matrix):
139
- return [{}]
140
-
141
- final: list[DictStr] = []
142
- for r in cross_product(matrix=mt):
143
- if any(
144
- all(r[k] == v for k, v in exclude.items())
145
- for exclude in self.exclude
146
- ):
147
- continue
148
- final.append(r)
149
-
150
- # NOTE: If both the generated matrix and include are empty, return a
151
- # list with one empty dict.
152
- if not final and not self.include:
153
- return [{}]
154
-
155
- # NOTE: Add include to generated matrix with exclude list.
156
- add: list[DictStr] = []
157
- for include in self.include:
158
- # VALIDATE:
159
- # Validate that the keys of each include entry are a subset of the
160
- # keys of at least one generated matrix entry.
161
- if all(not (set(include.keys()) <= set(m.keys())) for m in final):
162
- raise ValueError("Include should have the keys equal to matrix")
163
-
164
- # VALIDATE:
165
- # Validate that the include values do not duplicate an entry in the
166
- # generated matrix.
167
- if any(
168
- all(include.get(k) == v for k, v in m.items())
169
- for m in [*final, *add]
170
- ):
171
- continue
172
- add.append(include)
173
- final.extend(add)
174
- return final
175
-
176
-
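For reference, a minimal sketch of the cross-product, exclude, and include behaviour that `Strategy.make()` implements, re-created with plain `itertools` so it runs without the package installed; the variable names and the expected output shown here are illustrative, and ordering is an implementation detail.

```python
# Self-contained sketch of the matrix behaviour described above.
from itertools import product

matrix = {"first": [1, 2, 3], "second": ["foo", "bar"]}
include = [{"first": 4, "second": "foo"}]
exclude = [{"first": 1, "second": "bar"}]

# Cross product of all matrix keys.
combos = [dict(zip(matrix, values)) for values in product(*matrix.values())]

# Drop combinations that match every key of an exclude entry.
combos = [
    c for c in combos
    if not any(all(c[k] == v for k, v in ex.items()) for ex in exclude)
]

# Add include entries that do not duplicate an existing combination.
for inc in include:
    if not any(all(c.get(k) == v for k, v in inc.items()) for c in combos):
        combos.append(inc)

print(combos)
# Expected (order may differ):
# [{'first': 1, 'second': 'foo'}, {'first': 2, 'second': 'foo'},
#  {'first': 2, 'second': 'bar'}, {'first': 3, 'second': 'foo'},
#  {'first': 3, 'second': 'bar'}, {'first': 4, 'second': 'foo'}]
```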
177
- class Job(BaseModel):
178
- """Job Model (group of stages).
179
-
180
- This job model allows you to run a for-loop over a matrix strategy. If
181
- you pass a matrix mapping that can be generated, the job will run once
182
- for each matrix value.
183
-
184
- Data Validate:
185
- >>> job = {
186
- ... "runs-on": None,
187
- ... "strategy": {
188
- ... "max-parallel": 1,
189
- ... "matrix": {
190
- ... "first": [1, 2, 3],
191
- ... "second": ['foo', 'bar'],
192
- ... },
193
- ... },
194
- ... "needs": [],
195
- ... "stages": [
196
- ... {
197
- ... "name": "Some stage",
198
- ... "run": "print('Hello World')",
199
- ... },
200
- ... ...
201
- ... ],
202
- ... }
203
- """
204
-
205
- id: Optional[str] = Field(
206
- default=None,
207
- description=(
208
- "A job ID, this value will add from pipeline after validation "
209
- "process."
210
- ),
211
- )
212
- desc: Optional[str] = Field(
213
- default=None,
214
- description="A job description that can be string of markdown content.",
215
- )
216
- runs_on: Optional[str] = Field(
217
- default=None,
218
- description="A target executor node for this job use to execution.",
219
- serialization_alias="runs-on",
220
- )
221
- stages: list[Stage] = Field(
222
- default_factory=list,
223
- description="A list of Stage of this job.",
224
- )
225
- needs: list[str] = Field(
226
- default_factory=list,
227
- description="A list of the job ID that want to run before this job.",
228
- )
229
- strategy: Strategy = Field(
230
- default_factory=Strategy,
231
- description="A strategy matrix that want to generate.",
232
- )
233
- run_id: Optional[str] = Field(
234
- default=None,
235
- description="A running job ID.",
236
- repr=False,
237
- exclude=True,
238
- )
239
-
240
- @model_validator(mode="before")
241
- def __prepare_keys(cls, values: DictData) -> DictData:
242
- """Rename key that use dash to underscore because Python does not
243
- support this character exist in any variable name.
244
- """
245
- dash2underscore("runs-on", values)
246
- return values
247
-
248
- @field_validator("desc", mode="after")
249
- def ___prepare_desc(cls, value: str) -> str:
250
- """Prepare description string that was created on a template."""
251
- return dedent(value)
252
-
253
- @model_validator(mode="after")
254
- def __prepare_running_id(self):
255
- if self.run_id is None:
256
- self.run_id = gen_id(self.id or "", unique=True)
257
-
258
- # VALIDATE: Validate that the job ID is not dynamic with a params template.
259
- if has_template(self.id):
260
- raise ValueError("Job ID should not contain any template.")
261
-
262
- return self
263
-
264
- def get_running_id(self, run_id: str) -> Self:
265
- """Return Job model object that changing job running ID with an
266
- input running ID.
267
-
268
- :param run_id: A replace job running ID.
269
- :rtype: Self
270
- """
271
- return self.model_copy(update={"run_id": run_id})
272
-
273
- def stage(self, stage_id: str) -> Stage:
274
- """Return stage model that match with an input stage ID."""
275
- for stage in self.stages:
276
- if stage_id == (stage.id or ""):
277
- return stage
278
- raise ValueError(f"Stage ID {stage_id} does not exists")
279
-
280
- def set_outputs(self, output: DictData) -> DictData:
281
- """Setting output of job execution"""
282
- if len(output) > 1 and self.strategy.is_set():
283
- return {"strategies": output}
284
- return output[next(iter(output))]
285
-
286
- def execute_strategy(
287
- self,
288
- strategy: DictData,
289
- params: DictData,
290
- *,
291
- event: Event | None = None,
292
- ) -> Result:
293
- """Job Strategy execution with passing dynamic parameters from the
294
- pipeline execution to strategy matrix.
295
-
296
- This execution is the minimum level execution of job model.
297
-
298
- :param strategy: A matrix strategy value.
299
- :param params: Dynamic parameters.
300
- :param event: A manager event that is passed to the ThreadPoolExecutor.
301
- :rtype: Result
302
-
303
- :raise JobException: If it has any error from StageException or
304
- UtilException.
305
- """
306
- # NOTE: Force stop this execution if event was set from main execution.
307
- if event and event.is_set():
308
- return Result(
309
- status=1,
310
- context={
311
- gen_id(strategy): {
312
- "matrix": strategy,
313
- "stages": {},
314
- "error_message": {
315
- "message": "Process Event stopped before execution"
316
- },
317
- },
318
- },
319
- )
320
-
321
- # NOTE: Create the strategy execution context and update it with the
322
- # matrix and a copy of params, so the context value is structured like:
323
- # ---
324
- # {
325
- # "params": { ... }, <== Current input params
326
- # "jobs": { ... }, <== Current input params
327
- # "matrix": { ... } <== Current strategy value
328
- # }
329
- #
330
- context: DictData = params
331
- context.update({"matrix": strategy})
332
-
333
- # IMPORTANT: The stage execution only run sequentially one-by-one.
334
- for stage in self.stages:
335
-
336
- # IMPORTANT: Change any stage running IDs to this job running ID.
337
- stage: Stage = stage.get_running_id(self.run_id)
338
-
339
- _st_name: str = stage.id or stage.name
340
-
341
- if stage.is_skipped(params=context):
342
- logger.info(
343
- f"({self.run_id}) [JOB]: Skip the stage: {_st_name!r}"
344
- )
345
- continue
346
-
347
- logger.info(
348
- f"({self.run_id}) [JOB]: Start execute the stage: {_st_name!r}"
349
- )
350
-
351
- # NOTE: Logging a matrix that pass on this stage execution.
352
- if strategy:
353
- logger.info(f"({self.run_id}) [JOB]: Matrix: {strategy}")
354
-
355
- # NOTE:
356
- # I do not use the below syntax because the ``params`` dict is a
357
- # reference (memory pointer) and it would be changed whenever I do
358
- # anything like updating or re-constructing it.
359
- #
360
- # ... params |= stage.execute(params=params)
361
- #
362
- # This step adds the stage result to the ``stages`` key under
363
- # that stage ID. It is structured like:
364
- # ---
365
- # {
366
- # "params": { ... },
367
- # "jobs": { ... },
368
- # "matrix": { ... },
369
- # "stages": { { "stage-id-1": ... }, ... }
370
- # }
371
- #
372
- if event and event.is_set():
373
- return Result(
374
- status=1,
375
- context={
376
- gen_id(strategy): {
377
- "matrix": strategy,
378
- # NOTE: If job strategy executor use multithreading,
379
- # it will not filter function object from context.
380
- # ---
381
- # "stages": filter_func(context.pop("stages", {})),
382
- "stages": context.pop("stages", {}),
383
- "error_message": {
384
- "message": (
385
- "Process Event stopped before execution"
386
- ),
387
- },
388
- },
389
- },
390
- )
391
- try:
392
- rs: Result = stage.execute(params=context)
393
- stage.set_outputs(rs.context, to=context)
394
- except (StageException, UtilException) as err:
395
- logger.error(
396
- f"({self.run_id}) [JOB]: {err.__class__.__name__}: {err}"
397
- )
398
- raise JobException(
399
- f"Get stage execution error: {err.__class__.__name__}: "
400
- f"{err}"
401
- ) from None
402
-
403
- # NOTE: Remove new stage object that was created from
404
- # ``get_running_id`` method.
405
- del stage
406
-
407
- return Result(
408
- status=0,
409
- context={
410
- gen_id(strategy): {
411
- "matrix": strategy,
412
- # NOTE: (WF001) filter own created function from stages
413
- # value, because it does not dump with pickle when you
414
- # execute with multiprocess.
415
- #
416
- "stages": filter_func(context.pop("stages", {})),
417
- },
418
- },
419
- )
420
-
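To make the comments above concrete, here is a sketch of the shape of the `Result.context` that `execute_strategy` returns for a single matrix entry; the top-level key is produced by `gen_id(strategy)`, and the stage payload shown is a placeholder rather than real output.

```python
# Illustrative shape of Result.context returned by execute_strategy()
# for one matrix entry.  The key and stage payload are placeholders.
single_strategy_context = {
    "<gen_id(strategy)>": {
        "matrix": {"first": 1, "second": "foo"},    # the strategy that ran
        "stages": {
            "stage-id-1": {"outputs": {"x": 1}},    # per-stage outputs
        },
    },
}
```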
421
- def execute(self, params: DictData | None = None) -> Result:
422
- """Job execution with passing dynamic parameters from the pipeline
423
- execution. It will generate matrix values at the first step and for-loop
424
- any metrix to all stages dependency.
425
-
426
- :param params: An input parameters that use on job execution.
427
- :rtype: Result
428
- """
429
- context: DictData = {}
430
-
431
- # NOTE: Normal Job execution.
432
- if (not self.strategy.is_set()) or self.strategy.max_parallel == 1:
433
- for strategy in self.strategy.make():
434
- rs: Result = self.execute_strategy(
435
- strategy, params=copy.deepcopy(params)
436
- )
437
- context.update(rs.context)
438
- return Result(
439
- status=0,
440
- context=context,
441
- )
442
-
443
- # # WARNING: (WF001) I got error that raise when use
444
- # # ``ProcessPoolExecutor``;
445
- # # ---
446
- # # _pickle.PicklingError: Can't pickle
447
- # # <function ??? at 0x000001F0BE80F160>: attribute lookup ???
448
- # # on ddeutil.workflow.stage failed
449
- # #
450
- # # from multiprocessing import Event, Manager
451
- # with Manager() as manager:
452
- # event: Event = manager.Event()
453
- #
454
- # # NOTE: Start process pool executor for running strategy executor
455
- # # in parallel mode.
456
- # with ProcessPoolExecutor(
457
- # max_workers=self.strategy.max_parallel
458
- # ) as executor:
459
- # futures: list[Future] = [
460
- # executor.submit(
461
- # self.execute_strategy,
462
- # strategy,
463
- # params=copy.deepcopy(params),
464
- # event=event,
465
- # )
466
- # for strategy in self.strategy.make()
467
- # ]
468
- # if self.strategy.fail_fast:
469
- # rs = self.__catch_fail_fast(event, futures)
470
- # else:
471
- # rs = self.__catch_all_completed(futures)
472
-
473
- # NOTE: Create an event for cancelling the executor and stopping running strategies.
474
- event: Event = Event()
475
-
476
- with ThreadPoolExecutor(
477
- max_workers=self.strategy.max_parallel
478
- ) as executor:
479
- futures: list[Future] = [
480
- executor.submit(
481
- self.execute_strategy,
482
- strategy,
483
- params=copy.deepcopy(params),
484
- event=event,
485
- )
486
- for strategy in self.strategy.make()
487
- ]
488
-
489
- # NOTE: Dynamic catching futures object with fail-fast flag.
490
- if self.strategy.fail_fast:
491
- rs: Result = self.__catch_fail_fast(event, futures)
492
- else:
493
- rs: Result = self.__catch_all_completed(futures)
494
- return Result(
495
- status=0,
496
- context=rs.context,
497
- )
498
-
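A minimal usage sketch, assuming the package is installed and that `Job` and `Stage` accept the fields shown in the Data Validate docstring above; the parameter shape mirrors what `Pipeline.parameterize()` builds, so the exact keys passed here are an assumption.

```python
# Hypothetical usage: validate a Job from plain data (mirroring the Data
# Validate docstring above) and run it with an already-parameterized context.
from ddeutil.workflow.pipeline import Job

job = Job.model_validate(
    {
        "strategy": {"max-parallel": 1, "matrix": {"first": [1, 2]}},
        "stages": [{"name": "Some stage", "run": "print('Hello World')"}],
    }
)
rs = job.execute(params={"params": {}, "jobs": {}})
print(rs.status)          # 0 on success
print(list(rs.context))   # one gen_id(strategy) key per matrix entry
```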
499
- def __catch_fail_fast(self, event: Event, futures: list[Future]) -> Result:
500
- """Job parallel pool futures catching with fail-fast mode. That will
501
- stop all not done futures if it receive the first exception from all
502
- running futures.
503
-
504
- :param event: An event used to signal running strategies to stop.
505
- :param futures: A list of futures.
506
- :rtype: Result
507
- """
508
- context: DictData = {}
509
- # NOTE: Get results from a collection of tasks with a
510
- # timeout that has the first exception.
511
- done, not_done = wait(
512
- futures, timeout=1800, return_when=FIRST_EXCEPTION
513
- )
514
- nd: str = (
515
- f", the strategies do not run is {not_done}" if not_done else ""
516
- )
517
- logger.debug(f"({self.run_id}) [JOB]: Strategy is set Fail Fast{nd}")
518
-
519
- if len(done) != len(futures):
520
-
521
- # NOTE: Stop all running tasks
522
- event.set()
523
-
524
- # NOTE: Cancel any scheduled tasks
525
- for future in futures:
526
- future.cancel()
527
-
528
- status: int = 0
529
- for future in done:
530
- if future.exception():
531
- status = 1
532
- logger.error(
533
- f"({self.run_id}) [JOB]: One stage failed with: "
534
- f"{future.exception()}, shutting down this future."
535
- )
536
- elif future.cancelled():
537
- continue
538
- else:
539
- rs: Result = future.result(timeout=60)
540
- context.update(rs.context)
541
- return Result(status=status, context=context)
542
-
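The fail-fast handling above is the standard `concurrent.futures` pattern: wait for the first exception, signal an event so cooperative workers stop, then cancel whatever has not started yet. A self-contained sketch of that pattern, independent of this module:

```python
# Stand-alone illustration of the fail-fast pattern used above.
from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
from threading import Event
import time

stop = Event()

def work(n: int) -> int:
    if n == 3:
        raise RuntimeError("boom")
    for _ in range(10):
        if stop.is_set():
            return -1          # cooperative early exit
        time.sleep(0.05)
    return n

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(work, n) for n in range(6)]
    done, not_done = wait(futures, return_when=FIRST_EXCEPTION)
    if len(done) != len(futures):
        stop.set()                      # stop running workers
        for f in not_done:
            f.cancel()                  # cancel queued workers
    for f in done:
        print(f.exception() or f.result())
```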
543
- def __catch_all_completed(self, futures: list[Future]) -> Result:
544
- """Job parallel pool futures catching with all-completed mode.
545
-
546
- :param futures: A list of futures.
547
- :rtype: Result
548
- """
549
- context: DictData = {}
550
- status: int = 0
551
- for future in as_completed(futures):
552
- try:
553
- rs: Result = future.result(timeout=60)
554
- context.update(rs.context)
555
- except PickleError as err:
556
- # NOTE: (WF001) I do not want to fix this issue because it does
557
- # not make sense, and the bug-fix process would be
558
- # over-engineering.
559
- raise JobException(
560
- f"PyStage that create object on locals does use "
561
- f"parallel in strategy execution;\n\t{err}"
562
- ) from None
563
- except TimeoutError:
564
- status = 1
565
- logger.warning(
566
- f"({self.run_id}) [JOB]: Task is hanging. Attempting to "
567
- f"kill."
568
- )
569
- future.cancel()
570
- time.sleep(0.1)
571
- if not future.cancelled():
572
- logger.warning(
573
- f"({self.run_id}) [JOB]: Failed to cancel the task."
574
- )
575
- else:
576
- logger.warning(
577
- f"({self.run_id}) [JOB]: Task canceled successfully."
578
- )
579
- except JobException as err:
580
- status = 1
581
- logger.error(
582
- f"({self.run_id}) [JOB]: Get stage exception with "
583
- f"fail-fast does not set;\n{err.__class__.__name__}:\n\t"
584
- f"{err}"
585
- )
586
- return Result(status=status, context=context)
587
-
588
-
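Unlike `Strategy` and `Job`, the `Pipeline` model below carries no Data Validate example in its docstring; a minimal sketch of a payload it could validate, assuming only the field definitions shown below (the param type and job contents are illustrative):

```python
# Hypothetical Pipeline payload built from the field definitions below.
pipeline_data = {
    "name": "example-pipeline",
    "desc": "A demo pipeline.",
    "params": {"run-date": "datetime"},     # short form: value is the type
    "jobs": {
        "first-job": {
            "stages": [{"name": "Some stage", "run": "print('Hello World')"}],
        },
        "second-job": {"needs": ["first-job"], "stages": []},
    },
}
# Pipeline.model_validate(pipeline_data) would then build the model.
```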
589
- class Pipeline(BaseModel):
590
- """Pipeline Model this is the main future of this project because it use to
591
- be workflow data for running everywhere that you want or using it to
592
- scheduler task in background. It use lightweight coding line from Pydantic
593
- Model and enhance execute method on it.
594
- """
595
-
596
- name: str = Field(description="A pipeline name.")
597
- desc: Optional[str] = Field(
598
- default=None,
599
- description=(
600
- "A pipeline description that can be string of markdown content."
601
- ),
602
- )
603
- params: dict[str, Param] = Field(
604
- default_factory=dict,
605
- description="A parameters that want to use on this pipeline.",
606
- )
607
- on: list[On] = Field(
608
- default_factory=list,
609
- description="A list of On instance for this pipeline schedule.",
610
- )
611
- jobs: dict[str, Job] = Field(
612
- default_factory=dict,
613
- description="A mapping of job ID and job model that already loaded.",
614
- )
615
- run_id: Optional[str] = Field(
616
- default=None,
617
- description="A running pipeline ID.",
618
- repr=False,
619
- exclude=True,
620
- )
621
-
622
- @property
623
- def new_run_id(self) -> str:
624
- """Running ID of this pipeline that always generate new unique value."""
625
- return gen_id(self.name, unique=True)
626
-
627
- @classmethod
628
- def from_loader(
629
- cls,
630
- name: str,
631
- externals: DictData | None = None,
632
- ) -> Self:
633
- """Create Pipeline instance from the Loader object that only receive
634
- an input pipeline name. The loader object will use this pipeline name to
635
- searching configuration data of this pipeline model in conf path.
636
-
637
- :param name: A pipeline name to pass to the Loader object.
638
- :param externals: External parameters to pass to the Loader
639
- object.
640
- :rtype: Self
641
- """
642
- loader: Loader = Loader(name, externals=(externals or {}))
643
-
644
- # NOTE: Validate the config type match with current connection model
645
- if loader.type != cls:
646
- raise ValueError(f"Type {loader.type} does not match with {cls}")
647
-
648
- loader_data: DictData = copy.deepcopy(loader.data)
649
-
650
- # NOTE: Add name to loader data
651
- loader_data["name"] = name.replace(" ", "_")
652
-
653
- # NOTE: Prepare `on` data
654
- cls.__bypass_on(loader_data)
655
- return cls.model_validate(obj=loader_data)
656
-
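A usage sketch for `from_loader`, assuming a pipeline named `my-pipeline` (a placeholder) is already declared in the configuration path that `Loader` searches:

```python
# Hypothetical: load a pipeline declared in the Loader's configuration path.
from ddeutil.workflow.pipeline import Pipeline

pipeline = Pipeline.from_loader(name="my-pipeline", externals={})
print(pipeline.name)        # "my-pipeline" (spaces would become underscores)
print(list(pipeline.jobs))  # job IDs defined in the configuration
```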
657
- @classmethod
658
- def __bypass_on(cls, data: DictData, externals: DictData | None = None):
659
- """Bypass the on data to loaded config data."""
660
- if on := data.pop("on", []):
661
- if isinstance(on, str):
662
- on = [on]
663
- if any(not isinstance(i, (dict, str)) for i in on):
664
- raise TypeError("The ``on`` key should be list of str or dict")
665
-
666
- # NOTE: Pass on value to Loader and keep on model object to on field
667
- data["on"] = [
668
- (
669
- Loader(n, externals=(externals or {})).data
670
- if isinstance(n, str)
671
- else n
672
- )
673
- for n in on
674
- ]
675
- return data
676
-
677
- @model_validator(mode="before")
678
- def __prepare_params(cls, values: DictData) -> DictData:
679
- """Prepare the params key."""
680
- # NOTE: Prepare params type if it passing with only type value.
681
- if params := values.pop("params", {}):
682
- values["params"] = {
683
- p: (
684
- {"type": params[p]}
685
- if isinstance(params[p], str)
686
- else params[p]
687
- )
688
- for p in params
689
- }
690
- return values
691
-
692
- @field_validator("desc", mode="after")
693
- def ___prepare_desc(cls, value: str) -> str:
694
- """Prepare description string that was created on a template."""
695
- return dedent(value)
696
-
697
- @model_validator(mode="after")
698
- def __validate_jobs_need_and_prepare_running_id(self):
699
- """Validate each need job in any jobs should exists."""
700
- for job in self.jobs:
701
- if not_exist := [
702
- need for need in self.jobs[job].needs if need not in self.jobs
703
- ]:
704
- raise PipelineException(
705
- f"This needed jobs: {not_exist} do not exist in this "
706
- f"pipeline, {self.name!r}"
707
- )
708
-
709
- # NOTE: update a job id with its job id from pipeline template
710
- self.jobs[job].id = job
711
-
712
- if self.run_id is None:
713
- self.run_id = self.new_run_id
714
-
715
- # VALIDATE: Validate that the pipeline name is not dynamic with a
716
- # params template.
717
- if has_template(self.name):
718
- raise ValueError(
719
- f"Pipeline name should not has any template, please check, "
720
- f"{self.name!r}."
721
- )
722
-
723
- return self
724
-
725
- def get_running_id(self, run_id: str) -> Self:
726
- """Return Pipeline model object that changing pipeline running ID with
727
- an input running ID.
728
-
729
- :param run_id: A replace pipeline running ID.
730
- :rtype: Self
731
- """
732
- return self.model_copy(update={"run_id": run_id})
733
-
734
- def job(self, name: str) -> Job:
735
- """Return Job model that exists on this pipeline.
736
-
737
- :param name: A job name that want to get from a mapping of job models.
738
- :type name: str
739
-
740
- :rtype: Job
741
- :returns: A job model that exists on this pipeline by input name.
742
- """
743
- if name not in self.jobs:
744
- raise ValueError(
745
- f"A Job {name!r} does not exists in this pipeline, "
746
- f"{self.name!r}"
747
- )
748
- return self.jobs[name]
749
-
750
- def parameterize(self, params: DictData) -> DictData:
751
- """Prepare parameters before passing to execution process. This method
752
- will create jobs key to params mapping that will keep any result from
753
- job execution.
754
-
755
- :param params: A parameter mapping that receive from pipeline execution.
756
- :rtype: DictData
757
- """
758
- # VALIDATE: Incoming params should have keys that set on this pipeline.
759
- if check_key := tuple(
760
- f"{k!r}"
761
- for k in self.params
762
- if (k not in params and self.params[k].required)
763
- ):
764
- raise PipelineException(
765
- f"Required Param on this pipeline setting does not set: "
766
- f"{', '.join(check_key)}."
767
- )
768
-
769
- # NOTE: mapping type of param before adding it to params variable.
770
- return {
771
- "params": (
772
- params
773
- | {
774
- k: self.params[k].receive(params[k])
775
- for k in params
776
- if k in self.params
777
- }
778
- ),
779
- "jobs": {},
780
- }
781
-
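A sketch of the mapping that `parameterize` returns, following the dict literal above; how `Param.receive` converts each value depends on the declared param type, so the conversion comment is an assumption.

```python
# Illustrative input/output for parameterize().
incoming = {"run-date": "2024-01-01"}

parameterized = {
    "params": {"run-date": "2024-01-01"},  # possibly converted, e.g. to datetime
    "jobs": {},                            # filled in as jobs finish
}
```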
782
- def release(
783
- self,
784
- on: On,
785
- params: DictData,
786
- queue: list[datetime],
787
- *,
788
- waiting_sec: int = 60,
789
- sleep_interval: int = 15,
790
- log: Log = None,
791
- ) -> Result:
792
- """Start running pipeline with the on schedule in period of 30 minutes.
793
- That mean it will still running at background 30 minutes until the
794
- schedule matching with its time.
795
-
796
- This method allow pipeline use log object to save the execution
797
- result to log destination like file log to local `/logs` directory.
798
-
799
- :param on: An on schedule value.
800
- :param params: Pipeline parameters that are passed to the execute method.
801
- :param queue: A list of release times that are already running.
802
- :param waiting_sec: A waiting period in seconds that allows the pipeline to execute.
803
- :param sleep_interval: A number of seconds to sleep while waiting for the
804
- time to execute.
805
- :param log: A log object used to save the execution result.
806
- :rtype: Result
807
- """
808
- log: Log = log or FileLog
809
- tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
810
- gen: CronRunner = on.generate(
811
- datetime.now(tz=tz).replace(second=0, microsecond=0)
812
- + timedelta(seconds=1)
813
- )
814
- cron_tz: ZoneInfo = gen.tz
815
-
816
- # NOTE: get next schedule time that generate from now.
817
- next_time: datetime = gen.next
818
-
819
- # NOTE: Get the next time until it is not already pointed in the log.
820
- while log.is_pointed(self.name, next_time, queue=queue):
821
- next_time: datetime = gen.next
822
-
823
- # NOTE: push this next running time to log queue
824
- heappush(queue, next_time)
825
-
826
- # VALIDATE: Check that the time difference between the next schedule
827
- # time and now is less than the waiting period (in seconds).
828
- if get_diff_sec(next_time, tz=cron_tz) > waiting_sec:
829
- logger.debug(
830
- f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
831
- f"Does not closely >> {next_time:%Y-%m-%d %H:%M:%S}"
832
- )
833
-
834
- # NOTE: Remove next datetime from queue.
835
- queue.remove(next_time)
836
-
837
- time.sleep(0.15)
838
- return Result(
839
- status=0,
840
- context={
841
- "params": params,
842
- "poking": {"skipped": [str(on.cronjob)], "run": []},
843
- },
844
- )
845
-
846
- logger.debug(
847
- f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
848
- f"Closely to run >> {next_time:%Y-%m-%d %H:%M:%S}"
849
- )
850
-
851
- # NOTE: Release when the time is near the schedule time.
852
- while (duration := get_diff_sec(next_time, tz=cron_tz)) > (
853
- sleep_interval + 5
854
- ):
855
- logger.debug(
856
- f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
857
- f"Sleep until: {duration}"
858
- )
859
- time.sleep(sleep_interval)
860
-
861
- time.sleep(0.5)
862
-
863
- # NOTE: Release parameters that are used for substitution when params
864
- # contain templating.
865
- release_params: DictData = {
866
- "release": {
867
- "logical_date": next_time,
868
- },
869
- }
870
-
871
- # WARNING: Re-create pipeline object that use new running pipeline
872
- # ID.
873
- runner: Self = self.get_running_id(run_id=self.new_run_id)
874
- rs: Result = runner.execute(
875
- params=param2template(params, release_params),
876
- )
877
- logger.debug(
878
- f"({runner.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
879
- f"End release {next_time:%Y-%m-%d %H:%M:%S}"
880
- )
881
-
882
- # NOTE: Delete a copied pipeline instance for saving memory.
883
- del runner
884
-
885
- rs.set_parent_run_id(self.run_id)
886
- rs_log: Log = log.model_validate(
887
- {
888
- "name": self.name,
889
- "on": str(on.cronjob),
890
- "release": next_time,
891
- "context": rs.context,
892
- "parent_run_id": rs.run_id,
893
- "run_id": rs.run_id,
894
- }
895
- )
896
- # NOTE: Saving execution result to destination of the input log object.
897
- rs_log.save(excluded=None)
898
-
899
- queue.remove(next_time)
900
- time.sleep(0.05)
901
- return Result(
902
- status=0,
903
- context={
904
- "params": params,
905
- "poking": {"skipped": [], "run": [str(on.cronjob)]},
906
- },
907
- )
908
-
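The waiting loop in `release` reduces to a coarse poll-and-sleep pattern; a stand-alone sketch with a plain countdown standing in for `CronRunner` and `get_diff_sec`:

```python
# Stand-alone sketch of the release waiting loop: sleep in coarse steps while
# the next run time is far away, then settle just before firing.
import time

deadline = time.monotonic() + 12.0       # pretend the next run is 12s away

def next_run_in() -> float:
    """Seconds until the next scheduled run (stands in for get_diff_sec)."""
    return max(deadline - time.monotonic(), 0.0)

sleep_interval = 5
while (duration := next_run_in()) > sleep_interval + 5:
    print(f"sleep, {duration:.0f}s to go")
    time.sleep(sleep_interval)
time.sleep(0.5)                          # settle just before firing
print("release now")
```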
909
- def poke(
910
- self,
911
- params: DictData | None = None,
912
- *,
913
- log: Log | None = None,
914
- ) -> list[Result]:
915
- """Poke pipeline with threading executor pool for executing with all its
916
- schedules that was set on the `on` value. This method will observe its
917
- schedule that nearing to run with the ``self.release()`` method.
918
-
919
- :param params: Parameters to pass to the release method.
920
- :param log: A log object to use in this poking process.
921
- :rtype: list[Result]
922
- """
923
- logger.info(
924
- f"({self.run_id}) [POKING]: Start Poking: {self.name!r} ..."
925
- )
926
-
927
- # NOTE: If this pipeline does not set any on schedule, it returns an
928
- # empty result.
929
- if len(self.on) == 0:
930
- return []
931
-
932
- params: DictData = params or {}
933
- queue: list[datetime] = []
934
- results: list[Result] = []
935
-
936
- wk: int = int(os.getenv("WORKFLOW_CORE_MAX_PIPELINE_POKING") or "4")
937
- with ThreadPoolExecutor(max_workers=wk) as executor:
938
- # TODO: If I want to run infinite loop.
939
- futures: list[Future] = []
940
- for on in self.on:
941
- futures.append(
942
- executor.submit(
943
- self.release,
944
- on,
945
- params=params,
946
- log=log,
947
- queue=queue,
948
- )
949
- )
950
- delay()
951
-
952
- # WARNING: This poking method does not allow fail-fast logic for
953
- # catching the parallel execution results.
954
- for future in as_completed(futures):
955
- results.append(future.result(timeout=60))
956
-
957
- if len(queue) > 0:
958
- logger.error(
959
- f"({self.run_id}) [POKING]: Log Queue does empty when poking "
960
- f"process was finishing."
961
- )
962
-
963
- return results
964
-
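A usage sketch for poking, assuming the loaded configuration declares at least one `on` schedule (with none, `poke` returns an empty list as noted above); the pipeline name and params are placeholders:

```python
# Hypothetical: poke every schedule declared on a loaded pipeline.
from ddeutil.workflow.pipeline import Pipeline

pipeline = Pipeline.from_loader(name="my-pipeline", externals={})
results = pipeline.poke(params={"run-date": "2024-01-01"})
for rs in results:
    print(rs.status, rs.context.get("poking"))
```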
965
- def execute_job(
966
- self,
967
- job: str,
968
- params: DictData,
969
- ) -> Result:
970
- """Job Executor that use on pipeline executor.
971
-
972
- :param job: A job ID to execute.
973
- :param params: Params that were parameterized from the pipeline execution.
974
- :rtype: Result
975
- """
976
- # VALIDATE: Check whether the job ID exists in this pipeline or not.
977
- if job not in self.jobs:
978
- raise PipelineException(
979
- f"The job ID: {job} does not exists on {self.name!r} pipeline."
980
- )
981
- try:
982
- logger.info(f"({self.run_id}) [PIPELINE]: Start execute: {job!r}")
983
-
984
- # IMPORTANT:
985
- # Change any job running IDs to this pipeline running ID.
986
- job_obj: Job = self.jobs[job].get_running_id(self.run_id)
987
- j_rs: Result = job_obj.execute(params=params)
988
-
989
- except JobException as err:
990
- raise PipelineException(f"{job}: JobException: {err}") from None
991
-
992
- return Result(
993
- status=j_rs.status,
994
- context={job: job_obj.set_outputs(j_rs.context)},
995
- )
996
-
997
- def execute(
998
- self,
999
- params: DictData | None = None,
1000
- *,
1001
- timeout: int = 60,
1002
- ) -> Result:
1003
- """Execute pipeline with passing dynamic parameters to any jobs that
1004
- included in the pipeline.
1005
-
1006
- :param params: Input parameters used for the pipeline execution; they are
1007
- parameterized before use.
1008
- :param timeout: A pipeline execution timeout in seconds, used to limit
1009
- the execution time and the wait on job dependencies.
1010
- :rtype: Result
1011
-
1012
- See Also:
1013
- ---
1014
-
1015
- The execution result of every job and stage in this pipeline is kept
1016
- in a dict, from which all jobs and stages can be read out with dot
1017
- notation.
1018
-
1019
- For example, when I want to use the output from the previous stage, I
1020
- can access it with this syntax:
1021
-
1022
- ... ${job-name}.stages.${stage-id}.outputs.${key}
1023
-
1024
- """
1025
- logger.info(f"({self.run_id}) [CORE]: Start Execute: {self.name!r} ...")
1026
- params: DictData = params or {}
1027
- ts: float = time.monotonic()
1028
-
1029
- # NOTE: It should not do anything if it does not have any jobs.
1030
- if not self.jobs:
1031
- logger.warning(
1032
- f"({self.run_id}) [PIPELINE]: This pipeline: {self.name!r} "
1033
- f"does not have any jobs"
1034
- )
1035
- return Result(status=0, context=params)
1036
-
1037
- # NOTE: Create a job queue that keeps jobs waiting to run until their
1038
- # dependency conditions are met.
1039
- jq: Queue = Queue()
1040
- for job_id in self.jobs:
1041
- jq.put(job_id)
1042
-
1043
- # NOTE: Create the result context that is passed down to every
1044
- # execution dependency.
1045
- context: DictData = self.parameterize(params)
1046
- try:
1047
- worker: int = int(os.getenv("WORKFLOW_CORE_MAX_JOB_PARALLEL", "2"))
1048
- (
1049
- self.__exec_non_threading(context, ts, jq, timeout=timeout)
1050
- if worker == 1
1051
- else self.__exec_threading(
1052
- context, ts, jq, worker=worker, timeout=timeout
1053
- )
1054
- )
1055
- return Result(status=0, context=context)
1056
- except PipelineException as err:
1057
- context.update(
1058
- {"error_message": f"{err.__class__.__name__}: {err}"}
1059
- )
1060
- return Result(status=1, context=context)
1061
-
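Putting the See Also note into code: a sketch of executing a loaded pipeline and reading a stage output back out of the result context; the nested key layout follows the comments in `execute_strategy` and `execute_job`, so the exact path is an assumption, and the job/stage IDs are placeholders.

```python
# Hypothetical: execute a loaded pipeline, then dig a stage output out of the
# nested result context (jobs -> job -> stages -> stage-id -> outputs -> key).
from ddeutil.workflow.pipeline import Pipeline

pipeline = Pipeline.from_loader(name="my-pipeline", externals={})
rs = pipeline.execute(params={"run-date": "2024-01-01"}, timeout=120)

jobs = rs.context["jobs"]
value = jobs["first-job"]["stages"]["stage-id-1"]["outputs"]["key"]
print(rs.status, value)
```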
1062
- def __exec_threading(
1063
- self,
1064
- context: DictData,
1065
- ts: float,
1066
- job_queue: Queue,
1067
- *,
1068
- worker: int = 2,
1069
- timeout: int = 600,
1070
- ) -> DictData:
1071
- """Pipeline threading execution.
1072
-
1073
- :param context: Pipeline context data to pass downstream.
1074
- :param ts: A start timestamp used to check whether the execution time
1075
- has reached the timeout.
1076
- :param timeout: A timeout in seconds that bounds the running time.
1077
- :param worker: The thread pool size of the threading executor.
1078
- :rtype: DictData
1079
- """
1080
- not_time_out_flag: bool = True
1081
- logger.debug(
1082
- f"({self.run_id}): [CORE]: Run {self.name} with threading job "
1083
- f"executor"
1084
- )
1085
-
1086
- # IMPORTANT: The job execution can run in parallel while waiting on
1087
- # its needed jobs.
1088
- with ThreadPoolExecutor(max_workers=worker) as executor:
1089
- futures: list[Future] = []
1090
-
1091
- while not job_queue.empty() and (
1092
- not_time_out_flag := ((time.monotonic() - ts) < timeout)
1093
- ):
1094
- job_id: str = job_queue.get()
1095
- job: Job = self.jobs[job_id]
1096
-
1097
- if any(need not in context["jobs"] for need in job.needs):
1098
- job_queue.put(job_id)
1099
- time.sleep(0.25)
1100
- continue
1101
-
1102
- futures.append(
1103
- executor.submit(
1104
- self.execute_job,
1105
- job_id,
1106
- params=copy.deepcopy(context),
1107
- ),
1108
- )
1109
- job_queue.task_done()
1110
-
1111
- # NOTE: Wait for all items to finish processing
1112
- job_queue.join()
1113
-
1114
- for future in as_completed(futures):
1115
- if err := future.exception():
1116
- logger.error(f"{err}")
1117
- raise PipelineException(f"{err}")
1118
-
1119
- # NOTE: Update job result to pipeline result.
1120
- context["jobs"].update(future.result(timeout=20).conext)
1121
-
1122
- if not_time_out_flag:
1123
- return context
1124
-
1125
- # NOTE: Raise timeout error.
1126
- logger.warning(
1127
- f"({self.run_id}) [PIPELINE]: Execution of pipeline, {self.name!r} "
1128
- f", was timeout"
1129
- )
1130
- raise PipelineException(
1131
- f"Execution of pipeline: {self.name} was timeout"
1132
- )
1133
-
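The queue handling above and in `__exec_non_threading` below (putting a job back when its `needs` are not finished yet) is a simple dependency-resolution loop; a self-contained sketch:

```python
# Stand-alone sketch of the needs-based re-queue loop used by both executors:
# a job goes back to the queue until everything it needs has finished.
from queue import Queue

needs = {"a": [], "b": ["a"], "c": ["a", "b"]}
finished: dict[str, dict] = {}

q: Queue[str] = Queue()
for job_id in needs:
    q.put(job_id)

while not q.empty():
    job_id = q.get()
    if any(n not in finished for n in needs[job_id]):
        q.put(job_id)                     # dependencies not done yet, retry later
        q.task_done()
        continue
    finished[job_id] = {"ok": True}       # stand-in for execute_job(...)
    q.task_done()

print(list(finished))                     # ['a', 'b', 'c']
```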
1134
- def __exec_non_threading(
1135
- self,
1136
- context: DictData,
1137
- ts: float,
1138
- job_queue: Queue,
1139
- *,
1140
- timeout: int = 600,
1141
- ) -> DictData:
1142
- """Pipeline non-threading execution that use sequential job running
1143
- and waiting previous run successful.
1144
-
1145
- :param context: Pipeline context data to pass downstream.
1146
- :param ts: A start timestamp used to check whether the execution time
1147
- has reached the timeout.
1148
- :param timeout: A timeout in seconds that bounds the running time.
1149
- :rtype: DictData
1150
- """
1151
- not_time_out_flag: bool = True
1152
- logger.debug(
1153
- f"({self.run_id}) [CORE]: Run {self.name} with non-threading job "
1154
- f"executor"
1155
- )
1156
-
1157
- while not job_queue.empty() and (
1158
- not_time_out_flag := ((time.monotonic() - ts) < timeout)
1159
- ):
1160
- job_id: str = job_queue.get()
1161
- job: Job = self.jobs[job_id]
1162
-
1163
- # NOTE:
1164
- if any(need not in context["jobs"] for need in job.needs):
1165
- job_queue.put(job_id)
1166
- time.sleep(0.25)
1167
- continue
1168
-
1169
- # NOTE: Start job execution.
1170
- job_rs = self.execute_job(job_id, params=copy.deepcopy(context))
1171
- context["jobs"].update(job_rs.context)
1172
- job_queue.task_done()
1173
-
1174
- # NOTE: Wait for all items to finish processing
1175
- job_queue.join()
1176
-
1177
- if not_time_out_flag:
1178
- return context
1179
-
1180
- # NOTE: Raise timeout error.
1181
- logger.warning(
1182
- f"({self.run_id}) [PIPELINE]: Execution of pipeline was timeout"
1183
- )
1184
- raise PipelineException(
1185
- f"Execution of pipeline: {self.name} was timeout"
1186
- )
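Finally, the executor behaviour in this module is tuned through the environment variables it reads (`WORKFLOW_CORE_TIMEZONE`, `WORKFLOW_CORE_MAX_JOB_PARALLEL`, `WORKFLOW_CORE_MAX_PIPELINE_POKING`); a small sketch of setting them before running, with illustrative values rather than defaults:

```python
# Illustrative: configure the executor knobs read by this module before
# running a pipeline.  Values are examples, not defaults.
import os

os.environ["WORKFLOW_CORE_TIMEZONE"] = "Asia/Bangkok"
os.environ["WORKFLOW_CORE_MAX_JOB_PARALLEL"] = "1"      # 1 = sequential jobs
os.environ["WORKFLOW_CORE_MAX_PIPELINE_POKING"] = "4"   # poking thread pool size

from ddeutil.workflow.pipeline import Pipeline

results = Pipeline.from_loader(name="my-pipeline").poke()
```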