ddeutil-workflow 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,39 +11,47 @@ import logging
11
11
  import os
12
12
  import time
13
13
  from collections.abc import Iterator
14
- from concurrent.futures import Future, ProcessPoolExecutor, as_completed
15
- from dataclasses import dataclass
14
+ from concurrent.futures import (
15
+ Future,
16
+ ProcessPoolExecutor,
17
+ ThreadPoolExecutor,
18
+ as_completed,
19
+ )
20
+ from dataclasses import dataclass, field
16
21
  from datetime import datetime, timedelta
17
22
  from functools import wraps
18
23
  from heapq import heappush
24
+ from queue import Queue
25
+ from textwrap import dedent
19
26
  from threading import Thread
20
27
  from typing import Optional
21
28
  from zoneinfo import ZoneInfo
22
29
 
23
30
  from dotenv import load_dotenv
24
31
  from pydantic import BaseModel, Field
25
- from pydantic.functional_validators import model_validator
32
+ from pydantic.functional_validators import field_validator, model_validator
26
33
  from typing_extensions import Self
27
34
 
28
35
  try:
29
- from schedule import CancelJob, Scheduler
36
+ from schedule import CancelJob
30
37
  except ImportError:
31
- raise ImportError(
32
- "Should install schedule package before use this module."
33
- ) from None
38
+ CancelJob = None
34
39
 
35
40
  from .__types import DictData
36
41
  from .cron import CronRunner
37
- from .exceptions import WorkflowException
42
+ from .exceptions import JobException, WorkflowException
43
+ from .job import Job
38
44
  from .log import FileLog, Log, get_logger
39
45
  from .on import On
40
- from .pipeline import Pipeline
41
46
  from .utils import (
42
47
  Loader,
48
+ Param,
43
49
  Result,
44
50
  batch,
45
51
  delay,
52
+ gen_id,
46
53
  get_diff_sec,
54
+ has_template,
47
55
  param2template,
48
56
  )
49
57
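The import hunk above also changes how the optional `schedule` dependency is handled: instead of raising `ImportError` at import time, the module now falls back to `CancelJob = None` and defers the hard requirement to `workflow_control()` further down in this diff. A minimal sketch of that pattern follows; the `control()` wrapper is only an illustrative stand-in, not the library's function.

```python
# Sketch of the optional-dependency pattern introduced above; control() is a
# hypothetical stand-in for the real workflow_control() shown later.
try:
    from schedule import CancelJob
except ImportError:
    CancelJob = None  # keep the module importable without `schedule`


def control() -> None:
    # The hard requirement is only enforced where the scheduler is used.
    try:
        from schedule import Scheduler
    except ImportError:
        raise ImportError(
            "Should install schedule package before use this module."
        ) from None
    Scheduler().run_pending()
```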
 
@@ -53,23 +61,630 @@ logging.getLogger("schedule").setLevel(logging.INFO)
53
61
 
54
62
 
55
63
  __all__ = (
56
- "PipelineSchedule",
64
+ "Workflow",
65
+ "WorkflowSchedule",
66
+ "WorkflowTask",
57
67
  "Schedule",
58
- "workflow",
68
+ "workflow_runner",
69
+ "workflow_task",
59
70
  )
60
71
 
61
72
 
62
- class PipelineSchedule(BaseModel):
63
- """Pipeline schedule Pydantic Model."""
73
+ class Workflow(BaseModel):
74
+ """Workflow Model this is the main future of this project because it use to
75
+ be workflow data for running everywhere that you want or using it to
76
+ scheduler task in background. It use lightweight coding line from Pydantic
77
+ Model and enhance execute method on it.
78
+ """
64
79
 
65
- name: str = Field(description="A pipeline name.")
80
+ name: str = Field(description="A workflow name.")
81
+ desc: Optional[str] = Field(
82
+ default=None,
83
+ description=(
84
+ "A workflow description that can be string of markdown content."
85
+ ),
86
+ )
87
+ params: dict[str, Param] = Field(
88
+ default_factory=dict,
89
+ description="A parameters that want to use on this workflow.",
90
+ )
91
+ on: list[On] = Field(
92
+ default_factory=list,
93
+ description="A list of On instance for this workflow schedule.",
94
+ )
95
+ jobs: dict[str, Job] = Field(
96
+ default_factory=dict,
97
+ description="A mapping of job ID and job model that already loaded.",
98
+ )
99
+ run_id: Optional[str] = Field(
100
+ default=None,
101
+ description="A running workflow ID.",
102
+ repr=False,
103
+ exclude=True,
104
+ )
105
+
106
+ @property
107
+ def new_run_id(self) -> str:
108
+ """Running ID of this workflow that always generate new unique value."""
109
+ return gen_id(self.name, unique=True)
110
+
111
+ @classmethod
112
+ def from_loader(
113
+ cls,
114
+ name: str,
115
+ externals: DictData | None = None,
116
+ ) -> Self:
117
+ """Create Workflow instance from the Loader object that only receive
118
+ an input workflow name. The loader object will use this workflow name to
119
+ searching configuration data of this workflow model in conf path.
120
+
121
+ :param name: A workflow name that want to pass to Loader object.
122
+ :param externals: An external parameters that want to pass to Loader
123
+ object.
124
+ :rtype: Self
125
+ """
126
+ loader: Loader = Loader(name, externals=(externals or {}))
127
+
128
+ # NOTE: Validate the config type match with current connection model
129
+ if loader.type != cls:
130
+ raise ValueError(f"Type {loader.type} does not match with {cls}")
131
+
132
+ loader_data: DictData = copy.deepcopy(loader.data)
133
+
134
+ # NOTE: Add name to loader data
135
+ loader_data["name"] = name.replace(" ", "_")
136
+
137
+ # NOTE: Prepare `on` data
138
+ cls.__bypass_on(loader_data)
139
+ return cls.model_validate(obj=loader_data)
140
+
141
+ @classmethod
142
+ def __bypass_on(cls, data: DictData, externals: DictData | None = None):
143
+ """Bypass the on data to loaded config data."""
144
+ if on := data.pop("on", []):
145
+ if isinstance(on, str):
146
+ on = [on]
147
+ if any(not isinstance(i, (dict, str)) for i in on):
148
+ raise TypeError("The ``on`` key should be list of str or dict")
149
+
150
+ # NOTE: Pass on value to Loader and keep on model object to on field
151
+ data["on"] = [
152
+ (
153
+ Loader(n, externals=(externals or {})).data
154
+ if isinstance(n, str)
155
+ else n
156
+ )
157
+ for n in on
158
+ ]
159
+ return data
160
+
161
+ @model_validator(mode="before")
162
+ def __prepare_params(cls, values: DictData) -> DictData:
163
+ """Prepare the params key."""
164
+ # NOTE: Prepare params type if it passing with only type value.
165
+ if params := values.pop("params", {}):
166
+ values["params"] = {
167
+ p: (
168
+ {"type": params[p]}
169
+ if isinstance(params[p], str)
170
+ else params[p]
171
+ )
172
+ for p in params
173
+ }
174
+ return values
175
+
176
+ @field_validator("desc", mode="after")
177
+ def ___prepare_desc(cls, value: str) -> str:
178
+ """Prepare description string that was created on a template."""
179
+ return dedent(value)
180
+
181
+ @model_validator(mode="after")
182
+ def __validate_jobs_need_and_prepare_running_id(self):
183
+ """Validate each need job in any jobs should exists."""
184
+ for job in self.jobs:
185
+ if not_exist := [
186
+ need for need in self.jobs[job].needs if need not in self.jobs
187
+ ]:
188
+ raise WorkflowException(
189
+ f"This needed jobs: {not_exist} do not exist in this "
190
+ f"workflow, {self.name!r}"
191
+ )
192
+
193
+ # NOTE: update a job id with its job id from workflow template
194
+ self.jobs[job].id = job
195
+
196
+ if self.run_id is None:
197
+ self.run_id = self.new_run_id
198
+
199
+ # VALIDATE: Validate workflow name should not dynamic with params
200
+ # template.
201
+ if has_template(self.name):
202
+ raise ValueError(
203
+ f"Workflow name should not has any template, please check, "
204
+ f"{self.name!r}."
205
+ )
206
+
207
+ return self
208
+
209
+ def get_running_id(self, run_id: str) -> Self:
210
+ """Return Workflow model object that changing workflow running ID with
211
+ an input running ID.
212
+
213
+ :param run_id: A replace workflow running ID.
214
+ :rtype: Self
215
+ """
216
+ return self.model_copy(update={"run_id": run_id})
217
+
218
+ def job(self, name: str) -> Job:
219
+ """Return Job model that exists on this workflow.
220
+
221
+ :param name: A job name that want to get from a mapping of job models.
222
+ :type name: str
223
+
224
+ :rtype: Job
225
+ :returns: A job model that exists on this workflow by input name.
226
+ """
227
+ if name not in self.jobs:
228
+ raise ValueError(
229
+ f"A Job {name!r} does not exists in this workflow, "
230
+ f"{self.name!r}"
231
+ )
232
+ return self.jobs[name]
233
+
234
+ def parameterize(self, params: DictData) -> DictData:
235
+ """Prepare parameters before passing to execution process. This method
236
+ will create jobs key to params mapping that will keep any result from
237
+ job execution.
238
+
239
+ :param params: A parameter mapping that receive from workflow execution.
240
+ :rtype: DictData
241
+ """
242
+ # VALIDATE: Incoming params should have keys that set on this workflow.
243
+ if check_key := tuple(
244
+ f"{k!r}"
245
+ for k in self.params
246
+ if (k not in params and self.params[k].required)
247
+ ):
248
+ raise WorkflowException(
249
+ f"Required Param on this workflow setting does not set: "
250
+ f"{', '.join(check_key)}."
251
+ )
252
+
253
+ # NOTE: mapping type of param before adding it to params variable.
254
+ return {
255
+ "params": (
256
+ params
257
+ | {
258
+ k: self.params[k].receive(params[k])
259
+ for k in params
260
+ if k in self.params
261
+ }
262
+ ),
263
+ "jobs": {},
264
+ }
265
+
266
+ def release(
267
+ self,
268
+ on: On,
269
+ params: DictData,
270
+ queue: list[datetime],
271
+ *,
272
+ waiting_sec: int = 60,
273
+ sleep_interval: int = 15,
274
+ log: Log = None,
275
+ ) -> Result:
276
+ """Start running workflow with the on schedule in period of 30 minutes.
277
+ That mean it will still running at background 30 minutes until the
278
+ schedule matching with its time.
279
+
280
+ This method allow workflow use log object to save the execution
281
+ result to log destination like file log to local `/logs` directory.
282
+
283
+ :param on: An on schedule value.
284
+ :param params: A workflow parameter that pass to execute method.
285
+ :param queue: A list of release time that already running.
286
+ :param waiting_sec: A second period value that allow workflow execute.
287
+ :param sleep_interval: A second value that want to waiting until time
288
+ to execute.
289
+ :param log: A log object that want to save execution result.
290
+ :rtype: Result
291
+ """
292
+ logger.debug(
293
+ f"({self.run_id}) [CORE]: {self.name!r}: {on.cronjob} : run with "
294
+ f"queue id: {id(queue)}"
295
+ )
296
+ log: Log = log or FileLog
297
+ tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
298
+ gen: CronRunner = on.generate(
299
+ datetime.now(tz=tz).replace(second=0, microsecond=0)
300
+ + timedelta(seconds=1)
301
+ )
302
+ cron_tz: ZoneInfo = gen.tz
303
+
304
+ # NOTE: get next schedule time that generate from now.
305
+ next_time: datetime = gen.next
306
+
307
+ # NOTE: get the next schedule time until it is not already pointed in the log.
308
+ while log.is_pointed(self.name, next_time, queue=queue):
309
+ next_time: datetime = gen.next
310
+
311
+ # NOTE: push this next running time to log queue
312
+ heappush(queue, next_time)
313
+
314
+ # VALIDATE: Check the different time between the next schedule time and
315
+ # now that less than waiting period (second unit).
316
+ if get_diff_sec(next_time, tz=cron_tz) > waiting_sec:
317
+ logger.debug(
318
+ f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
319
+ f"Does not closely >> {next_time:%Y-%m-%d %H:%M:%S}"
320
+ )
321
+
322
+ # NOTE: Remove next datetime from queue.
323
+ queue.remove(next_time)
324
+
325
+ time.sleep(0.15)
326
+ return Result(
327
+ status=0,
328
+ context={
329
+ "params": params,
330
+ "poking": {"skipped": [str(on.cronjob)], "run": []},
331
+ },
332
+ )
333
+
334
+ logger.debug(
335
+ f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
336
+ f"Closely to run >> {next_time:%Y-%m-%d %H:%M:%S}"
337
+ )
338
+
339
+ # NOTE: Release when the time is nearly to schedule time.
340
+ while (duration := get_diff_sec(next_time, tz=cron_tz)) > (
341
+ sleep_interval + 5
342
+ ):
343
+ logger.debug(
344
+ f"({self.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
345
+ f"Sleep until: {duration}"
346
+ )
347
+ time.sleep(sleep_interval)
348
+
349
+ time.sleep(0.5)
350
+
351
+ # NOTE: Release parameter that use to change if params has
352
+ # templating.
353
+ release_params: DictData = {
354
+ "release": {
355
+ "logical_date": next_time,
356
+ },
357
+ }
358
+
359
+ # WARNING: Re-create workflow object that use new running workflow
360
+ # ID.
361
+ runner: Self = self.get_running_id(run_id=self.new_run_id)
362
+ rs: Result = runner.execute(
363
+ params=param2template(params, release_params),
364
+ )
365
+ logger.debug(
366
+ f"({runner.run_id}) [CORE]: {self.name!r} : {on.cronjob} : "
367
+ f"End release {next_time:%Y-%m-%d %H:%M:%S}"
368
+ )
369
+
370
+ # NOTE: Delete a copied workflow instance for saving memory.
371
+ del runner
372
+
373
+ rs.set_parent_run_id(self.run_id)
374
+ rs_log: Log = log.model_validate(
375
+ {
376
+ "name": self.name,
377
+ "on": str(on.cronjob),
378
+ "release": next_time,
379
+ "context": rs.context,
380
+ "parent_run_id": rs.run_id,
381
+ "run_id": rs.run_id,
382
+ }
383
+ )
384
+ # NOTE: Saving execution result to destination of the input log object.
385
+ rs_log.save(excluded=None)
386
+
387
+ queue.remove(next_time)
388
+ time.sleep(0.05)
389
+ return Result(
390
+ status=0,
391
+ context={
392
+ "params": params,
393
+ "poking": {"skipped": [], "run": [str(on.cronjob)]},
394
+ },
395
+ )
396
+
397
+ def poke(
398
+ self,
399
+ params: DictData | None = None,
400
+ *,
401
+ log: Log | None = None,
402
+ ) -> list[Result]:
403
+ """Poke workflow with threading executor pool for executing with all its
404
+ schedules that was set on the `on` value. This method will observe its
405
+ schedule that nearing to run with the ``self.release()`` method.
406
+
407
+ :param params: A parameters that want to pass to the release method.
408
+ :param log: A log object that want to use on this poking process.
409
+ :rtype: list[Result]
410
+ """
411
+ logger.info(
412
+ f"({self.run_id}) [POKING]: Start Poking: {self.name!r} ..."
413
+ )
414
+
415
+ # NOTE: If this workflow does not set the on schedule, it will return
416
+ # an empty result.
417
+ if len(self.on) == 0:
418
+ return []
419
+
420
+ params: DictData = params or {}
421
+ queue: list[datetime] = []
422
+ results: list[Result] = []
423
+
424
+ worker: int = int(os.getenv("WORKFLOW_CORE_MAX_NUM_POKING") or "4")
425
+ with ThreadPoolExecutor(max_workers=worker) as executor:
426
+ # TODO: If I want to run infinite loop.
427
+ futures: list[Future] = []
428
+ for on in self.on:
429
+ futures.append(
430
+ executor.submit(
431
+ self.release,
432
+ on,
433
+ params=params,
434
+ log=log,
435
+ queue=queue,
436
+ )
437
+ )
438
+ delay(second=0.15)
439
+
440
+ # WARNING: This poking method does not allow to use fail-fast logic
441
+ # to catching parallel execution result.
442
+ for future in as_completed(futures):
443
+ results.append(future.result(timeout=60))
444
+
445
+ if len(queue) > 0:
446
+ logger.error(
447
+ f"({self.run_id}) [POKING]: Log Queue does empty when poking "
448
+ f"process was finishing."
449
+ )
450
+
451
+ return results
452
+
453
+ def execute_job(
454
+ self,
455
+ job: str,
456
+ params: DictData,
457
+ ) -> Result:
458
+ """Job Executor that use on workflow executor.
459
+
460
+ :param job: A job ID that want to execute.
461
+ :param params: A params that was parameterized from workflow execution.
462
+ :rtype: Result
463
+ """
464
+ # VALIDATE: check a job ID that exists in this workflow or not.
465
+ if job not in self.jobs:
466
+ raise WorkflowException(
467
+ f"The job ID: {job} does not exists on {self.name!r} workflow."
468
+ )
469
+ try:
470
+ logger.info(f"({self.run_id}) [WORKFLOW]: Start execute: {job!r}")
471
+
472
+ # IMPORTANT:
473
+ # Change any job running IDs to this workflow running ID.
474
+ job_obj: Job = self.jobs[job].get_running_id(self.run_id)
475
+ j_rs: Result = job_obj.execute(params=params)
476
+
477
+ except JobException as err:
478
+ raise WorkflowException(f"{job}: JobException: {err}") from None
479
+
480
+ return Result(
481
+ status=j_rs.status,
482
+ context={job: job_obj.set_outputs(j_rs.context)},
483
+ )
484
+
485
+ def execute(
486
+ self,
487
+ params: DictData | None = None,
488
+ *,
489
+ timeout: int = 60,
490
+ ) -> Result:
491
+ """Execute workflow with passing dynamic parameters to any jobs that
492
+ included in the workflow.
493
+
494
+ :param params: An input parameters that use on workflow execution that
495
+ will parameterize before using it.
496
+ :param timeout: A workflow execution time out in second unit that use
497
+ for limit time of execution and waiting job dependency.
498
+ :rtype: Result
499
+
500
+ See Also:
501
+ ---
502
+
503
+ The result of the execution process for each job and stage on this
504
+ workflow is kept in a dict from which all jobs and stages can be caught
505
+ out by dot annotation.
506
+
507
+ For example, when I want to use the output from a previous stage, I
508
+ can access it with the syntax:
509
+
510
+ ... ${job-name}.stages.${stage-id}.outputs.${key}
511
+
512
+ """
513
+ logger.info(f"({self.run_id}) [CORE]: Start Execute: {self.name!r} ...")
514
+ params: DictData = params or {}
515
+ ts: float = time.monotonic()
516
+
517
+ # NOTE: It should not do anything if it does not have job.
518
+ if not self.jobs:
519
+ logger.warning(
520
+ f"({self.run_id}) [WORKFLOW]: This workflow: {self.name!r} "
521
+ f"does not have any jobs"
522
+ )
523
+ return Result(status=0, context=params)
524
+
525
+ # NOTE: Create a job queue that keep the job that want to running after
526
+ # it dependency condition.
527
+ jq: Queue = Queue()
528
+ for job_id in self.jobs:
529
+ jq.put(job_id)
530
+
531
+ # NOTE: Create result context that will pass this context to any
532
+ # execution dependency.
533
+ context: DictData = self.parameterize(params)
534
+ try:
535
+ worker: int = int(os.getenv("WORKFLOW_CORE_MAX_JOB_PARALLEL", "2"))
536
+ (
537
+ self.__exec_non_threading(context, ts, jq, timeout=timeout)
538
+ if worker == 1
539
+ else self.__exec_threading(
540
+ context, ts, jq, worker=worker, timeout=timeout
541
+ )
542
+ )
543
+ return Result(status=0, context=context)
544
+ except WorkflowException as err:
545
+ context.update(
546
+ {"error_message": f"{err.__class__.__name__}: {err}"}
547
+ )
548
+ return Result(status=1, context=context)
549
+
550
+ def __exec_threading(
551
+ self,
552
+ context: DictData,
553
+ ts: float,
554
+ job_queue: Queue,
555
+ *,
556
+ worker: int = 2,
557
+ timeout: int = 600,
558
+ ) -> DictData:
559
+ """Workflow threading execution.
560
+
561
+ :param context: A workflow context data that is passed downstream.
562
+ :param ts: A start timestamp that is used to check whether the execution
563
+ time should reach the timeout.
564
+ :param timeout: A value in seconds that bounds the running time.
565
+ :param worker: A threading executor pool size.
566
+ :rtype: DictData
567
+ """
568
+ not_time_out_flag: bool = True
569
+ logger.debug(
570
+ f"({self.run_id}): [CORE]: Run {self.name} with threading job "
571
+ f"executor"
572
+ )
573
+
574
+ # IMPORTANT: The job execution can run parallel and waiting by
575
+ # needed.
576
+ with ThreadPoolExecutor(max_workers=worker) as executor:
577
+ futures: list[Future] = []
578
+
579
+ while not job_queue.empty() and (
580
+ not_time_out_flag := ((time.monotonic() - ts) < timeout)
581
+ ):
582
+ job_id: str = job_queue.get()
583
+ job: Job = self.jobs[job_id]
584
+
585
+ if any(need not in context["jobs"] for need in job.needs):
586
+ job_queue.put(job_id)
587
+ time.sleep(0.25)
588
+ continue
589
+
590
+ futures.append(
591
+ executor.submit(
592
+ self.execute_job,
593
+ job_id,
594
+ params=copy.deepcopy(context),
595
+ ),
596
+ )
597
+ job_queue.task_done()
598
+
599
+ # NOTE: Wait for all items to finish processing
600
+ job_queue.join()
601
+
602
+ for future in as_completed(futures):
603
+ if err := future.exception():
604
+ logger.error(f"{err}")
605
+ raise WorkflowException(f"{err}")
606
+
607
+ # NOTE: Update job result to workflow result.
608
+ context["jobs"].update(future.result(timeout=20).conext)
609
+
610
+ if not_time_out_flag:
611
+ return context
612
+
613
+ # NOTE: Raise timeout error.
614
+ logger.warning(
615
+ f"({self.run_id}) [WORKFLOW]: Execution of workflow, {self.name!r} "
616
+ f", was timeout"
617
+ )
618
+ raise WorkflowException(
619
+ f"Execution of workflow: {self.name} was timeout"
620
+ )
621
+
622
+ def __exec_non_threading(
623
+ self,
624
+ context: DictData,
625
+ ts: float,
626
+ job_queue: Queue,
627
+ *,
628
+ timeout: int = 600,
629
+ ) -> DictData:
630
+ """Workflow non-threading execution that use sequential job running
631
+ and waiting previous run successful.
632
+
633
+ :param context: A context workflow data that want to downstream passing.
634
+ :param ts: A start timestamp that use for checking execute time should
635
+ timeout.
636
+ :param timeout: A second value unit that bounding running time.
637
+ :rtype: DictData
638
+ """
639
+ not_time_out_flag: bool = True
640
+ logger.debug(
641
+ f"({self.run_id}) [CORE]: Run {self.name} with non-threading job "
642
+ f"executor"
643
+ )
644
+
645
+ while not job_queue.empty() and (
646
+ not_time_out_flag := ((time.monotonic() - ts) < timeout)
647
+ ):
648
+ job_id: str = job_queue.get()
649
+ job: Job = self.jobs[job_id]
650
+
651
+ # NOTE: Re-queue the job when its needed jobs have not finished yet.
652
+ if any(need not in context["jobs"] for need in job.needs):
653
+ job_queue.put(job_id)
654
+ time.sleep(0.25)
655
+ continue
656
+
657
+ # NOTE: Start job execution.
658
+ job_rs = self.execute_job(job_id, params=copy.deepcopy(context))
659
+ context["jobs"].update(job_rs.context)
660
+ job_queue.task_done()
661
+
662
+ # NOTE: Wait for all items to finish processing
663
+ job_queue.join()
664
+
665
+ if not_time_out_flag:
666
+ return context
667
+
668
+ # NOTE: Raise timeout error.
669
+ logger.warning(
670
+ f"({self.run_id}) [WORKFLOW]: Execution of workflow was timeout"
671
+ )
672
+ raise WorkflowException(
673
+ f"Execution of workflow: {self.name} was timeout"
674
+ )
675
+
676
+
677
+ class WorkflowSchedule(BaseModel):
678
+ """Workflow schedule Pydantic Model."""
679
+
680
+ name: str = Field(description="A workflow name.")
66
681
  on: list[On] = Field(
67
682
  default_factory=list,
68
683
  description="An override On instance value.",
69
684
  )
70
685
  params: DictData = Field(
71
686
  default_factory=dict,
72
- description="A parameters that want to use to pipeline execution.",
687
+ description="A parameters that want to use to workflow execution.",
73
688
  )
74
689
 
75
690
  @model_validator(mode="before")
@@ -106,7 +721,7 @@ class PipelineSchedule(BaseModel):
106
721
 
107
722
  class Schedule(BaseModel):
108
723
  """Schedule Pydantic Model that use to run with scheduler package. It does
109
- not equal the on value in Pipeline model but it use same logic to running
724
+ not equal the on value in the Workflow model, but it uses the same logic for running the
110
725
  release date with crontab interval.
111
726
  """
112
727
 
@@ -116,9 +731,9 @@ class Schedule(BaseModel):
116
731
  "A schedule description that can be string of markdown content."
117
732
  ),
118
733
  )
119
- pipelines: list[PipelineSchedule] = Field(
734
+ workflows: list[WorkflowSchedule] = Field(
120
735
  default_factory=list,
121
- description="A list of PipelineSchedule models.",
736
+ description="A list of WorkflowSchedule models.",
122
737
  )
123
738
 
124
739
  @classmethod
@@ -145,49 +760,48 @@ class Schedule(BaseModel):
145
760
  start_date: datetime,
146
761
  queue: dict[str, list[datetime]],
147
762
  running: dict[str, list[datetime]],
763
+ *,
148
764
  externals: DictData | None = None,
149
- ) -> list[PipelineTask]:
765
+ ) -> list[WorkflowTask]:
150
766
  """Generate Task from the current datetime.
151
767
 
152
768
  :param start_date: A start date that get from the workflow schedule.
153
- :param queue:
154
- :param running:
769
+ :param queue: A mapping of name and list of datetime for queue.
770
+ :param running: A mapping of name and list of datetime for running.
155
771
  :param externals: An external parameters that pass to the Loader object.
156
- :rtype: list[PipelineTask]
772
+ :rtype: list[WorkflowTask]
157
773
  """
158
774
 
159
- # NOTE: Create pair of pipeline and on.
160
- pipeline_tasks: list[PipelineTask] = []
775
+ # NOTE: Create pair of workflow and on.
776
+ workflow_tasks: list[WorkflowTask] = []
161
777
  externals: DictData = externals or {}
162
778
 
163
- for pipe in self.pipelines:
164
- pipeline: Pipeline = Pipeline.from_loader(
165
- pipe.name, externals=externals
166
- )
779
+ for wfs in self.workflows:
780
+ wf: Workflow = Workflow.from_loader(wfs.name, externals=externals)
167
781
 
168
782
  # NOTE: Create default list of release datetime.
169
- queue[pipe.name]: list[datetime] = []
170
- running[pipe.name]: list[datetime] = []
783
+ queue[wfs.name]: list[datetime] = []
784
+ running[wfs.name]: list[datetime] = []
171
785
 
172
- for on in pipeline.on:
786
+ for on in wf.on:
173
787
  on_gen = on.generate(start_date)
174
788
  next_running_date = on_gen.next
175
- while next_running_date in queue[pipe.name]:
789
+ while next_running_date in queue[wfs.name]:
176
790
  next_running_date = on_gen.next
177
791
 
178
- heappush(queue[pipe.name], next_running_date)
792
+ heappush(queue[wfs.name], next_running_date)
179
793
 
180
- pipeline_tasks.append(
181
- PipelineTask(
182
- pipeline=pipeline,
794
+ workflow_tasks.append(
795
+ WorkflowTask(
796
+ workflow=wf,
183
797
  on=on,
184
- params=pipe.params,
798
+ params=wfs.params,
185
799
  queue=queue,
186
800
  running=running,
187
801
  ),
188
802
  )
189
803
 
190
- return pipeline_tasks
804
+ return workflow_tasks
191
805
 
192
806
 
193
807
  def catch_exceptions(cancel_on_failure=False):
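The `Schedule.tasks()` changes above (the renamed `workflows` field, keyword-only `externals`, and `WorkflowTask` results) are wired up by `workflow_control()` later in this diff. A hedged sketch of that call shape, with a hypothetical schedule name and an assumed import path:

```python
# Hypothetical schedule name "schedule-example"; import path is an assumption.
from datetime import datetime
from zoneinfo import ZoneInfo

from ddeutil.workflow import Schedule, WorkflowTask

queue: dict[str, list[datetime]] = {}
running: dict[str, list[datetime]] = {}
start: datetime = datetime.now(tz=ZoneInfo("UTC")).replace(second=0, microsecond=0)

sch: Schedule = Schedule.from_loader("schedule-example", externals={})
tasks: list[WorkflowTask] = sch.tasks(start, queue, running, externals={})

for task in tasks:
    # tasks() seeds queue[name] with the next release datetimes per `on`.
    print(task.workflow.name, str(task.on.cronjob), queue[task.workflow.name])
```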
@@ -227,27 +841,27 @@ def catch_exceptions_method(cancel_on_failure=False):
227
841
 
228
842
 
229
843
  @dataclass(frozen=True)
230
- class PipelineTask:
231
- """Pipeline task dataclass that use to keep mapping data and objects for
844
+ class WorkflowTask:
845
+ """Workflow task dataclass that use to keep mapping data and objects for
232
846
  passing in multithreading task.
233
847
  """
234
848
 
235
- pipeline: Pipeline
849
+ workflow: Workflow
236
850
  on: On
237
- params: DictData
238
- queue: list[datetime]
239
- running: list[datetime]
851
+ params: DictData = field(compare=False, hash=False)
852
+ queue: list[datetime] = field(compare=False, hash=False)
853
+ running: list[datetime] = field(compare=False, hash=False)
240
854
 
241
855
  @catch_exceptions_method(cancel_on_failure=True)
242
856
  def release(self, log: Log | None = None) -> None:
243
- """Pipeline release, it will use with the same logic of
244
- `pipeline.release` method.
857
+ """Workflow release, it will use with the same logic of
858
+ `workflow.release` method.
245
859
 
246
860
  :param log: A log object.
247
861
  """
248
862
  tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
249
863
  log: Log = log or FileLog
250
- pipeline: Pipeline = self.pipeline
864
+ wf: Workflow = self.workflow
251
865
  on: On = self.on
252
866
 
253
867
  gen: CronRunner = on.generate(
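In the `WorkflowTask` hunk above, `params`, `queue`, and `running` are declared with `field(compare=False, hash=False)` on the frozen dataclass, so two tasks for the same workflow and schedule compare as one entry even when their mutable bookkeeping differs. A standalone sketch of that behaviour (not the library's class):

```python
# Standalone sketch: compare=False / hash=False keep mutable bookkeeping out
# of equality and hashing on a frozen dataclass, mirroring WorkflowTask above.
from dataclasses import dataclass, field
from datetime import datetime


@dataclass(frozen=True)
class TaskSketch:
    name: str
    cron: str
    queue: list[datetime] = field(default_factory=list, compare=False, hash=False)


a = TaskSketch("wf-example", "*/2 * * * *", queue=[datetime(2024, 1, 1, 0, 2)])
b = TaskSketch("wf-example", "*/2 * * * *")
print(a == b)              # True: queue does not participate in comparison
print(hash(a) == hash(b))  # True: nor in hashing
```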
@@ -259,40 +873,38 @@ class PipelineTask:
259
873
  next_time: datetime = gen.next
260
874
 
261
875
  # NOTE: get next utils it does not running.
262
- while log.is_pointed(
263
- pipeline.name, next_time, queue=self.running[pipeline.name]
264
- ):
876
+ while log.is_pointed(wf.name, next_time, queue=self.running[wf.name]):
265
877
  next_time: datetime = gen.next
266
878
 
267
879
  logger.debug(
268
- f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
880
+ f"({wf.run_id}) [CORE]: {wf.name!r} : {on.cronjob} : "
269
881
  f"{next_time:%Y-%m-%d %H:%M:%S}"
270
882
  )
271
- heappush(self.running[pipeline.name], next_time)
883
+ heappush(self.running[wf.name], next_time)
272
884
 
273
885
  if get_diff_sec(next_time, tz=cron_tz) > 55:
274
886
  logger.debug(
275
- f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} "
887
+ f"({wf.run_id}) [CORE]: {wf.name!r} : {on.cronjob} "
276
888
  f": Does not closely >> {next_time:%Y-%m-%d %H:%M:%S}"
277
889
  )
278
890
 
279
891
  # NOTE: Add this next running datetime that not in period to queue
280
892
  # and remove it to running.
281
- self.running[pipeline.name].remove(next_time)
282
- heappush(self.queue[pipeline.name], next_time)
893
+ self.running[wf.name].remove(next_time)
894
+ heappush(self.queue[wf.name], next_time)
283
895
 
284
896
  time.sleep(0.2)
285
897
  return
286
898
 
287
899
  logger.debug(
288
- f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
900
+ f"({wf.run_id}) [CORE]: {wf.name!r} : {on.cronjob} : "
289
901
  f"Closely to run >> {next_time:%Y-%m-%d %H:%M:%S}"
290
902
  )
291
903
 
292
904
  # NOTE: Release when the time is nearly to schedule time.
293
905
  while (duration := get_diff_sec(next_time, tz=tz)) > (15 + 5):
294
906
  logger.debug(
295
- f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} "
907
+ f"({wf.run_id}) [CORE]: {wf.name!r} : {on.cronjob} "
296
908
  f": Sleep until: {duration}"
297
909
  )
298
910
  time.sleep(15)
@@ -307,26 +919,26 @@ class PipelineTask:
307
919
  },
308
920
  }
309
921
 
310
- # WARNING: Re-create pipeline object that use new running pipeline
922
+ # WARNING: Re-create workflow object that use new running workflow
311
923
  # ID.
312
- runner: Pipeline = pipeline.get_running_id(run_id=pipeline.new_run_id)
924
+ runner: Workflow = wf.get_running_id(run_id=wf.new_run_id)
313
925
  rs: Result = runner.execute(
314
926
  params=param2template(self.params, release_params),
315
927
  )
316
928
  logger.debug(
317
- f"({runner.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
929
+ f"({runner.run_id}) [CORE]: {wf.name!r} : {on.cronjob} : "
318
930
  f"End release - {next_time:%Y-%m-%d %H:%M:%S}"
319
931
  )
320
932
 
321
933
  del runner
322
934
 
323
935
  # NOTE: Set parent ID on this result.
324
- rs.set_parent_run_id(pipeline.run_id)
936
+ rs.set_parent_run_id(wf.run_id)
325
937
 
326
938
  # NOTE: Save result to log object saving.
327
939
  rs_log: Log = log.model_validate(
328
940
  {
329
- "name": pipeline.name,
941
+ "name": wf.name,
330
942
  "on": str(on.cronjob),
331
943
  "release": next_time,
332
944
  "context": rs.context,
@@ -337,24 +949,31 @@ class PipelineTask:
337
949
  rs_log.save(excluded=None)
338
950
 
339
951
  # NOTE: remove this release date from running
340
- self.running[pipeline.name].remove(next_time)
952
+ self.running[wf.name].remove(next_time)
341
953
 
342
954
  # IMPORTANT:
343
- # Add the next running datetime to pipeline queue
955
+ # Add the next running datetime to workflow queue
344
956
  finish_time: datetime = datetime.now(tz=cron_tz).replace(
345
957
  second=0, microsecond=0
346
958
  )
347
959
  future_running_time: datetime = gen.next
348
960
  while (
349
- future_running_time in self.running[pipeline.name]
350
- or future_running_time in self.queue[pipeline.name]
961
+ future_running_time in self.running[wf.name]
962
+ or future_running_time in self.queue[wf.name]
351
963
  or future_running_time < finish_time
352
964
  ):
353
965
  future_running_time: datetime = gen.next
354
966
 
355
- heappush(self.queue[pipeline.name], future_running_time)
967
+ heappush(self.queue[wf.name], future_running_time)
356
968
  logger.debug(f"[CORE]: {'-' * 100}")
357
969
 
970
+ def __eq__(self, other):
971
+ if isinstance(other, WorkflowTask):
972
+ return (
973
+ self.workflow.name == other.workflow.name
974
+ and self.on.cronjob == other.on.cronjob
975
+ )
976
+
358
977
 
359
978
  def queue2str(queue: list[datetime]) -> Iterator[str]:
360
979
  return (f"{q:%Y-%m-%d %H:%M:%S}" for q in queue)
@@ -362,16 +981,16 @@ def queue2str(queue: list[datetime]) -> Iterator[str]:
362
981
 
363
982
  @catch_exceptions(cancel_on_failure=True)
364
983
  def workflow_task(
365
- pipeline_tasks: list[PipelineTask],
984
+ workflow_tasks: list[WorkflowTask],
366
985
  stop: datetime,
367
986
  threads: dict[str, Thread],
368
987
  ) -> CancelJob | None:
369
- """Workflow task generator that create release pair of pipeline and on to
988
+ """Workflow task generator that create release pair of workflow and on to
370
989
  the threading in background.
371
990
 
372
991
  This workflow task will start every minute at :02 second.
373
992
 
374
- :param pipeline_tasks:
993
+ :param workflow_tasks:
375
994
  :param stop:
376
995
  :param threads:
377
996
  :rtype: CancelJob | None
@@ -380,11 +999,11 @@ def workflow_task(
380
999
  start_date: datetime = datetime.now(tz=tz)
381
1000
  start_date_minute: datetime = start_date.replace(second=0, microsecond=0)
382
1001
 
383
- if start_date > stop:
1002
+ if start_date > stop.replace(tzinfo=tz):
384
1003
  logger.info("[WORKFLOW]: Stop this schedule with datetime stopper.")
385
1004
  while len(threads) > 0:
386
1005
  logger.warning(
387
- "[WORKFLOW]: Waiting pipeline release thread that still "
1006
+ "[WORKFLOW]: Waiting workflow release thread that still "
388
1007
  "running in background."
389
1008
  )
390
1009
  time.sleep(15)
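The one-line change above, `start_date > stop.replace(tzinfo=tz)`, matters because `start_date` is timezone-aware while a `stop` value without tzinfo would make the comparison raise `TypeError`. A short illustration, assuming a naive `stop` such as one built from config:

```python
# Why the tzinfo replace is needed: aware and naive datetimes cannot be ordered.
from datetime import datetime
from zoneinfo import ZoneInfo

tz = ZoneInfo("UTC")
start_date = datetime.now(tz=tz)        # aware, as in workflow_task()
stop = datetime(2024, 12, 31, 23, 59)   # naive stop value (assumed example)

try:
    print(start_date > stop)
except TypeError as err:
    print(f"naive comparison fails: {err}")

print(start_date > stop.replace(tzinfo=tz))  # both aware, comparison works
```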
@@ -392,68 +1011,68 @@ def workflow_task(
392
1011
  return CancelJob
393
1012
 
394
1013
  # IMPORTANT:
395
- # Filter pipeline & on that should to run with `pipeline_release`
1014
+ # Filter workflow & on that should to run with `workflow_release`
396
1015
  # function. It will deplicate running with different schedule value
397
1016
  # because I use current time in this condition.
398
1017
  #
399
- # For example, if a pipeline A queue has '00:02:00' time that
1018
+ # For example, if a workflow A queue has '00:02:00' time that
400
1019
  # should to run and its schedule has '*/2 * * * *' and '*/35 * * * *'.
401
1020
  # This condition will release with 2 threading job.
402
1021
  #
403
1022
  # '00:02:00' --> '*/2 * * * *' --> running
404
1023
  # --> '*/35 * * * *' --> skip
405
1024
  #
406
- for task in pipeline_tasks:
1025
+ for task in workflow_tasks:
407
1026
 
408
1027
  # NOTE: Get incoming datetime queue.
409
1028
  logger.debug(
410
- f"[WORKFLOW]: Current queue: {task.pipeline.name!r} : "
411
- f"{list(queue2str(task.queue[task.pipeline.name]))}"
1029
+ f"[WORKFLOW]: Current queue: {task.workflow.name!r} : "
1030
+ f"{list(queue2str(task.queue[task.workflow.name]))}"
412
1031
  )
413
1032
 
414
1033
  # NOTE: Create minute unit value for any scheduler datetime that
415
- # checking a pipeline task should run in this datetime.
1034
+ # checking a workflow task should run in this datetime.
416
1035
  current_running_time: datetime = start_date_minute.astimezone(
417
1036
  tz=ZoneInfo(task.on.tz)
418
1037
  )
419
1038
  if (
420
- len(task.queue[task.pipeline.name]) > 0
421
- and current_running_time != task.queue[task.pipeline.name][0]
1039
+ len(task.queue[task.workflow.name]) > 0
1040
+ and current_running_time != task.queue[task.workflow.name][0]
422
1041
  ) or (
423
1042
  task.on.next(current_running_time)
424
- != task.queue[task.pipeline.name][0]
1043
+ != task.queue[task.workflow.name][0]
425
1044
  ):
426
1045
  logger.debug(
427
1046
  f"[WORKFLOW]: Skip schedule "
428
1047
  f"{current_running_time:%Y-%m-%d %H:%M:%S} "
429
- f"for : {task.pipeline.name!r} : {task.on.cronjob}"
1048
+ f"for : {task.workflow.name!r} : {task.on.cronjob}"
430
1049
  )
431
1050
  continue
432
- elif len(task.queue[task.pipeline.name]) == 0:
1051
+ elif len(task.queue[task.workflow.name]) == 0:
433
1052
  logger.warning(
434
- f"[WORKFLOW]: Queue is empty for : {task.pipeline.name!r} : "
1053
+ f"[WORKFLOW]: Queue is empty for : {task.workflow.name!r} : "
435
1054
  f"{task.on.cronjob}"
436
1055
  )
437
1056
  continue
438
1057
 
439
1058
  # NOTE: Remove this datetime from queue.
440
- task.queue[task.pipeline.name].pop(0)
1059
+ task.queue[task.workflow.name].pop(0)
441
1060
 
442
1061
  # NOTE: Create thread name that able to tracking with observe schedule
443
1062
  # job.
444
1063
  thread_name: str = (
445
- f"{task.pipeline.name}|{str(task.on.cronjob)}|"
1064
+ f"{task.workflow.name}|{str(task.on.cronjob)}|"
446
1065
  f"{current_running_time:%Y%m%d%H%M}"
447
1066
  )
448
- pipe_thread: Thread = Thread(
1067
+ wf_thread: Thread = Thread(
449
1068
  target=task.release,
450
1069
  name=thread_name,
451
1070
  daemon=True,
452
1071
  )
453
1072
 
454
- threads[thread_name] = pipe_thread
1073
+ threads[thread_name] = wf_thread
455
1074
 
456
- pipe_thread.start()
1075
+ wf_thread.start()
457
1076
 
458
1077
  delay()
459
1078
 
@@ -468,7 +1087,7 @@ def workflow_long_running_task(threads: dict[str, Thread]) -> None:
468
1087
  :rtype: None
469
1088
  """
470
1089
  logger.debug(
471
- "[MONITOR]: Start checking long running pipeline release task."
1090
+ "[MONITOR]: Start checking long running workflow release task."
472
1091
  )
473
1092
  snapshot_threads = list(threads.keys())
474
1093
  for t_name in snapshot_threads:
@@ -485,18 +1104,25 @@ def workflow_control(
485
1104
  ) -> list[str]:
486
1105
  """Workflow scheduler control.
487
1106
 
488
- :param schedules: A list of pipeline names that want to schedule running.
1107
+ :param schedules: A list of workflow names that want to schedule running.
489
1108
  :param stop: An datetime value that use to stop running schedule.
490
1109
  :param externals: An external parameters that pass to Loader.
491
1110
  :rtype: list[str]
492
1111
  """
1112
+ try:
1113
+ from schedule import Scheduler
1114
+ except ImportError:
1115
+ raise ImportError(
1116
+ "Should install schedule package before use this module."
1117
+ ) from None
1118
+
493
1119
  tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
494
1120
  schedule: Scheduler = Scheduler()
495
1121
  start_date: datetime = datetime.now(tz=tz)
496
1122
 
497
1123
  # NOTE: Design workflow queue caching.
498
1124
  # ---
499
- # {"pipeline-name": [<release-datetime>, <release-datetime>, ...]}
1125
+ # {"workflow-name": [<release-datetime>, <release-datetime>, ...]}
500
1126
  #
501
1127
  wf_queue: dict[str, list[datetime]] = {}
502
1128
  wf_running: dict[str, list[datetime]] = {}
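The comment above documents the queue cache shape as `{"workflow-name": [<release-datetime>, ...]}`. A small sketch of how those per-workflow lists behave once `heappush` keeps them ordered, so `workflow_task()` can peek the earliest entry at index 0 (the workflow name is hypothetical):

```python
# Sketch of the per-workflow release queue cache used above.
from datetime import datetime, timedelta
from heapq import heappush

wf_queue: dict[str, list[datetime]] = {"wf-example": []}
base = datetime(2024, 1, 1, 0, 0)

for minutes in (4, 2, 6):  # pushed out of order on purpose
    heappush(wf_queue["wf-example"], base + timedelta(minutes=minutes))

# heappush keeps the earliest release datetime at index 0, which is what
# workflow_task() compares against the current minute before popping it.
print(wf_queue["wf-example"][0])      # 2024-01-01 00:02:00
print(wf_queue["wf-example"].pop(0))  # same pop(0) the real code performs
```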
@@ -506,18 +1132,20 @@ def workflow_control(
506
1132
  second=0, microsecond=0
507
1133
  )
508
1134
 
509
- # NOTE: Create pair of pipeline and on from schedule model.
510
- pipeline_tasks: list[PipelineTask] = []
1135
+ # NOTE: Create pair of workflow and on from schedule model.
1136
+ workflow_tasks: list[WorkflowTask] = []
511
1137
  for name in schedules:
512
1138
  sch: Schedule = Schedule.from_loader(name, externals=externals)
513
- pipeline_tasks.extend(
514
- sch.tasks(start_date_waiting, wf_queue, wf_running, externals)
1139
+ workflow_tasks.extend(
1140
+ sch.tasks(
1141
+ start_date_waiting, wf_queue, wf_running, externals=externals
1142
+ ),
515
1143
  )
516
1144
 
517
1145
  # NOTE: This schedule job will start every minute at :02 seconds.
518
1146
  schedule.every(1).minutes.at(":02").do(
519
1147
  workflow_task,
520
- pipeline_tasks=pipeline_tasks,
1148
+ workflow_tasks=workflow_tasks,
521
1149
  stop=stop
522
1150
  or (
523
1151
  start_date
@@ -545,7 +1173,7 @@ def workflow_control(
545
1173
  if not schedule.get_jobs("control"):
546
1174
  schedule.clear("monitor")
547
1175
  logger.warning(
548
- f"[WORKFLOW]: Pipeline release thread: {thread_releases}"
1176
+ f"[WORKFLOW]: Workflow release thread: {thread_releases}"
549
1177
  )
550
1178
  logger.warning("[WORKFLOW]: Does not have any schedule jobs !!!")
551
1179
  break
@@ -559,33 +1187,33 @@ def workflow_control(
559
1187
  return schedules
560
1188
 
561
1189
 
562
- def workflow(
1190
+ def workflow_runner(
563
1191
  stop: datetime | None = None,
564
1192
  externals: DictData | None = None,
565
1193
  excluded: list[str] | None = None,
566
1194
  ) -> list[str]:
567
1195
  """Workflow application that running multiprocessing schedule with chunk of
568
- pipelines that exists in config path.
1196
+ workflows that exist in the config path.
569
1197
 
570
1198
  :param stop:
571
1199
  :param excluded:
572
1200
  :param externals:
573
1201
  :rtype: list[str]
574
1202
 
575
- This function will get all pipelines that include on value that was
1203
+ This function will get all workflows that include an on value that was
576
1204
  created in config path and chuck it with WORKFLOW_APP_SCHEDULE_PER_PROCESS
577
1205
  value to multiprocess executor pool.
578
1206
 
579
1207
  The current workflow logic:
580
1208
  ---
581
1209
  PIPELINES ==> process 01 ==> schedule 1 minute --> thread of release
582
- pipeline task 01 01
1210
+ workflow task 01 01
583
1211
  --> thread of release
584
- pipeline task 01 02
1212
+ workflow task 01 02
585
1213
  ==> process 02 ==> schedule 1 minute --> thread of release
586
- pipeline task 02 01
1214
+ workflow task 02 01
587
1215
  --> thread of release
588
- pipeline task 02 02
1216
+ workflow task 02 02
589
1217
  ==> ...
590
1218
  """
591
1219
  excluded: list[str] = excluded or []
@@ -613,8 +1241,3 @@ def workflow(
613
1241
  raise WorkflowException(str(err)) from err
614
1242
  results.extend(future.result(timeout=1))
615
1243
  return results
616
-
617
-
618
- if __name__ == "__main__":
619
- workflow_rs: list[str] = workflow()
620
- logger.info(f"Application run success: {workflow_rs}")