ddeutil-workflow 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,708 +6,615 @@
6
6
  from __future__ import annotations
7
7
 
8
8
  import copy
9
+ import json
10
+ import logging
11
+ import os
12
+ import time
9
13
  from collections.abc import Iterator
10
- from dataclasses import dataclass, field
14
+ from concurrent.futures import Future, ProcessPoolExecutor, as_completed
15
+ from dataclasses import dataclass
11
16
  from datetime import datetime, timedelta
12
- from functools import partial, total_ordering
13
- from typing import Callable, Optional, Union
14
- from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
15
-
16
- from ddeutil.core import (
17
- is_int,
18
- isinstance_check,
19
- must_split,
17
+ from functools import wraps
18
+ from heapq import heappush
19
+ from threading import Thread
20
+ from typing import Optional
21
+ from zoneinfo import ZoneInfo
22
+
23
+ from dotenv import load_dotenv
24
+ from pydantic import BaseModel, Field
25
+ from pydantic.functional_validators import model_validator
26
+ from typing_extensions import Self
27
+
28
+ try:
29
+ from schedule import CancelJob, Scheduler
30
+ except ImportError:
31
+ raise ImportError(
32
+ "Should install schedule package before use this module."
33
+ ) from None
34
+
35
+ from .__types import DictData
36
+ from .cron import CronRunner
37
+ from .exceptions import WorkflowException
38
+ from .log import FileLog, Log, get_logger
39
+ from .on import On
40
+ from .pipeline import Pipeline
41
+ from .utils import (
42
+ Loader,
43
+ Result,
44
+ batch,
45
+ delay,
46
+ get_diff_sec,
47
+ param2template,
20
48
  )
21
- from ddeutil.core.dtutils import (
22
- next_date,
23
- replace_date,
49
+
50
+ load_dotenv()
51
+ logger = get_logger("ddeutil.workflow")
52
+ logging.getLogger("schedule").setLevel(logging.INFO)
53
+
54
+
55
+ __all__ = (
56
+ "PipelineSchedule",
57
+ "Schedule",
58
+ "workflow",
24
59
  )
25
60
 
26
- WEEKDAYS: dict[str, int] = {
27
- "Sun": 0,
28
- "Mon": 1,
29
- "Tue": 2,
30
- "Wed": 3,
31
- "Thu": 4,
32
- "Fri": 5,
33
- "Sat": 6,
34
- }
35
61
 
62
+ class PipelineSchedule(BaseModel):
63
+ """Pipeline schedule Pydantic Model."""
36
64
 
37
- @dataclass(frozen=True)
38
- class Unit:
39
- name: str
40
- range: partial
41
- min: int
42
- max: int
43
- alt: list[str] = field(default_factory=list)
44
-
45
- def __repr__(self) -> str:
46
- return (
47
- f"{self.__class__}(name={self.name!r}, range={self.range},"
48
- f"min={self.min}, max={self.max}"
49
- f"{f', alt={self.alt}' if self.alt else ''})"
50
- )
65
+ name: str = Field(description="A pipeline name.")
66
+ on: list[On] = Field(
67
+ default_factory=list,
68
+ description="An override On instance value.",
69
+ )
70
+ params: DictData = Field(
71
+ default_factory=dict,
72
+ description="A parameters that want to use to pipeline execution.",
73
+ )
51
74
 
75
+ @model_validator(mode="before")
76
+ def __prepare__values(cls, values: DictData) -> DictData:
77
+ """Prepare incoming values before validating with model fields."""
52
78
 
53
- @dataclass
54
- class Options:
55
- output_weekday_names: bool = False
56
- output_month_names: bool = False
57
- output_hashes: bool = False
58
-
59
-
60
- CRON_UNITS: tuple[Unit, ...] = (
61
- Unit(
62
- name="minute",
63
- range=partial(range, 0, 60),
64
- min=0,
65
- max=59,
66
- ),
67
- Unit(
68
- name="hour",
69
- range=partial(range, 0, 24),
70
- min=0,
71
- max=23,
72
- ),
73
- Unit(
74
- name="day",
75
- range=partial(range, 1, 32),
76
- min=1,
77
- max=31,
78
- ),
79
- Unit(
80
- name="month",
81
- range=partial(range, 1, 13),
82
- min=1,
83
- max=12,
84
- alt=[
85
- "JAN",
86
- "FEB",
87
- "MAR",
88
- "APR",
89
- "MAY",
90
- "JUN",
91
- "JUL",
92
- "AUG",
93
- "SEP",
94
- "OCT",
95
- "NOV",
96
- "DEC",
97
- ],
98
- ),
99
- Unit(
100
- name="weekday",
101
- range=partial(range, 0, 7),
102
- min=0,
103
- max=6,
104
- alt=[
105
- "SUN",
106
- "MON",
107
- "TUE",
108
- "WED",
109
- "THU",
110
- "FRI",
111
- "SAT",
112
- ],
113
- ),
114
- )
79
+ values["name"] = values["name"].replace(" ", "_")
115
80
 
116
- CRON_UNITS_YEAR: tuple[Unit, ...] = CRON_UNITS + (
117
- Unit(
118
- name="year",
119
- range=partial(range, 1990, 2101),
120
- min=1990,
121
- max=2100,
122
- ),
123
- )
81
+ cls.__bypass_on(values)
82
+ return values
83
+
84
+ @classmethod
85
+ def __bypass_on(cls, data: DictData, externals: DictData | None = None):
86
+ """Bypass the on data to loaded config data."""
87
+ if on := data.pop("on", []):
88
+
89
+ if isinstance(on, str):
90
+ on = [on]
124
91
 
92
+ if any(not isinstance(n, (dict, str)) for n in on):
93
+ raise TypeError("The ``on`` key should be list of str or dict")
125
94
 
126
- @total_ordering
127
- class CronPart:
128
- """Part of Cron object that represent a collection of positive integers."""
95
+ # NOTE: Pass on value to Loader and keep on model object to on field
96
+ data["on"] = [
97
+ (
98
+ Loader(n, externals=(externals or {})).data
99
+ if isinstance(n, str)
100
+ else n
101
+ )
102
+ for n in on
103
+ ]
104
+ return data
129
105
 
130
- __slots__: tuple[str, ...] = (
131
- "unit",
132
- "options",
133
- "values",
106
+
107
+ class Schedule(BaseModel):
108
+ """Schedule Pydantic Model that is used to run with the scheduler package.
109
+ It is not equal to the on value in the Pipeline model, but it uses the same
110
+ logic for running release dates with a crontab interval.
111
+ """
112
+
113
+ desc: Optional[str] = Field(
114
+ default=None,
115
+ description=(
116
+ "A schedule description that can be string of markdown content."
117
+ ),
118
+ )
119
+ pipelines: list[PipelineSchedule] = Field(
120
+ default_factory=list,
121
+ description="A list of PipelineSchedule models.",
134
122
  )
135
123
 
136
- def __init__(
137
- self,
138
- unit: Unit,
139
- values: str | list[int],
140
- options: Options,
141
- ) -> None:
142
- self.unit: Unit = unit
143
- self.options: Options = options
144
-
145
- if isinstance(values, str):
146
- values: list[int] = self.from_str(values) if values != "?" else []
147
- elif isinstance_check(values, list[int]):
148
- values: list[int] = self.replace_weekday(values)
149
- else:
150
- raise TypeError(f"Invalid type of value in cron part: {values}.")
151
-
152
- self.values: list[int] = self.out_of_range(
153
- sorted(dict.fromkeys(values))
154
- )
124
+ @classmethod
125
+ def from_loader(
126
+ cls,
127
+ name: str,
128
+ externals: DictData | None = None,
129
+ ) -> Self:
130
+ loader: Loader = Loader(name, externals=(externals or {}))
155
131
 
156
- def __str__(self) -> str:
157
- """Generate String value from part of cronjob."""
158
- _hash: str = "H" if self.options.output_hashes else "*"
132
+ # NOTE: Validate the config type match with current connection model
133
+ if loader.type != cls:
134
+ raise ValueError(f"Type {loader.type} does not match with {cls}")
159
135
 
160
- if self.is_full:
161
- return _hash
136
+ loader_data: DictData = copy.deepcopy(loader.data)
162
137
 
163
- if self.is_interval:
164
- if self.is_full_interval:
165
- return f"{_hash}/{self.step}"
166
- _hash: str = (
167
- f"H({self.filler(self.min)}-{self.filler(self.max)})"
168
- if _hash == "H"
169
- else f"{self.filler(self.min)}-{self.filler(self.max)}"
170
- )
171
- return f"{_hash}/{self.step}"
138
+ # NOTE: Add name to loader data
139
+ loader_data["name"] = name.replace(" ", "_")
172
140
 
173
- cron_range_strings: list[str] = []
174
- for cron_range in self.ranges():
175
- if isinstance(cron_range, list):
176
- cron_range_strings.append(
177
- f"{self.filler(cron_range[0])}-{self.filler(cron_range[1])}"
178
- )
179
- else:
180
- cron_range_strings.append(f"{self.filler(cron_range)}")
181
- return ",".join(cron_range_strings) if cron_range_strings else "?"
182
-
183
- def __repr__(self) -> str:
184
- return (
185
- f"{self.__class__.__name__}"
186
- f"(unit={self.unit}, values={self.__str__()!r})"
187
- )
141
+ return cls.model_validate(obj=loader_data)
188
142
 
189
- def __lt__(self, other) -> bool:
190
- if isinstance(other, CronPart):
191
- return self.values < other.values
192
- elif isinstance(other, list):
193
- return self.values < other
194
-
195
- def __eq__(self, other) -> bool:
196
- if isinstance(other, CronPart):
197
- return self.values == other.values
198
- elif isinstance(other, list):
199
- return self.values == other
200
-
201
- @property
202
- def min(self) -> int:
203
- """Returns the smallest value in the range."""
204
- return self.values[0]
205
-
206
- @property
207
- def max(self) -> int:
208
- """Returns the largest value in the range."""
209
- return self.values[-1]
210
-
211
- @property
212
- def step(self) -> Optional[int]:
213
- """Returns the difference between first and second elements in the
214
- range.
143
+ def tasks(
144
+ self,
145
+ start_date: datetime,
146
+ queue: dict[str, list[datetime]],
147
+ running: dict[str, list[datetime]],
148
+ externals: DictData | None = None,
149
+ ) -> list[PipelineTask]:
150
+ """Generate Task from the current datetime.
151
+
152
+ :param start_date: A start date that get from the workflow schedule.
153
+ :param queue:
154
+ :param running:
155
+ :param externals: An external parameters that pass to the Loader object.
156
+ :rtype: list[PipelineTask]
215
157
  """
216
- if (
217
- len(self.values) > 2
218
- and (step := self.values[1] - self.values[0]) > 1
219
- ):
220
- return step
221
158
 
222
- @property
223
- def is_full(self) -> bool:
224
- """Returns true if range has all the values of the unit."""
225
- return len(self.values) == (self.unit.max - self.unit.min + 1)
159
+ # NOTE: Create pair of pipeline and on.
160
+ pipeline_tasks: list[PipelineTask] = []
161
+ externals: DictData = externals or {}
226
162
 
227
- def from_str(self, value: str) -> tuple[int, ...]:
228
- """Parses a string as a range of positive integers. The string should
229
- include only `-` and `,` special strings.
163
+ for pipe in self.pipelines:
164
+ pipeline: Pipeline = Pipeline.from_loader(
165
+ pipe.name, externals=externals
166
+ )
230
167
 
231
- :param value: A string value that want to parse
232
- :type value: str
168
+ # NOTE: Create default list of release datetime.
169
+ queue[pipe.name]: list[datetime] = []
170
+ running[pipe.name]: list[datetime] = []
171
+
172
+ for on in pipeline.on:
173
+ on_gen = on.generate(start_date)
174
+ next_running_date = on_gen.next
175
+ while next_running_date in queue[pipe.name]:
176
+ next_running_date = on_gen.next
177
+
178
+ heappush(queue[pipe.name], next_running_date)
179
+
180
+ pipeline_tasks.append(
181
+ PipelineTask(
182
+ pipeline=pipeline,
183
+ on=on,
184
+ params=pipe.params,
185
+ queue=queue,
186
+ running=running,
187
+ ),
188
+ )
233
189
 
234
- TODO: support for `L`, `W`, and `#`
235
- TODO: if you didn't care what day of the week the 7th was, you
236
- could enter ? in the Day-of-week field.
237
- TODO: L : the Day-of-month or Day-of-week fields specifies the last day
238
- of the month or week.
239
- DEV: use -1 for represent with L
240
- TODO: W : In the Day-of-month field, 3W specifies the weekday closest
241
- to the third day of the month.
242
- TODO: # : 3#2 would be the second Tuesday of the month,
243
- the 3 refers to Tuesday because it is the third day of each week.
190
+ return pipeline_tasks
244
191
 
245
- Noted:
246
- - 0 10 * * ? *
247
- Run at 10:00 am (UTC) every day
248
192
 
249
- - 15 12 * * ? *
250
- Run at 12:15 pm (UTC) every day
193
+ def catch_exceptions(cancel_on_failure=False):
194
+ """Catch exception error from scheduler job."""
251
195
 
252
- - 0 18 ? * MON-FRI *
253
- Run at 6:00 pm (UTC) every Monday through Friday
196
+ def catch_exceptions_decorator(func):
197
+ @wraps(func)
198
+ def wrapper(*args, **kwargs):
199
+ try:
200
+ return func(*args, **kwargs)
201
+ except Exception as err:
202
+ logger.exception(err)
203
+ if cancel_on_failure:
204
+ return CancelJob
254
205
 
255
- - 0 8 1 * ? *
256
- Run at 8:00 am (UTC) every 1st day of the month
206
+ return wrapper
257
207
 
258
- - 0/15 * * * ? *
259
- Run every 15 minutes
208
+ return catch_exceptions_decorator
260
209
 
261
- - 0/10 * ? * MON-FRI *
262
- Run every 10 minutes Monday through Friday
263
210
 
264
- - 0/5 8-17 ? * MON-FRI *
265
- Run every 5 minutes Monday through Friday between 8:00 am and
266
- 5:55 pm (UTC)
211
+ def catch_exceptions_method(cancel_on_failure=False):
212
+ """Catch exception error from scheduler job."""
267
213
 
268
- - 5,35 14 * * ? *
269
- Run every day, at 5 and 35 minutes past 2:00 pm (UTC)
214
+ def catch_exceptions_decorator(func):
215
+ @wraps(func)
216
+ def wrapper(self, *args, **kwargs):
217
+ try:
218
+ return func(self, *args, **kwargs)
219
+ except Exception as err:
220
+ logger.exception(err)
221
+ if cancel_on_failure:
222
+ return CancelJob
270
223
 
271
- - 15 10 ? * 6L 2002-2005
272
- Run at 10:15am UTC on the last Friday of each month during the
273
- years 2002 to 2005
224
+ return wrapper
274
225
 
275
- :rtype: tuple[int, ...]
276
- """
277
- interval_list: list[list[int]] = []
278
- for _value in self.replace_alternative(value.upper()).split(","):
279
- if _value == "?":
280
- continue
281
- elif _value.count("/") > 1:
282
- raise ValueError(
283
- f"Invalid value {_value!r} in cron part {value!r}"
284
- )
226
+ return catch_exceptions_decorator
285
227
 
286
- value_range, value_step = must_split(_value, "/", maxsplit=1)
287
- value_range_list: list[int] = self.out_of_range(
288
- self._parse_range(value_range)
289
- )
290
-
291
- if (value_step and not is_int(value_step)) or value_step == "":
292
- raise ValueError(
293
- f"Invalid interval step value {value_step!r} for "
294
- f"{self.unit.name!r}"
295
- )
296
228
 
297
- interval_list.append(self._interval(value_range_list, value_step))
298
- return tuple(item for sublist in interval_list for item in sublist)
229
+ @dataclass(frozen=True)
230
+ class PipelineTask:
231
+ """Pipeline task dataclass that is used to keep mapping data and objects for
232
+ passing in multithreading task.
233
+ """
299
234
 
300
- def replace_alternative(self, value: str) -> str:
301
- """Replaces the alternative representations of numbers in a string."""
302
- for i, alt in enumerate(self.unit.alt):
303
- if alt in value:
304
- value: str = value.replace(alt, str(self.unit.min + i))
305
- return value
235
+ pipeline: Pipeline
236
+ on: On
237
+ params: DictData
238
+ queue: list[datetime]
239
+ running: list[datetime]
306
240
 
307
- def replace_weekday(self, values: list[int] | Iterator[int]) -> list[int]:
308
- """Replaces all 7 with 0 as Sunday can be represented by both.
241
+ @catch_exceptions_method(cancel_on_failure=True)
242
+ def release(self, log: Log | None = None) -> None:
243
+ """Pipeline release, it will use with the same logic of
244
+ `pipeline.release` method.
309
245
 
310
- :param values: list or iter of int that want to mode by 7
311
- :rtype: list[int]
246
+ :param log: A log object.
312
247
  """
313
- if self.unit.name == "weekday":
314
- # NOTE: change weekday value in range 0-6 (div-mod by 7).
315
- return [value % 7 for value in values]
316
- return list(values)
248
+ tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
249
+ log: Log = log or FileLog
250
+ pipeline: Pipeline = self.pipeline
251
+ on: On = self.on
317
252
 
318
- def out_of_range(self, values: list[int]) -> list[int]:
319
- """Raise ValueError if a value out of range was found, otherwise return values.
253
+ gen: CronRunner = on.generate(
254
+ datetime.now(tz=tz).replace(second=0, microsecond=0)
255
+ )
256
+ cron_tz: ZoneInfo = gen.tz
320
257
 
321
- :param values: A list of int value
322
- :type values: list[int]
258
+ # NOTE: get next schedule time that generate from now.
259
+ next_time: datetime = gen.next
323
260
 
324
- :rtype: list[int]
325
- """
326
- if values:
327
- if (first := values[0]) < self.unit.min:
328
- raise ValueError(
329
- f"Value {first!r} out of range for {self.unit.name!r}"
330
- )
331
- elif (last := values[-1]) > self.unit.max:
332
- raise ValueError(
333
- f"Value {last!r} out of range for {self.unit.name!r}"
334
- )
335
- return values
261
+ # NOTE: get next until it is not running.
262
+ while log.is_pointed(
263
+ pipeline.name, next_time, queue=self.running[pipeline.name]
264
+ ):
265
+ next_time: datetime = gen.next
336
266
 
337
- def _parse_range(self, value: str) -> list[int]:
338
- """Parses a range string."""
339
- if value == "*":
340
- return list(self.unit.range())
341
- elif value.count("-") > 1:
342
- raise ValueError(f"Invalid value {value}")
343
- try:
344
- sub_parts: list[int] = list(map(int, value.split("-")))
345
- except ValueError as exc:
346
- raise ValueError(f"Invalid value {value!r} --> {exc}") from exc
347
-
348
- if len(sub_parts) == 2:
349
- min_value, max_value = sub_parts
350
- if max_value < min_value:
351
- raise ValueError(f"Max range is less than min range in {value}")
352
- sub_parts: list[int] = list(range(min_value, max_value + 1))
353
- return self.replace_weekday(sub_parts)
354
-
355
- def _interval(
356
- self,
357
- values: list[int],
358
- step: int | None = None,
359
- ) -> list[int]:
360
- """Applies an interval step to a collection of values."""
361
- if not step:
362
- return values
363
- elif (_step := int(step)) < 1:
364
- raise ValueError(
365
- f"Invalid interval step value {_step!r} for "
366
- f"{self.unit.name!r}"
367
- )
368
- min_value: int = values[0]
369
- return [
370
- value
371
- for value in values
372
- if (value % _step == min_value % _step) or (value == min_value)
373
- ]
267
+ logger.debug(
268
+ f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
269
+ f"{next_time:%Y-%m-%d %H:%M:%S}"
270
+ )
271
+ heappush(self.running[pipeline.name], next_time)
374
272
 
375
- @property
376
- def is_interval(self) -> bool:
377
- """Returns true if the range can be represented as an interval."""
378
- if not (step := self.step):
379
- return False
380
- for idx, value in enumerate(self.values):
381
- if idx == 0:
382
- continue
383
- elif (value - self.values[idx - 1]) != step:
384
- return False
385
- return True
386
-
387
- @property
388
- def is_full_interval(self) -> bool:
389
- """Returns true if the range contains all the interval values."""
390
- if step := self.step:
391
- return (
392
- self.min == self.unit.min
393
- and (self.max + step) > self.unit.max
394
- and (
395
- len(self.values)
396
- == (round((self.max - self.min) / step) + 1)
397
- )
273
+ if get_diff_sec(next_time, tz=cron_tz) > 55:
274
+ logger.debug(
275
+ f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} "
276
+ f": Does not closely >> {next_time:%Y-%m-%d %H:%M:%S}"
398
277
  )
399
- return False
400
278
 
401
- def ranges(self) -> list[Union[int, list[int]]]:
402
- """Returns the range as an array of ranges defined as arrays of
403
- positive integers.
279
+ # NOTE: Add this next running datetime that not in period to queue
280
+ # and remove it from running.
281
+ self.running[pipeline.name].remove(next_time)
282
+ heappush(self.queue[pipeline.name], next_time)
404
283
 
405
- :rtype: list[Union[int, list[int]]]
406
- """
407
- multi_dim_values: list[Union[int, list[int]]] = []
408
- start_number: Optional[int] = None
409
- for idx, value in enumerate(self.values):
410
- try:
411
- next_value: int = self.values[idx + 1]
412
- except IndexError:
413
- next_value: int = -1
414
- if value != (next_value - 1):
415
- # NOTE: ``next_value`` is not the subsequent number
416
- if start_number is None:
417
- # NOTE:
418
- # The last number of the list ``self.values`` is not in a
419
- # range.
420
- multi_dim_values.append(value)
421
- else:
422
- multi_dim_values.append([start_number, value])
423
- start_number: Optional[int] = None
424
- elif start_number is None:
425
- start_number: Optional[int] = value
426
- return multi_dim_values
427
-
428
- def filler(self, value: int) -> int | str:
429
- """Formats weekday and month names as string when the relevant options
430
- are set.
431
-
432
- :param value: a int value
433
- :type value: int
434
-
435
- :rtype: int | str
436
- """
437
- return (
438
- self.unit.alt[value - self.unit.min]
439
- if (
440
- (
441
- self.options.output_weekday_names
442
- and self.unit.name == "weekday"
443
- )
444
- or (
445
- self.options.output_month_names
446
- and self.unit.name == "month"
447
- )
448
- )
449
- else value
284
+ time.sleep(0.2)
285
+ return
286
+
287
+ logger.debug(
288
+ f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
289
+ f"Closely to run >> {next_time:%Y-%m-%d %H:%M:%S}"
450
290
  )
451
291
 
292
+ # NOTE: Release when the time is nearly to schedule time.
293
+ while (duration := get_diff_sec(next_time, tz=tz)) > (15 + 5):
294
+ logger.debug(
295
+ f"({pipeline.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} "
296
+ f": Sleep until: {duration}"
297
+ )
298
+ time.sleep(15)
452
299
 
453
- @total_ordering
454
- class CronJob:
455
- """The Cron Job Converter object that generate datetime dimension of cron
456
- job schedule format,
300
+ time.sleep(0.5)
457
301
 
458
- ... * * * * * <command to execute>
302
+ # NOTE: Release parameter that use to change if params has
303
+ # templating.
304
+ release_params: DictData = {
305
+ "release": {
306
+ "logical_date": next_time,
307
+ },
308
+ }
459
309
 
460
- (i) minute (0 - 59)
461
- (ii) hour (0 - 23)
462
- (iii) day of the month (1 - 31)
463
- (iv) month (1 - 12)
464
- (v) day of the week (0 - 6) (Sunday to Saturday; 7 is also Sunday
465
- on some systems)
310
+ # WARNING: Re-create pipeline object that uses a new running pipeline
311
+ # ID.
312
+ runner: Pipeline = pipeline.get_running_id(run_id=pipeline.new_run_id)
313
+ rs: Result = runner.execute(
314
+ params=param2template(self.params, release_params),
315
+ )
316
+ logger.debug(
317
+ f"({runner.run_id}) [CORE]: {pipeline.name!r} : {on.cronjob} : "
318
+ f"End release - {next_time:%Y-%m-%d %H:%M:%S}"
319
+ )
466
320
 
467
- This object implement necessary methods and properties for using cron
468
- job value with other object like Schedule.
469
- Support special value with `/`, `*`, `-`, `,`, and `?` (in day of month
470
- and day of week value).
321
+ del runner
322
+
323
+ # NOTE: Set parent ID on this result.
324
+ rs.set_parent_run_id(pipeline.run_id)
325
+
326
+ # NOTE: Save result to log object saving.
327
+ rs_log: Log = log.model_validate(
328
+ {
329
+ "name": pipeline.name,
330
+ "on": str(on.cronjob),
331
+ "release": next_time,
332
+ "context": rs.context,
333
+ "parent_run_id": rs.run_id,
334
+ "run_id": rs.run_id,
335
+ }
336
+ )
337
+ rs_log.save(excluded=None)
471
338
 
472
- References:
473
- - https://github.com/Sonic0/cron-converter
474
- - https://pypi.org/project/python-crontab/
475
- """
339
+ # NOTE: remove this release date from running
340
+ self.running[pipeline.name].remove(next_time)
476
341
 
477
- cron_length: int = 5
478
- cron_units: tuple[Unit, ...] = CRON_UNITS
342
+ # IMPORTANT:
343
+ # Add the next running datetime to pipeline queue
344
+ finish_time: datetime = datetime.now(tz=cron_tz).replace(
345
+ second=0, microsecond=0
346
+ )
347
+ future_running_time: datetime = gen.next
348
+ while (
349
+ future_running_time in self.running[pipeline.name]
350
+ or future_running_time in self.queue[pipeline.name]
351
+ or future_running_time < finish_time
352
+ ):
353
+ future_running_time: datetime = gen.next
479
354
 
480
- def __init__(
481
- self,
482
- value: Union[list[list[int]], str],
483
- *,
484
- option: Optional[dict[str, bool]] = None,
485
- ) -> None:
486
- if isinstance(value, str):
487
- value: list[str] = value.strip().split()
488
- elif not isinstance_check(value, list[list[int]]):
489
- raise TypeError(
490
- f"{self.__class__.__name__} cron value does not support "
491
- f"type: {type(value)}."
492
- )
355
+ heappush(self.queue[pipeline.name], future_running_time)
356
+ logger.debug(f"[CORE]: {'-' * 100}")
493
357
 
494
- # NOTE: Validate length of crontab of this class.
495
- if len(value) != self.cron_length:
496
- raise ValueError(
497
- f"Invalid cron value does not have length equal "
498
- f"{self.cron_length}: {value}."
499
- )
500
- self.options: Options = Options(**(option or {}))
501
358
 
502
- # NOTE: Start initial crontab for each part
503
- self.parts: list[CronPart] = [
504
- CronPart(unit, values=item, options=self.options)
505
- for item, unit in zip(value, self.cron_units)
506
- ]
359
+ def queue2str(queue: list[datetime]) -> Iterator[str]:
360
+ return (f"{q:%Y-%m-%d %H:%M:%S}" for q in queue)
507
361
 
508
- # NOTE: Validate values of `day` and `dow` from parts.
509
- if self.day == self.dow == []:
510
- raise ValueError(
511
- "Invalid cron value when set the `?` on day of month and "
512
- "day of week together"
513
- )
514
362
 
515
- def __str__(self) -> str:
516
- """Return joining with space of each value in parts."""
517
- return " ".join(str(part) for part in self.parts)
363
+ @catch_exceptions(cancel_on_failure=True)
364
+ def workflow_task(
365
+ pipeline_tasks: list[PipelineTask],
366
+ stop: datetime,
367
+ threads: dict[str, Thread],
368
+ ) -> CancelJob | None:
369
+ """Workflow task generator that creates release pair of pipeline and on to
370
+ the threading in background.
518
371
 
519
- def __repr__(self) -> str:
520
- return (
521
- f"{self.__class__.__name__}(value={self.__str__()!r}, "
522
- f"option={self.options.__dict__})"
523
- )
372
+ This workflow task will start every minute at :02 second.
524
373
 
525
- def __lt__(self, other) -> bool:
526
- return any(
527
- part < other_part
528
- for part, other_part in zip(self.parts_order, other.parts_order)
374
+ :param pipeline_tasks:
375
+ :param stop:
376
+ :param threads:
377
+ :rtype: CancelJob | None
378
+ """
379
+ tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
380
+ start_date: datetime = datetime.now(tz=tz)
381
+ start_date_minute: datetime = start_date.replace(second=0, microsecond=0)
382
+
383
+ if start_date > stop:
384
+ logger.info("[WORKFLOW]: Stop this schedule with datetime stopper.")
385
+ while len(threads) > 0:
386
+ logger.warning(
387
+ "[WORKFLOW]: Waiting pipeline release thread that still "
388
+ "running in background."
389
+ )
390
+ time.sleep(15)
391
+ workflow_long_running_task(threads)
392
+ return CancelJob
393
+
394
+ # IMPORTANT:
395
+ # Filter pipeline & on that should run with `pipeline_release`
396
+ # function. It will duplicate running with different schedule value
397
+ # because I use current time in this condition.
398
+ #
399
+ # For example, if a pipeline A queue has '00:02:00' time that
400
+ # should run and its schedule has '*/2 * * * *' and '*/35 * * * *'.
401
+ # This condition will release with 2 threading job.
402
+ #
403
+ # '00:02:00' --> '*/2 * * * *' --> running
404
+ # --> '*/35 * * * *' --> skip
405
+ #
406
+ for task in pipeline_tasks:
407
+
408
+ # NOTE: Get incoming datetime queue.
409
+ logger.debug(
410
+ f"[WORKFLOW]: Current queue: {task.pipeline.name!r} : "
411
+ f"{list(queue2str(task.queue[task.pipeline.name]))}"
529
412
  )
530
413
 
531
- def __eq__(self, other) -> bool:
532
- return all(
533
- part == other_part
534
- for part, other_part in zip(self.parts, other.parts)
414
+ # NOTE: Create minute unit value for any scheduler datetime that
415
+ # checking a pipeline task should run in this datetime.
416
+ current_running_time: datetime = start_date_minute.astimezone(
417
+ tz=ZoneInfo(task.on.tz)
535
418
  )
419
+ if (
420
+ len(task.queue[task.pipeline.name]) > 0
421
+ and current_running_time != task.queue[task.pipeline.name][0]
422
+ ) or (
423
+ task.on.next(current_running_time)
424
+ != task.queue[task.pipeline.name][0]
425
+ ):
426
+ logger.debug(
427
+ f"[WORKFLOW]: Skip schedule "
428
+ f"{current_running_time:%Y-%m-%d %H:%M:%S} "
429
+ f"for : {task.pipeline.name!r} : {task.on.cronjob}"
430
+ )
431
+ continue
432
+ elif len(task.queue[task.pipeline.name]) == 0:
433
+ logger.warning(
434
+ f"[WORKFLOW]: Queue is empty for : {task.pipeline.name!r} : "
435
+ f"{task.on.cronjob}"
436
+ )
437
+ continue
536
438
 
537
- @property
538
- def parts_order(self) -> Iterator[CronPart]:
539
- return reversed(self.parts[:3] + [self.parts[4], self.parts[3]])
439
+ # NOTE: Remove this datetime from queue.
440
+ task.queue[task.pipeline.name].pop(0)
540
441
 
541
- @property
542
- def minute(self) -> CronPart:
543
- """Return part of minute."""
544
- return self.parts[0]
442
+ # NOTE: Create thread name that able to tracking with observe schedule
443
+ # job.
444
+ thread_name: str = (
445
+ f"{task.pipeline.name}|{str(task.on.cronjob)}|"
446
+ f"{current_running_time:%Y%m%d%H%M}"
447
+ )
448
+ pipe_thread: Thread = Thread(
449
+ target=task.release,
450
+ name=thread_name,
451
+ daemon=True,
452
+ )
545
453
 
546
- @property
547
- def hour(self) -> CronPart:
548
- """Return part of hour."""
549
- return self.parts[1]
454
+ threads[thread_name] = pipe_thread
550
455
 
551
- @property
552
- def day(self) -> CronPart:
553
- """Return part of day."""
554
- return self.parts[2]
456
+ pipe_thread.start()
555
457
 
556
- @property
557
- def month(self) -> CronPart:
558
- """Return part of month."""
559
- return self.parts[3]
458
+ delay()
560
459
 
561
- @property
562
- def dow(self) -> CronPart:
563
- """Return part of day of month."""
564
- return self.parts[4]
460
+ logger.debug(f"[WORKFLOW]: {'=' * 100}")
565
461
 
566
- def to_list(self) -> list[list[int]]:
567
- """Returns the cron schedule as a 2-dimensional list of integers."""
568
- return [part.values for part in self.parts]
569
462
 
570
- def schedule(
571
- self,
572
- date: datetime | None = None,
573
- *,
574
- tz: str | None = None,
575
- ) -> CronRunner:
576
- """Returns the schedule datetime runner with this cronjob. It would run
577
- ``next``, ``prev``, or ``reset`` to generate running date that you want.
578
-
579
- :param date: An initial date that want to mark as the start point.
580
- :param tz: A string timezone that want to change on runner.
581
- :rtype: CronRunner
582
- """
583
- return CronRunner(self, date, tz=tz)
463
+ def workflow_long_running_task(threads: dict[str, Thread]) -> None:
464
+ """Workflow schedule for monitoring long running thread from the schedule
465
+ control.
584
466
 
467
+ :param threads: A mapping of Thread object and its name.
468
+ :rtype: None
469
+ """
470
+ logger.debug(
471
+ "[MONITOR]: Start checking long running pipeline release task."
472
+ )
473
+ snapshot_threads = list(threads.keys())
474
+ for t_name in snapshot_threads:
585
475
 
586
- class CronJobYear(CronJob):
587
- cron_length = 6
588
- cron_units = CRON_UNITS_YEAR
476
+ # NOTE: remove the thread that running success.
477
+ if not threads[t_name].is_alive():
478
+ threads.pop(t_name)
589
479
 
590
- @property
591
- def year(self) -> CronPart:
592
- """Return part of year."""
593
- return self.parts[5]
594
480
 
481
+ def workflow_control(
482
+ schedules: list[str],
483
+ stop: datetime | None = None,
484
+ externals: DictData | None = None,
485
+ ) -> list[str]:
486
+ """Workflow scheduler control.
595
487
 
596
- class CronRunner:
597
- """Create an instance of Date Runner object for datetime generate with
598
- cron schedule object value.
488
+ :param schedules: A list of pipeline names that want to schedule running.
489
+ :param stop: A datetime value that is used to stop the running schedule.
490
+ :param externals: An external parameters that pass to Loader.
491
+ :rtype: list[str]
599
492
  """
600
-
601
- __slots__: tuple[str, ...] = (
602
- "__start_date",
603
- "cron",
604
- "date",
605
- "reset_flag",
606
- "tz",
493
+ tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
494
+ schedule: Scheduler = Scheduler()
495
+ start_date: datetime = datetime.now(tz=tz)
496
+
497
+ # NOTE: Design workflow queue caching.
498
+ # ---
499
+ # {"pipeline-name": [<release-datetime>, <release-datetime>, ...]}
500
+ #
501
+ wf_queue: dict[str, list[datetime]] = {}
502
+ wf_running: dict[str, list[datetime]] = {}
503
+ thread_releases: dict[str, Thread] = {}
504
+
505
+ start_date_waiting: datetime = (start_date + timedelta(minutes=1)).replace(
506
+ second=0, microsecond=0
607
507
  )
608
508
 
609
- def __init__(
610
- self,
611
- cron: CronJob | CronJobYear,
612
- date: datetime | None = None,
613
- *,
614
- tz: str | None = None,
615
- ) -> None:
616
- # NOTE: Prepare timezone if this value does not set, it will use UTC.
617
- self.tz: ZoneInfo = ZoneInfo("UTC")
618
- if tz:
619
- try:
620
- self.tz = ZoneInfo(tz)
621
- except ZoneInfoNotFoundError as err:
622
- raise ValueError(f"Invalid timezone: {tz}") from err
623
-
624
- # NOTE: Prepare date
625
- if date:
626
- if not isinstance(date, datetime):
627
- raise ValueError(
628
- "Input schedule start time is not a valid datetime object."
629
- )
630
- if tz is None:
631
- self.tz = date.tzinfo
632
- self.date: datetime = date.astimezone(self.tz)
633
- else:
634
- self.date: datetime = datetime.now(tz=self.tz)
635
-
636
- # NOTE: Add one minute if the second value more than 0.
637
- if self.date.second > 0:
638
- self.date: datetime = self.date + timedelta(minutes=1)
639
-
640
- self.__start_date: datetime = self.date
641
- self.cron: CronJob | CronJobYear = cron
642
- self.reset_flag: bool = True
643
-
644
- def reset(self) -> None:
645
- """Resets the iterator to start time."""
646
- self.date: datetime = self.__start_date
647
- self.reset_flag: bool = True
648
-
649
- @property
650
- def next(self) -> datetime:
651
- """Returns the next time of the schedule."""
652
- self.date = (
653
- self.date
654
- if self.reset_flag
655
- else (self.date + timedelta(minutes=+1))
509
+ # NOTE: Create pair of pipeline and on from schedule model.
510
+ pipeline_tasks: list[PipelineTask] = []
511
+ for name in schedules:
512
+ sch: Schedule = Schedule.from_loader(name, externals=externals)
513
+ pipeline_tasks.extend(
514
+ sch.tasks(start_date_waiting, wf_queue, wf_running, externals)
656
515
  )
657
- return self.find_date(reverse=False)
658
516
 
659
- @property
660
- def prev(self) -> datetime:
661
- """Returns the previous time of the schedule."""
662
- self.date: datetime = self.date + timedelta(minutes=-1)
663
- return self.find_date(reverse=True)
664
-
665
- def find_date(self, reverse: bool = False) -> datetime:
666
- """Returns the time the schedule would run by `next` or `prev`.
517
+ # NOTE: This schedule job will start every minute at :02 seconds.
518
+ schedule.every(1).minutes.at(":02").do(
519
+ workflow_task,
520
+ pipeline_tasks=pipeline_tasks,
521
+ stop=stop
522
+ or (
523
+ start_date
524
+ + timedelta(
525
+ **json.loads(
526
+ os.getenv("WORKFLOW_APP_STOP_BOUNDARY_DELTA")
527
+ or '{"minutes": 5, "seconds": 20}'
528
+ )
529
+ )
530
+ ),
531
+ threads=thread_releases,
532
+ ).tag("control")
533
+
534
+ # NOTE: The schedule job that checks for zombie tasks will start every 5 minutes.
535
+ schedule.every(5).minutes.at(":10").do(
536
+ workflow_long_running_task,
537
+ threads=thread_releases,
538
+ ).tag("monitor")
539
+
540
+ # NOTE: Start running schedule
541
+ logger.info(f"[WORKFLOW]: Start schedule: {schedules}")
542
+ while True:
543
+ schedule.run_pending()
544
+ time.sleep(1)
545
+ if not schedule.get_jobs("control"):
546
+ schedule.clear("monitor")
547
+ logger.warning(
548
+ f"[WORKFLOW]: Pipeline release thread: {thread_releases}"
549
+ )
550
+ logger.warning("[WORKFLOW]: Does not have any schedule jobs !!!")
551
+ break
667
552
 
668
- :param reverse: A reverse flag.
669
- """
670
- # NOTE: Set reset flag to false if start any action.
671
- self.reset_flag: bool = False
672
- for _ in range(25):
673
- if all(
674
- not self.__shift_date(mode, reverse)
675
- for mode in ("month", "day", "hour", "minute")
676
- ):
677
- return copy.deepcopy(self.date.replace(second=0, microsecond=0))
678
- raise RecursionError("Unable to find execution time for schedule")
679
-
680
- def __shift_date(self, mode: str, reverse: bool = False) -> bool:
681
- """Increments the mode value until matches with the schedule."""
682
- switch: dict[str, str] = {
683
- "month": "year",
684
- "day": "month",
685
- "hour": "day",
686
- "minute": "hour",
687
- }
688
- current_value: int = getattr(self.date, switch[mode])
689
- _addition_condition: Callable[[], bool] = (
690
- (
691
- lambda: WEEKDAYS.get(self.date.strftime("%a"))
692
- not in self.cron.dow.values
553
+ logger.warning(
554
+ f"Queue: {[list(queue2str(wf_queue[wf])) for wf in wf_queue]}"
555
+ )
556
+ logger.warning(
557
+ f"Running: {[list(queue2str(wf_running[wf])) for wf in wf_running]}"
558
+ )
559
+ return schedules
560
+
561
+
562
+ def workflow(
563
+ stop: datetime | None = None,
564
+ externals: DictData | None = None,
565
+ excluded: list[str] | None = None,
566
+ ) -> list[str]:
567
+ """Workflow application that running multiprocessing schedule with chunk of
568
+ pipelines that exist in the config path.
569
+
570
+ :param stop: A datetime value that is passed to each workflow_control call as its stop value.
571
+ :param excluded: A list that is passed to Loader.finds as its excluded argument (an empty list if not set).
572
+ :param externals: External parameters that are passed to each workflow_control call (an empty dict if not set).
573
+ :rtype: list[str]
574
+
575
+ This function will get all pipelines that include on value that was
576
+ created in config path and chunk it with WORKFLOW_APP_SCHEDULE_PER_PROCESS
577
+ value to multiprocess executor pool.
578
+
579
+ The current workflow logic:
580
+ ---
581
+ PIPELINES ==> process 01 ==> schedule 1 minute --> thread of release
582
+ pipeline task 01 01
583
+ --> thread of release
584
+ pipeline task 01 02
585
+ ==> process 02 ==> schedule 1 minute --> thread of release
586
+ pipeline task 02 01
587
+ --> thread of release
588
+ pipeline task 02 02
589
+ ==> ...
590
+ """
591
+ excluded: list[str] = excluded or []
592
+
593
+ with ProcessPoolExecutor(
594
+ max_workers=int(os.getenv("WORKFLOW_APP_PROCESS_WORKER") or "2"),
595
+ ) as executor:
596
+ futures: list[Future] = [
597
+ executor.submit(
598
+ workflow_control,
599
+ schedules=[load[0] for load in loader],
600
+ stop=stop,
601
+ externals=(externals or {}),
693
602
  )
694
- if mode == "day"
695
- else lambda: False
696
- )
697
- # NOTE: Start while-loop for checking this date include in this cronjob.
698
- while (
699
- getattr(self.date, mode) not in getattr(self.cron, mode).values
700
- ) or _addition_condition():
701
- self.date: datetime = next_date(self.date, mode, reverse=reverse)
702
- self.date: datetime = replace_date(self.date, mode, reverse=reverse)
703
- if current_value != getattr(self.date, switch[mode]):
704
- return mode != "month"
705
- return False
603
+ for loader in batch(
604
+ Loader.finds(Schedule, excluded=excluded),
605
+ n=int(os.getenv("WORKFLOW_APP_SCHEDULE_PER_PROCESS") or "100"),
606
+ )
607
+ ]
706
608
 
609
+ results: list[str] = []
610
+ for future in as_completed(futures):
611
+ if err := future.exception():
612
+ logger.error(str(err))
613
+ raise WorkflowException(str(err)) from err
614
+ results.extend(future.result(timeout=1))
615
+ return results
707
616
 
708
- __all__ = (
709
- "CronJob",
710
- "CronJobYear",
711
- "CronRunner",
712
- "WEEKDAYS",
713
- )
617
+
618
+ if __name__ == "__main__":
619
+ workflow_rs: list[str] = workflow()
620
+ logger.info(f"Application run success: {workflow_rs}")