ddeutil-workflow 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,39 +5,57 @@
5
5
  # ------------------------------------------------------------------------------
6
6
  from __future__ import annotations
7
7
 
8
+ import contextlib
8
9
  import inspect
10
+ import itertools
9
11
  import logging
10
12
  import subprocess
13
+ import sys
14
+ import time
15
+ import uuid
11
16
  from abc import ABC, abstractmethod
12
- from datetime import date, datetime
13
17
  from inspect import Parameter
18
+ from pathlib import Path
19
+ from queue import Queue
14
20
  from subprocess import CompletedProcess
15
- from typing import Any, Callable, Literal, Optional, Union
21
+ from typing import Any, Callable, Optional, Union
16
22
 
17
- from ddeutil.io.models.lineage import dt_now
23
+ import msgspec as spec
18
24
  from pydantic import BaseModel, Field
19
25
  from pydantic.functional_validators import model_validator
20
26
  from typing_extensions import Self
21
27
 
22
28
  from .__regex import RegexConf
23
- from .__types import DictData
24
- from .exceptions import PyException, TaskException
29
+ from .__types import DictData, DictStr
30
+ from .exceptions import TaskException
25
31
  from .loader import Loader, map_params
26
- from .utils import make_registry
32
+ from .utils import Params, make_exec, make_registry
27
33
 
28
34
 
29
35
  class BaseStage(BaseModel, ABC):
30
- """Base Stage Model."""
31
-
32
- id: Optional[str] = None
33
- name: str
36
+ """Base Stage Model that keep only id and name fields."""
37
+
38
+ id: Optional[str] = Field(
39
+ default=None,
40
+ description=(
41
+ "The stage ID that use to keep execution output or getting by job "
42
+ "owner."
43
+ ),
44
+ )
45
+ name: str = Field(
46
+ description="The stage name that want to logging when start execution."
47
+ )
34
48
 
35
49
  @abstractmethod
36
50
  def execute(self, params: DictData) -> DictData:
51
+ """Execute abstraction method that action something by sub-model class.
52
+
53
+ :param params: A parameter data that want to use in this execution.
54
+ """
37
55
  raise NotImplementedError("Stage should implement ``execute`` method.")
38
56
 
39
57
  def set_outputs(self, rs: DictData, params: DictData) -> DictData:
40
- """Set outputs to params"""
58
+ """Set an outputs from execution process to an input params."""
41
59
  if self.id is None:
42
60
  return params
43
61
 
@@ -61,12 +79,30 @@ class ShellStage(BaseStage):
61
79
  """Shell statement stage."""
62
80
 
63
81
  shell: str
64
- env: dict[str, str] = Field(default_factory=dict)
82
+ env: DictStr = Field(default_factory=dict)
65
83
 
66
84
  @staticmethod
85
+ @contextlib.contextmanager
67
86
  def __prepare_shell(shell: str):
68
- """Prepare shell statement string that include newline"""
69
- return shell.replace("\n", ";")
87
+ """Return context of prepared shell statement that want to execute. This
88
+ step will write the `.sh` file before giving this file name to context.
89
+ After that, it will auto delete this file automatic.
90
+
91
+ :param shell: A shell statement that want to prepare.
92
+ """
93
+ f_name: str = f"{uuid.uuid4()}.sh"
94
+ f_shebang: str = "bash" if sys.platform.startswith("win") else "sh"
95
+ with open(f"./{f_name}", mode="w", newline="\n") as f:
96
+ f.write(f"#!/bin/{f_shebang}\n")
97
+
98
+ # NOTE: make sure that shell script file does not have `\r` char.
99
+ f.write(shell.replace("\r\n", "\n"))
100
+
101
+ make_exec(f"./{f_name}")
102
+
103
+ yield [f_shebang, f_name]
104
+
105
+ Path(f_name).unlink()
70
106
 
71
107
  def set_outputs(self, rs: CompletedProcess, params: DictData) -> DictData:
72
108
  """Set outputs to params"""
@@ -81,8 +117,7 @@ class ShellStage(BaseStage):
81
117
  # NOTE: The output will fileter unnecessary keys from ``_locals``.
82
118
  "outputs": {
83
119
  "return_code": rs.returncode,
84
- "stdout": rs.stdout,
85
- "stderr": rs.stderr,
120
+ "stdout": rs.stdout.rstrip("\n"),
86
121
  },
87
122
  }
88
123
  return params
@@ -91,19 +126,21 @@ class ShellStage(BaseStage):
91
126
  """Execute the Shell & Powershell statement with the Python build-in
92
127
  ``subprocess`` package.
93
128
  """
94
- rs: CompletedProcess = subprocess.run(
95
- self.__prepare_shell(self.shell),
96
- capture_output=True,
97
- text=True,
98
- shell=True,
99
- )
129
+ with self.__prepare_shell(self.shell) as sh:
130
+ with open(sh[-1]) as f:
131
+ logging.debug(f.read())
132
+ logging.info(f"Shell-Execute: {sh}")
133
+ rs: CompletedProcess = subprocess.run(
134
+ sh,
135
+ shell=False,
136
+ capture_output=True,
137
+ text=True,
138
+ )
100
139
  if rs.returncode > 0:
101
- print(f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}")
102
- # FIXME: raise err for this execution.
103
- # raise ShellException(
104
- # f"{rs.stderr}\nRunning Statement:\n---\n"
105
- # f"{self.shell}"
106
- # )
140
+ logging.error(f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}")
141
+ raise TaskException(
142
+ f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}"
143
+ )
107
144
  self.set_outputs(rs, params)
108
145
  return params
109
146
 
@@ -116,7 +153,7 @@ class PyStage(BaseStage):
116
153
  run: str
117
154
  vars: DictData = Field(default_factory=dict)
118
155
 
119
- def get_var(self, params: DictData) -> DictData:
156
+ def get_vars(self, params: DictData) -> DictData:
120
157
  """Return variables"""
121
158
  rs = self.vars.copy()
122
159
  for p, v in self.vars.items():
@@ -149,12 +186,12 @@ class PyStage(BaseStage):
149
186
  :returns: A parameters from an input that was mapped output if the stage
150
187
  ID was set.
151
188
  """
152
- _globals: DictData = globals() | params | self.get_var(params)
189
+ _globals: DictData = globals() | params | self.get_vars(params)
153
190
  _locals: DictData = {}
154
191
  try:
155
192
  exec(map_params(self.run, params), _globals, _locals)
156
193
  except Exception as err:
157
- raise PyException(
194
+ raise TaskException(
158
195
  f"{err.__class__.__name__}: {err}\nRunning Statement:\n---\n"
159
196
  f"{self.run}"
160
197
  ) from None
@@ -164,13 +201,17 @@ class PyStage(BaseStage):
164
201
  return params | {k: _globals[k] for k in params if k in _globals}
165
202
 
166
203
 
167
- class TaskSearch(BaseModel):
168
- """Task Search Model"""
204
+ class TaskSearch(spec.Struct, kw_only=True, tag="task"):
205
+ """Task Search Struct that use the `msgspec` for the best performance."""
169
206
 
170
207
  path: str
171
208
  func: str
172
209
  tag: str
173
210
 
211
+ def to_dict(self) -> DictData:
212
+ """Return dict data from struct fields."""
213
+ return {f: getattr(self, f) for f in self.__struct_fields__}
214
+
174
215
 
175
216
  class TaskStage(BaseStage):
176
217
  """Task executor stage that running the Python function."""
@@ -183,7 +224,7 @@ class TaskStage(BaseStage):
183
224
  """Extract Task string value to task function."""
184
225
  if not (found := RegexConf.RE_TASK_FMT.search(task)):
185
226
  raise ValueError("Task does not match with task format regex.")
186
- tasks = TaskSearch(**found.groupdict())
227
+ tasks: TaskSearch = TaskSearch(**found.groupdict())
187
228
 
188
229
  # NOTE: Registry object should implement on this package only.
189
230
  # TODO: This prefix value to search registry should dynamic with
@@ -238,153 +279,131 @@ Stage = Union[
238
279
 
239
280
 
240
281
  class Strategy(BaseModel):
241
- """Strategy Model"""
282
+ """Strategy Model that will combine a matrix together for running the
283
+ special job.
284
+
285
+ Examples:
286
+ >>> strategy = {
287
+ ... 'matrix': {
288
+ ... 'first': [1, 2, 3],
289
+ ... 'second': ['foo', 'bar']
290
+ ... },
291
+ ... 'include': [{'first': 4, 'second': 'foo'}],
292
+ ... 'exclude': [{'first': 1, 'second': 'bar'}],
293
+ ... }
294
+ """
295
+
296
+ fail_fast: bool = Field(default=False)
297
+ max_parallel: int = Field(default=-1)
298
+ matrix: dict[str, Union[list[str], list[int]]] = Field(default_factory=dict)
299
+ include: list[dict[str, Union[str, int]]] = Field(default_factory=list)
300
+ exclude: list[dict[str, Union[str, int]]] = Field(default_factory=list)
242
301
 
243
- matrix: list[str] = Field(default_factory=list)
244
- include: list[str] = Field(default_factory=list)
245
- exclude: list[str] = Field(default_factory=list)
302
+ @model_validator(mode="before")
303
+ def __prepare_keys(cls, values: DictData) -> DictData:
304
+ if "max-parallel" in values:
305
+ values["max_parallel"] = values.pop("max-parallel")
306
+ if "fail-fast" in values:
307
+ values["fail_fast"] = values.pop("fail-fast")
308
+ return values
246
309
 
247
310
 
248
311
  class Job(BaseModel):
249
312
  """Job Model"""
250
313
 
314
+ runs_on: Optional[str] = Field(default=None)
251
315
  stages: list[Stage] = Field(default_factory=list)
252
316
  needs: list[str] = Field(default_factory=list)
253
317
  strategy: Strategy = Field(default_factory=Strategy)
254
318
 
319
+ @model_validator(mode="before")
320
+ def __prepare_keys(cls, values: DictData) -> DictData:
321
+ if "runs-on" in values:
322
+ values["runs_on"] = values.pop("runs-on")
323
+ return values
324
+
255
325
  def stage(self, stage_id: str) -> Stage:
326
+ """Return stage model that match with an input stage ID."""
256
327
  for stage in self.stages:
257
328
  if stage_id == (stage.id or ""):
258
329
  return stage
259
330
  raise ValueError(f"Stage ID {stage_id} does not exists")
260
331
 
332
+ def make_strategy(self) -> list[DictStr]:
333
+ """Return List of combination of matrix values that already filter with
334
+ exclude and add include values.
335
+ """
336
+ if not (mt := self.strategy.matrix):
337
+ return [{}]
338
+ final: list[DictStr] = []
339
+ for r in [
340
+ {_k: _v for e in mapped for _k, _v in e.items()}
341
+ for mapped in itertools.product(
342
+ *[[{k: v} for v in vs] for k, vs in mt.items()]
343
+ )
344
+ ]:
345
+ if any(
346
+ all(r[k] == v for k, v in exclude.items())
347
+ for exclude in self.strategy.exclude
348
+ ):
349
+ continue
350
+ final.append(r)
351
+
352
+ if not final:
353
+ return [{}]
354
+
355
+ for include in self.strategy.include:
356
+ if include.keys() != final[0].keys():
357
+ raise ValueError("Include should have the keys equal to matrix")
358
+ if any(all(include[k] == v for k, v in f.items()) for f in final):
359
+ continue
360
+ final.append(include)
361
+ return final
362
+
261
363
  def execute(self, params: DictData | None = None) -> DictData:
262
364
  """Execute job with passing dynamic parameters from the pipeline."""
263
- for stage in self.stages:
264
- # NOTE:
265
- # I do not use below syntax because `params` dict be the
266
- # reference memory pointer and it was changed when I action
267
- # anything like update or re-construct this.
268
- # ... params |= stage.execute(params=params)
269
- stage.execute(params=params)
365
+ for strategy in self.make_strategy():
366
+ params.update({"matrix": strategy})
367
+
368
+ # IMPORTANT: The stage execution only run sequentially one-by-one.
369
+ for stage in self.stages:
370
+ logging.info(
371
+ f"[JOB]: Start execute the stage: "
372
+ f"{(stage.id if stage.id else stage.name)!r}"
373
+ )
374
+
375
+ # NOTE:
376
+ # I do not use below syntax because `params` dict be the
377
+ # reference memory pointer and it was changed when I action
378
+ # anything like update or re-construct this.
379
+ # ... params |= stage.execute(params=params)
380
+ stage.execute(params=params)
381
+ # TODO: We should not return matrix key to outside
270
382
  return params
271
383
 
272
384
 
273
- class BaseParams(BaseModel, ABC):
274
- """Base Parameter that use to make Params Model."""
275
-
276
- desc: Optional[str] = None
277
- required: bool = True
278
- type: str
279
-
280
- @abstractmethod
281
- def receive(self, value: Optional[Any] = None) -> Any:
282
- raise ValueError(
283
- "Receive value and validate typing before return valid value."
284
- )
285
-
286
-
287
- class DefaultParams(BaseParams):
288
- """Default Parameter that will check default if it required"""
289
-
290
- default: Optional[str] = None
291
-
292
- @abstractmethod
293
- def receive(self, value: Optional[Any] = None) -> Any:
294
- raise ValueError(
295
- "Receive value and validate typing before return valid value."
296
- )
297
-
298
- @model_validator(mode="after")
299
- def check_default(self) -> Self:
300
- if not self.required and self.default is None:
301
- raise ValueError(
302
- "Default should set when this parameter does not required."
303
- )
304
- return self
305
-
306
-
307
- class DatetimeParams(DefaultParams):
308
- """Datetime parameter."""
309
-
310
- type: Literal["datetime"] = "datetime"
311
- required: bool = False
312
- default: datetime = Field(default_factory=dt_now)
313
-
314
- def receive(self, value: str | datetime | date | None = None) -> datetime:
315
- if value is None:
316
- return self.default
317
-
318
- if isinstance(value, datetime):
319
- return value
320
- elif isinstance(value, date):
321
- return datetime(value.year, value.month, value.day)
322
- elif not isinstance(value, str):
323
- raise ValueError(
324
- f"Value that want to convert to datetime does not support for "
325
- f"type: {type(value)}"
326
- )
327
- return datetime.fromisoformat(value)
328
-
329
-
330
- class StrParams(DefaultParams):
331
- """String parameter."""
332
-
333
- type: Literal["str"] = "str"
334
-
335
- def receive(self, value: Optional[str] = None) -> str | None:
336
- if value is None:
337
- return self.default
338
- return str(value)
339
-
340
-
341
- class IntParams(DefaultParams):
342
- """Integer parameter."""
343
-
344
- type: Literal["int"] = "int"
345
-
346
- def receive(self, value: Optional[int] = None) -> int | None:
347
- if value is None:
348
- return self.default
349
- if not isinstance(value, int):
350
- try:
351
- return int(str(value))
352
- except TypeError as err:
353
- raise ValueError(
354
- f"Value that want to convert to integer does not support "
355
- f"for type: {type(value)}"
356
- ) from err
357
- return value
358
-
359
-
360
- class ChoiceParams(BaseParams):
361
- type: Literal["choice"] = "choice"
362
- options: list[str]
363
-
364
- def receive(self, value: Optional[str] = None) -> str:
365
- """Receive value that match with options."""
366
- # NOTE:
367
- # Return the first value in options if does not pass any input value
368
- if value is None:
369
- return self.options[0]
370
- if any(value not in self.options):
371
- raise ValueError(f"{value} does not match any value in options")
372
- return value
373
-
374
-
375
- Params = Union[
376
- ChoiceParams,
377
- DatetimeParams,
378
- StrParams,
379
- ]
380
-
381
-
382
385
  class Pipeline(BaseModel):
383
- """Pipeline Model"""
386
+ """Pipeline Model this is the main feature of this project because it use to
387
+ be workflow data for running everywhere that you want. It use lightweight
388
+ coding line to execute it.
389
+ """
384
390
 
385
391
  params: dict[str, Params] = Field(default_factory=dict)
386
392
  jobs: dict[str, Job]
387
393
 
394
+ @model_validator(mode="before")
395
+ def __prepare_params(cls, values: DictData) -> DictData:
396
+ if params := values.pop("params", {}):
397
+ values["params"] = {
398
+ p: (
399
+ {"type": params[p]}
400
+ if isinstance(params[p], str)
401
+ else params[p]
402
+ )
403
+ for p in params
404
+ }
405
+ return values
406
+
388
407
  @classmethod
389
408
  def from_loader(
390
409
  cls,
@@ -399,6 +418,10 @@ class Pipeline(BaseModel):
399
418
  params=loader.data["params"],
400
419
  )
401
420
 
421
+ @model_validator(mode="after")
422
+ def job_checking_needs(self):
423
+ return self
424
+
402
425
  def job(self, name: str) -> Job:
403
426
  """Return Job model that exists on this pipeline.
404
427
 
@@ -406,13 +429,23 @@ class Pipeline(BaseModel):
406
429
  :type name: str
407
430
 
408
431
  :rtype: Job
432
+ :returns: A job model that exists on this pipeline by input name.
409
433
  """
410
434
  if name not in self.jobs:
411
- raise ValueError(f"Job {name} does not exists")
435
+ raise ValueError(f"Job {name!r} does not exists")
412
436
  return self.jobs[name]
413
437
 
414
- def execute(self, params: DictData | None = None) -> DictData:
415
- """Execute pipeline with passing dynamic parameters.
438
+ def execute(
439
+ self,
440
+ params: DictData | None = None,
441
+ time_out: int = 60,
442
+ ) -> DictData:
443
+ """Execute pipeline with passing dynamic parameters to any jobs that
444
+ included in the pipeline.
445
+
446
+ :param params: An input parameters that use on pipeline execution.
447
+ :param time_out: A time out second value for limit time of this
448
+ execution.
416
449
 
417
450
  See Also:
418
451
 
@@ -427,8 +460,7 @@ class Pipeline(BaseModel):
427
460
 
428
461
  """
429
462
  params: DictData = params or {}
430
- check_key = tuple(f"{k!r}" for k in self.params if k not in params)
431
- if check_key:
463
+ if check_key := tuple(f"{k!r}" for k in self.params if k not in params):
432
464
  raise ValueError(
433
465
  f"Parameters that needed on pipeline does not pass: "
434
466
  f"{', '.join(check_key)}."
@@ -445,12 +477,41 @@ class Pipeline(BaseModel):
445
477
  for k in params
446
478
  if k in self.params
447
479
  }
448
- )
480
+ ),
481
+ "jobs": {},
449
482
  }
483
+
484
+ jq = Queue()
450
485
  for job_id in self.jobs:
451
- print(f"[PIPELINE]: Start execute the job: {job_id!r}")
486
+ jq.put(job_id)
487
+
488
+ ts: float = time.monotonic()
489
+ not_time_out_flag = True
490
+
491
+ # IMPORTANT: The job execution can run parallel and waiting by needed.
492
+ while not jq.empty() and (
493
+ not_time_out_flag := ((time.monotonic() - ts) < time_out)
494
+ ):
495
+ job_id: str = jq.get()
496
+ logging.info(f"[PIPELINE]: Start execute the job: {job_id!r}")
452
497
  job: Job = self.jobs[job_id]
498
+
453
499
  # TODO: Condition on ``needs`` of this job was set. It should create
454
500
  # multithreading process on this step.
501
+ # But, I don't know how to handle changes params between each job
502
+ # execution while its use them together.
503
+ # ---
504
+ # >>> import multiprocessing
505
+ # >>> with multiprocessing.Pool(processes=3) as pool:
506
+ # ... results = pool.starmap(merge_names, ('', '', ...))
507
+ #
508
+ if any(params["jobs"].get(need) for need in job.needs):
509
+ jq.put(job_id)
455
510
  job.execute(params=params)
511
+ params["jobs"][job_id] = {
512
+ "stages": params.pop("stages", {}),
513
+ "matrix": params.pop("matrix", {}),
514
+ }
515
+ if not not_time_out_flag:
516
+ raise RuntimeError("Execution of pipeline was time out")
456
517
  return params
@@ -18,8 +18,8 @@ from .__types import DictData
18
18
  from .loader import Loader
19
19
 
20
20
 
21
- class BaseScdl(BaseModel):
22
- """Base Scdl (Schedule) Model"""
21
+ class BaseSchedule(BaseModel):
22
+ """Base Schedule (Schedule) Model"""
23
23
 
24
24
  model_config = ConfigDict(arbitrary_types_allowed=True)
25
25
 
@@ -61,16 +61,16 @@ class BaseScdl(BaseModel):
61
61
  return self.cronjob.schedule(date=(start.astimezone(ZoneInfo(self.tz))))
62
62
 
63
63
 
64
- class Scdl(BaseScdl):
65
- """Scdl (Schedule) Model.
64
+ class Schedule(BaseSchedule):
65
+ """Schedule (Schedule) Model.
66
66
 
67
67
  See Also:
68
68
  * ``generate()`` is the main usecase of this schedule object.
69
69
  """
70
70
 
71
71
 
72
- class ScdlBkk(Scdl):
73
- """Asia Bangkok Scdl (Schedule) timezone Model.
72
+ class ScheduleBkk(Schedule):
73
+ """Asia Bangkok Schedule (Schedule) timezone Model.
74
74
 
75
75
  This model use for change timezone from utc to Asia/Bangkok
76
76
  """
@@ -78,5 +78,5 @@ class ScdlBkk(Scdl):
78
78
  tz: Annotated[str, Field(description="Timezone")] = "Asia/Bangkok"
79
79
 
80
80
 
81
- class AwsScdl(BaseScdl):
81
+ class AwsSchedule(BaseSchedule):
82
82
  """Implement Schedule for AWS Service."""
@@ -4,7 +4,7 @@ import math
4
4
  try:
5
5
  import pandas as pd
6
6
 
7
- logging.debug(f"Polars version: {pd.__version__}")
7
+ logging.debug(f"Pandas version: {pd.__version__}")
8
8
  except ImportError as err:
9
9
  raise ImportError(
10
10
  "``split_iterable`` function want to use pandas package that does"
@@ -5,13 +5,21 @@
5
5
  # ------------------------------------------------------------------------------
6
6
  from __future__ import annotations
7
7
 
8
+ import logging
8
9
  from typing import Any
9
10
  from uuid import uuid4
10
11
 
11
- import polars as pl
12
+ try:
13
+ import polars as pl
14
+
15
+ logging.debug(f"Polars version: {pl.__version__}")
16
+ except ImportError:
17
+ raise ImportError(
18
+ "Please install polars if you want to use any relate task"
19
+ ) from None
12
20
  import pyarrow.parquet as pq
13
- from ddeutil.workflow.dataset import PolarsCsv, PolarsParq
14
21
  from ddeutil.workflow.utils import tag
22
+ from ddeutil.workflow.vendors.pl import PolarsCsv, PolarsParq
15
23
 
16
24
 
17
25
  def polars_dtype():