PyPI - ddeutil-workflow - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl - Mend

ddeutil-workflow 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

ddeutil/workflow/__about__.py +1 -1
ddeutil/workflow/__types.py +1 -0
ddeutil/workflow/conn.py +13 -10
ddeutil/workflow/exceptions.py +0 -20
ddeutil/workflow/loader.py +39 -11
ddeutil/workflow/pipeline.py +223 -162
ddeutil/workflow/schedule.py +7 -7
ddeutil/workflow/tasks/_pandas.py +1 -1
ddeutil/workflow/tasks/_polars.py +10 -2
ddeutil/workflow/utils.py +123 -1
ddeutil/workflow/vendors/__dataset.py +127 -0
ddeutil/workflow/vendors/az.py +0 -0
ddeutil/workflow/vendors/pd.py +13 -0
ddeutil/workflow/vendors/pg.py +11 -0
ddeutil/workflow/{dataset.py → vendors/pl.py} +3 -133
{ddeutil_workflow-0.0.2.dist-info → ddeutil_workflow-0.0.4.dist-info}/METADATA +19 -15
ddeutil_workflow-0.0.4.dist-info/RECORD +29 -0
ddeutil_workflow-0.0.2.dist-info/RECORD +0 -25
/ddeutil/workflow/vendors/{aws_warpped.py → aws.py} +0 -0
/ddeutil/workflow/vendors/{minio_warpped.py → minio.py} +0 -0
/ddeutil/workflow/vendors/{sftp_wrapped.py → sftp.py} +0 -0
{ddeutil_workflow-0.0.2.dist-info → ddeutil_workflow-0.0.4.dist-info}/LICENSE +0 -0
{ddeutil_workflow-0.0.2.dist-info → ddeutil_workflow-0.0.4.dist-info}/WHEEL +0 -0
{ddeutil_workflow-0.0.2.dist-info → ddeutil_workflow-0.0.4.dist-info}/top_level.txt +0 -0

ddeutil/workflow/pipeline.py CHANGED Viewed

@@ -5,39 +5,57 @@
 # ------------------------------------------------------------------------------
 from __future__ import annotations
+import contextlib
 import inspect
+import itertools
 import logging
 import subprocess
+import sys
+import time
+import uuid
 from abc import ABC, abstractmethod
-from datetime import date, datetime
 from inspect import Parameter
+from pathlib import Path
+from queue import Queue
 from subprocess import CompletedProcess
-from typing import Any, Callable, Literal, Optional, Union
+from typing import Any, Callable, Optional, Union
-from ddeutil.io.models.lineage import dt_now
+import msgspec as spec
 from pydantic import BaseModel, Field
 from pydantic.functional_validators import model_validator
 from typing_extensions import Self
 from .__regex import RegexConf
-from .__types import DictData
-from .exceptions import PyException, TaskException
+from .__types import DictData, DictStr
+from .exceptions import TaskException
 from .loader import Loader, map_params
-from .utils import make_registry
+from .utils import Params, make_exec, make_registry
 class BaseStage(BaseModel, ABC):
-    """Base Stage Model."""
-    id: Optional[str] = None
-    name: str
+    """Base Stage Model that keep only id and name fields."""
+    id: Optional[str] = Field(
+        default=None,
+        description=(
+            "The stage ID that use to keep execution output or getting by job "
+            "owner."
+        ),
+    )
+    name: str = Field(
+        description="The stage name that want to logging when start execution."
+    )
     @abstractmethod
     def execute(self, params: DictData) -> DictData:
+        """Execute abstraction method that action something by sub-model class.
+        :param params: A parameter data that want to use in this execution.
+        """
         raise NotImplementedError("Stage should implement ``execute`` method.")
     def set_outputs(self, rs: DictData, params: DictData) -> DictData:
-        """Set outputs to params"""
+        """Set an outputs from execution process to an input params."""
         if self.id is None:
             return params
@@ -61,12 +79,30 @@ class ShellStage(BaseStage):
     """Shell statement stage."""
     shell: str
-    env: dict[str, str] = Field(default_factory=dict)
+    env: DictStr = Field(default_factory=dict)
     @staticmethod
+    @contextlib.contextmanager
     def __prepare_shell(shell: str):
-        """Prepare shell statement string that include newline"""
-        return shell.replace("\n", ";")
+        """Return context of prepared shell statement that want to execute. This
+        step will write the `.sh` file before giving this file name to context.
+        After that, it will auto delete this file automatic.
+        :param shell: A shell statement that want to prepare.
+        """
+        f_name: str = f"{uuid.uuid4()}.sh"
+        f_shebang: str = "bash" if sys.platform.startswith("win") else "sh"
+        with open(f"./{f_name}", mode="w", newline="\n") as f:
+            f.write(f"#!/bin/{f_shebang}\n")
+            # NOTE: make sure that shell script file does not have `\r` char.
+            f.write(shell.replace("\r\n", "\n"))
+        make_exec(f"./{f_name}")
+        yield [f_shebang, f_name]
+        Path(f_name).unlink()
     def set_outputs(self, rs: CompletedProcess, params: DictData) -> DictData:
         """Set outputs to params"""
@@ -81,8 +117,7 @@ class ShellStage(BaseStage):
             # NOTE: The output will fileter unnecessary keys from ``_locals``.
             "outputs": {
                 "return_code": rs.returncode,
-                "stdout": rs.stdout,
-                "stderr": rs.stderr,
+                "stdout": rs.stdout.rstrip("\n"),
             },
         }
         return params
@@ -91,19 +126,21 @@ class ShellStage(BaseStage):
         """Execute the Shell & Powershell statement with the Python build-in
         ``subprocess`` package.
         """
-        rs: CompletedProcess = subprocess.run(
-            self.__prepare_shell(self.shell),
-            capture_output=True,
-            text=True,
-            shell=True,
-        )
+        with self.__prepare_shell(self.shell) as sh:
+            with open(sh[-1]) as f:
+                logging.debug(f.read())
+            logging.info(f"Shell-Execute: {sh}")
+            rs: CompletedProcess = subprocess.run(
+                sh,
+                shell=False,
+                capture_output=True,
+                text=True,
+            )
         if rs.returncode > 0:
-            print(f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}")
-            # FIXME: raise err for this execution.
-            # raise ShellException(
-            #     f"{rs.stderr}\nRunning Statement:\n---\n"
-            #     f"{self.shell}"
-            # )
+            logging.error(f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}")
+            raise TaskException(
+                f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}"
+            )
         self.set_outputs(rs, params)
         return params
@@ -116,7 +153,7 @@ class PyStage(BaseStage):
     run: str
     vars: DictData = Field(default_factory=dict)
-    def get_var(self, params: DictData) -> DictData:
+    def get_vars(self, params: DictData) -> DictData:
         """Return variables"""
         rs = self.vars.copy()
         for p, v in self.vars.items():
@@ -149,12 +186,12 @@ class PyStage(BaseStage):
         :returns: A parameters from an input that was mapped output if the stage
             ID was set.
         """
-        _globals: DictData = globals() | params | self.get_var(params)
+        _globals: DictData = globals() | params | self.get_vars(params)
         _locals: DictData = {}
         try:
             exec(map_params(self.run, params), _globals, _locals)
         except Exception as err:
-            raise PyException(
+            raise TaskException(
                 f"{err.__class__.__name__}: {err}\nRunning Statement:\n---\n"
                 f"{self.run}"
             ) from None
@@ -164,13 +201,17 @@ class PyStage(BaseStage):
         return params | {k: _globals[k] for k in params if k in _globals}
-class TaskSearch(BaseModel):
-    """Task Search Model"""
+class TaskSearch(spec.Struct, kw_only=True, tag="task"):
+    """Task Search Struct that use the `msgspec` for the best performance."""
     path: str
     func: str
     tag: str
+    def to_dict(self) -> DictData:
+        """Return dict data from struct fields."""
+        return {f: getattr(self, f) for f in self.__struct_fields__}
 class TaskStage(BaseStage):
     """Task executor stage that running the Python function."""
@@ -183,7 +224,7 @@ class TaskStage(BaseStage):
         """Extract Task string value to task function."""
         if not (found := RegexConf.RE_TASK_FMT.search(task)):
             raise ValueError("Task does not match with task format regex.")
-        tasks = TaskSearch(**found.groupdict())
+        tasks: TaskSearch = TaskSearch(**found.groupdict())
         # NOTE: Registry object should implement on this package only.
         # TODO: This prefix value to search registry should dynamic with
@@ -238,153 +279,131 @@ Stage = Union[
 class Strategy(BaseModel):
-    """Strategy Model"""
+    """Strategy Model that will combine a matrix together for running the
+    special job.
+    Examples:
+        >>> strategy = {
+        ...     'matrix': {
+        ...         'first': [1, 2, 3],
+        ...         'second': ['foo', 'bar']
+        ...     },
+        ...     'include': [{'first': 4, 'second': 'foo'}],
+        ...     'exclude': [{'first': 1, 'second': 'bar'}],
+        ... }
+    """
+    fail_fast: bool = Field(default=False)
+    max_parallel: int = Field(default=-1)
+    matrix: dict[str, Union[list[str], list[int]]] = Field(default_factory=dict)
+    include: list[dict[str, Union[str, int]]] = Field(default_factory=list)
+    exclude: list[dict[str, Union[str, int]]] = Field(default_factory=list)
-    matrix: list[str] = Field(default_factory=list)
-    include: list[str] = Field(default_factory=list)
-    exclude: list[str] = Field(default_factory=list)
+    @model_validator(mode="before")
+    def __prepare_keys(cls, values: DictData) -> DictData:
+        if "max-parallel" in values:
+            values["max_parallel"] = values.pop("max-parallel")
+        if "fail-fast" in values:
+            values["fail_fast"] = values.pop("fail-fast")
+        return values
 class Job(BaseModel):
     """Job Model"""
+    runs_on: Optional[str] = Field(default=None)
     stages: list[Stage] = Field(default_factory=list)
     needs: list[str] = Field(default_factory=list)
     strategy: Strategy = Field(default_factory=Strategy)
+    @model_validator(mode="before")
+    def __prepare_keys(cls, values: DictData) -> DictData:
+        if "runs-on" in values:
+            values["runs_on"] = values.pop("runs-on")
+        return values
     def stage(self, stage_id: str) -> Stage:
+        """Return stage model that match with an input stage ID."""
         for stage in self.stages:
             if stage_id == (stage.id or ""):
                 return stage
         raise ValueError(f"Stage ID {stage_id} does not exists")
+    def make_strategy(self) -> list[DictStr]:
+        """Return List of combination of matrix values that already filter with
+        exclude and add include values.
+        """
+        if not (mt := self.strategy.matrix):
+            return [{}]
+        final: list[DictStr] = []
+        for r in [
+            {_k: _v for e in mapped for _k, _v in e.items()}
+            for mapped in itertools.product(
+                *[[{k: v} for v in vs] for k, vs in mt.items()]
+            )
+        ]:
+            if any(
+                all(r[k] == v for k, v in exclude.items())
+                for exclude in self.strategy.exclude
+            ):
+                continue
+            final.append(r)
+        if not final:
+            return [{}]
+        for include in self.strategy.include:
+            if include.keys() != final[0].keys():
+                raise ValueError("Include should have the keys equal to matrix")
+            if any(all(include[k] == v for k, v in f.items()) for f in final):
+                continue
+            final.append(include)
+        return final
     def execute(self, params: DictData | None = None) -> DictData:
         """Execute job with passing dynamic parameters from the pipeline."""
-        for stage in self.stages:
-            # NOTE:
-            #       I do not use below syntax because `params` dict be the
-            #   reference memory pointer and it was changed when I action
-            #   anything like update or re-construct this.
-            #       ... params |= stage.execute(params=params)
-            stage.execute(params=params)
+        for strategy in self.make_strategy():
+            params.update({"matrix": strategy})
+            # IMPORTANT: The stage execution only run sequentially one-by-one.
+            for stage in self.stages:
+                logging.info(
+                    f"[JOB]: Start execute the stage: "
+                    f"{(stage.id if stage.id else stage.name)!r}"
+                )
+                # NOTE:
+                #       I do not use below syntax because `params` dict be the
+                #   reference memory pointer and it was changed when I action
+                #   anything like update or re-construct this.
+                #       ... params |= stage.execute(params=params)
+                stage.execute(params=params)
+        # TODO: We should not return matrix key to outside
         return params
-class BaseParams(BaseModel, ABC):
-    """Base Parameter that use to make Params Model."""
-    desc: Optional[str] = None
-    required: bool = True
-    type: str
-    @abstractmethod
-    def receive(self, value: Optional[Any] = None) -> Any:
-        raise ValueError(
-            "Receive value and validate typing before return valid value."
-        )
-class DefaultParams(BaseParams):
-    """Default Parameter that will check default if it required"""
-    default: Optional[str] = None
-    @abstractmethod
-    def receive(self, value: Optional[Any] = None) -> Any:
-        raise ValueError(
-            "Receive value and validate typing before return valid value."
-        )
-    @model_validator(mode="after")
-    def check_default(self) -> Self:
-        if not self.required and self.default is None:
-            raise ValueError(
-                "Default should set when this parameter does not required."
-            )
-        return self
-class DatetimeParams(DefaultParams):
-    """Datetime parameter."""
-    type: Literal["datetime"] = "datetime"
-    required: bool = False
-    default: datetime = Field(default_factory=dt_now)
-    def receive(self, value: str | datetime | date | None = None) -> datetime:
-        if value is None:
-            return self.default
-        if isinstance(value, datetime):
-            return value
-        elif isinstance(value, date):
-            return datetime(value.year, value.month, value.day)
-        elif not isinstance(value, str):
-            raise ValueError(
-                f"Value that want to convert to datetime does not support for "
-                f"type: {type(value)}"
-            )
-        return datetime.fromisoformat(value)
-class StrParams(DefaultParams):
-    """String parameter."""
-    type: Literal["str"] = "str"
-    def receive(self, value: Optional[str] = None) -> str | None:
-        if value is None:
-            return self.default
-        return str(value)
-class IntParams(DefaultParams):
-    """Integer parameter."""
-    type: Literal["int"] = "int"
-    def receive(self, value: Optional[int] = None) -> int | None:
-        if value is None:
-            return self.default
-        if not isinstance(value, int):
-            try:
-                return int(str(value))
-            except TypeError as err:
-                raise ValueError(
-                    f"Value that want to convert to integer does not support "
-                    f"for type: {type(value)}"
-                ) from err
-        return value
-class ChoiceParams(BaseParams):
-    type: Literal["choice"] = "choice"
-    options: list[str]
-    def receive(self, value: Optional[str] = None) -> str:
-        """Receive value that match with options."""
-        # NOTE:
-        #   Return the first value in options if does not pass any input value
-        if value is None:
-            return self.options[0]
-        if any(value not in self.options):
-            raise ValueError(f"{value} does not match any value in options")
-        return value
-Params = Union[
-    ChoiceParams,
-    DatetimeParams,
-    StrParams,
-]
 class Pipeline(BaseModel):
-    """Pipeline Model"""
+    """Pipeline Model this is the main feature of this project because it use to
+    be workflow data for running everywhere that you want. It use lightweight
+    coding line to execute it.
+    """
     params: dict[str, Params] = Field(default_factory=dict)
     jobs: dict[str, Job]
+    @model_validator(mode="before")
+    def __prepare_params(cls, values: DictData) -> DictData:
+        if params := values.pop("params", {}):
+            values["params"] = {
+                p: (
+                    {"type": params[p]}
+                    if isinstance(params[p], str)
+                    else params[p]
+                )
+                for p in params
+            }
+        return values
     @classmethod
     def from_loader(
         cls,
@@ -399,6 +418,10 @@ class Pipeline(BaseModel):
             params=loader.data["params"],
         )
+    @model_validator(mode="after")
+    def job_checking_needs(self):
+        return self
     def job(self, name: str) -> Job:
         """Return Job model that exists on this pipeline.
@@ -406,13 +429,23 @@ class Pipeline(BaseModel):
         :type name: str
         :rtype: Job
+        :returns: A job model that exists on this pipeline by input name.
         """
         if name not in self.jobs:
-            raise ValueError(f"Job {name} does not exists")
+            raise ValueError(f"Job {name!r} does not exists")
         return self.jobs[name]
-    def execute(self, params: DictData | None = None) -> DictData:
-        """Execute pipeline with passing dynamic parameters.
+    def execute(
+        self,
+        params: DictData | None = None,
+        time_out: int = 60,
+    ) -> DictData:
+        """Execute pipeline with passing dynamic parameters to any jobs that
+        included in the pipeline.
+        :param params: An input parameters that use on pipeline execution.
+        :param time_out: A time out second value for limit time of this
+            execution.
         See Also:
@@ -427,8 +460,7 @@ class Pipeline(BaseModel):
         """
         params: DictData = params or {}
-        check_key = tuple(f"{k!r}" for k in self.params if k not in params)
-        if check_key:
+        if check_key := tuple(f"{k!r}" for k in self.params if k not in params):
             raise ValueError(
                 f"Parameters that needed on pipeline does not pass: "
                 f"{', '.join(check_key)}."
@@ -445,12 +477,41 @@ class Pipeline(BaseModel):
                     for k in params
                     if k in self.params
                 }
-            )
+            ),
+            "jobs": {},
         }
+        jq = Queue()
         for job_id in self.jobs:
-            print(f"[PIPELINE]: Start execute the job: {job_id!r}")
+            jq.put(job_id)
+        ts: float = time.monotonic()
+        not_time_out_flag = True
+        # IMPORTANT: The job execution can run parallel and waiting by needed.
+        while not jq.empty() and (
+            not_time_out_flag := ((time.monotonic() - ts) < time_out)
+        ):
+            job_id: str = jq.get()
+            logging.info(f"[PIPELINE]: Start execute the job: {job_id!r}")
             job: Job = self.jobs[job_id]
             # TODO: Condition on ``needs`` of this job was set. It should create
             #   multithreading process on this step.
+            #   But, I don't know how to handle changes params between each job
+            #   execution while its use them together.
+            #   ---
+            #   >>> import multiprocessing
+            #   >>> with multiprocessing.Pool(processes=3) as pool:
+            #   ...     results = pool.starmap(merge_names, ('', '', ...))
+            #
+            if any(params["jobs"].get(need) for need in job.needs):
+                jq.put(job_id)
             job.execute(params=params)
+            params["jobs"][job_id] = {
+                "stages": params.pop("stages", {}),
+                "matrix": params.pop("matrix", {}),
+            }
+        if not not_time_out_flag:
+            raise RuntimeError("Execution of pipeline was time out")
         return params

ddeutil/workflow/schedule.py CHANGED Viewed

@@ -18,8 +18,8 @@ from .__types import DictData
 from .loader import Loader
-class BaseScdl(BaseModel):
-    """Base Scdl (Schedule) Model"""
+class BaseSchedule(BaseModel):
+    """Base Schedule (Schedule) Model"""
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -61,16 +61,16 @@ class BaseScdl(BaseModel):
         return self.cronjob.schedule(date=(start.astimezone(ZoneInfo(self.tz))))
-class Scdl(BaseScdl):
-    """Scdl (Schedule) Model.
+class Schedule(BaseSchedule):
+    """Schedule (Schedule) Model.
     See Also:
         * ``generate()`` is the main usecase of this schedule object.
     """
-class ScdlBkk(Scdl):
-    """Asia Bangkok Scdl (Schedule) timezone Model.
+class ScheduleBkk(Schedule):
+    """Asia Bangkok Schedule (Schedule) timezone Model.
     This model use for change timezone from utc to Asia/Bangkok
     """
@@ -78,5 +78,5 @@ class ScdlBkk(Scdl):
     tz: Annotated[str, Field(description="Timezone")] = "Asia/Bangkok"
-class AwsScdl(BaseScdl):
+class AwsSchedule(BaseSchedule):
     """Implement Schedule for AWS Service."""

ddeutil/workflow/tasks/_pandas.py CHANGED Viewed

@@ -4,7 +4,7 @@ import math
 try:
     import pandas as pd
-    logging.debug(f"Polars version: {pd.__version__}")
+    logging.debug(f"Pandas version: {pd.__version__}")
 except ImportError as err:
     raise ImportError(
         "``split_iterable`` function want to use pandas package that does"

ddeutil/workflow/tasks/_polars.py CHANGED Viewed

@@ -5,13 +5,21 @@
 # ------------------------------------------------------------------------------
 from __future__ import annotations
+import logging
 from typing import Any
 from uuid import uuid4
-import polars as pl
+try:
+    import polars as pl
+    logging.debug(f"Polars version: {pl.__version__}")
+except ImportError:
+    raise ImportError(
+        "Please install polars if you want to use any relate task"
+    ) from None
 import pyarrow.parquet as pq
-from ddeutil.workflow.dataset import PolarsCsv, PolarsParq
 from ddeutil.workflow.utils import tag
+from ddeutil.workflow.vendors.pl import PolarsCsv, PolarsParq
 def polars_dtype():

ddeutil-workflow 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

ddeutil-workflow 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl