PyPI - ddeutil-workflow - Versions diffs - 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl - Mend

ddeutil-workflow 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

ddeutil/workflow/__about__.py +1 -1
ddeutil/workflow/{vendors/__schedule.py → __scheduler.py} +153 -135
ddeutil/workflow/loader.py +9 -1
ddeutil/workflow/on.py +143 -0
ddeutil/workflow/pipeline.py +102 -46
ddeutil/workflow/tasks/__init__.py +1 -1
ddeutil/workflow/tasks/dummy.py +52 -0
ddeutil/workflow/utils.py +33 -5
{ddeutil_workflow-0.0.3.dist-info → ddeutil_workflow-0.0.5.dist-info}/METADATA +57 -58
ddeutil_workflow-0.0.5.dist-info/RECORD +17 -0
{ddeutil_workflow-0.0.3.dist-info → ddeutil_workflow-0.0.5.dist-info}/WHEEL +1 -1
ddeutil/workflow/conn.py +0 -240
ddeutil/workflow/schedule.py +0 -82
ddeutil/workflow/tasks/_pandas.py +0 -54
ddeutil/workflow/tasks/_polars.py +0 -92
ddeutil/workflow/vendors/__dataset.py +0 -127
ddeutil/workflow/vendors/__dict.py +0 -333
ddeutil/workflow/vendors/__init__.py +0 -0
ddeutil/workflow/vendors/aws.py +0 -185
ddeutil/workflow/vendors/az.py +0 -0
ddeutil/workflow/vendors/minio.py +0 -11
ddeutil/workflow/vendors/pd.py +0 -13
ddeutil/workflow/vendors/pg.py +0 -11
ddeutil/workflow/vendors/pl.py +0 -172
ddeutil/workflow/vendors/sftp.py +0 -209
ddeutil_workflow-0.0.3.dist-info/RECORD +0 -29
{ddeutil_workflow-0.0.3.dist-info → ddeutil_workflow-0.0.5.dist-info}/LICENSE +0 -0
{ddeutil_workflow-0.0.3.dist-info → ddeutil_workflow-0.0.5.dist-info}/top_level.txt +0 -0

ddeutil/workflow/on.py ADDED Viewed

@@ -0,0 +1,143 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+# Licensed under the MIT License. See LICENSE in the project root for
+# license information.
+# ------------------------------------------------------------------------------
+from __future__ import annotations
+from datetime import datetime
+from typing import Annotated, Literal
+from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.functional_validators import field_validator
+from typing_extensions import Self
+try:
+    from .__schedule import WEEKDAYS
+    from .__types import DictData, DictStr
+    from .loader import CronJob, CronRunner, Loader
+except ImportError:
+    from ddeutil.workflow.__scheduler import WEEKDAYS, CronJob, CronRunner
+    from ddeutil.workflow.__types import DictData, DictStr
+    from ddeutil.workflow.loader import Loader
+def interval2crontab(
+    interval: Literal["daily", "weekly", "monthly"],
+    day: str = "monday",
+    time: str = "00:00",
+) -> str:
+    """Return the crontab string that was generated from specific values.
+    The result uses the standard five-field crontab layout
+    ``minute hour day-of-month month day-of-week``.
+    :param interval: A interval value that is one of 'daily', 'weekly', or
+        'monthly'. 'monthly' pins the day-of-month field to ``1``; 'weekly'
+        fills the day-of-week field from ``day``.
+    :param day: A day-of-week name. It is only used when interval is
+        'weekly'; its first three letters are title-cased and looked up in
+        ``WEEKDAYS``.
+    :param time: A time value that passing with format '%H:%M'.
+    :raises ValueError: If ``time`` is not two integer parts separated by
+        ``:``.
+    Examples:
+        >>> interval2crontab(interval='daily', time='01:30')
+        '30 1 * * *'
+        >>> interval2crontab(interval='weekly', day='friday', time='18:30')
+        '30 18 * * 5'
+        >>> interval2crontab(interval='monthly', time='00:00')
+        '0 0 1 * *'
+    """
+    # NOTE: ``int`` normalises leading zeros without ever producing an empty
+    #   field (``"0".lstrip("0")`` would give ``""``).
+    hour, minute = (str(int(part)) for part in time.split(":", maxsplit=1))
+    day_of_month: str = "1" if interval == "monthly" else "*"
+    day_of_week: str = (
+        WEEKDAYS[day[:3].title()] if interval == "weekly" else "*"
+    )
+    # NOTE: crontab order is minute first, then hour.
+    return f"{minute} {hour} {day_of_month} * {day_of_week}"
+class Schedule(BaseModel):
+    """Schedule Model that keeps a cron job, a timezone name, and an extras
+    mapping.
+    See Also:
+        * ``generate()`` is the main usecase of this schedule object.
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    # NOTE: This is fields of the base schedule.
+    cronjob: Annotated[CronJob, Field(description="Cron job of this schedule")]
+    tz: Annotated[str, Field(description="A timezone string value")] = "Etc/UTC"
+    extras: Annotated[
+        DictData,
+        Field(
+            default_factory=dict,
+            description="An extras mapping parameters",
+        ),
+    ]
+    @classmethod
+    def from_value(cls, value: DictStr, externals: DictData) -> Self:
+        """Constructor from values that will generate crontab by function.
+        :param value: A mapping value that will generate crontab before create
+            schedule model. The keys ``interval``, ``day``, ``time`` and
+            ``timezone`` are read; the mapping itself is left unchanged.
+        :param externals: A extras external parameter that will keep in extras.
+        """
+        passing: DictStr = {}
+        # NOTE: Read the timezone without ``pop`` so the caller's mapping
+        #   (``from_loader`` passes ``loader.data``) is not mutated.
+        if "timezone" in value:
+            passing["tz"] = value["timezone"]
+        passing["cronjob"] = interval2crontab(
+            **{k: value[k] for k in ("interval", "day", "time") if k in value}
+        )
+        return cls(extras=externals, **passing)
+    @classmethod
+    def from_loader(
+        cls,
+        name: str,
+        externals: DictData,
+    ) -> Self:
+        """Constructor from the name of config that will use loader object for
+        getting the data.
+        :param name: A name of config that will getting from loader.
+        :param externals: A extras external parameter that will keep in extras.
+        :raises ValueError: If the loaded config type is not this class, or the
+            config has neither an ``interval`` nor a ``cronjob`` key.
+        """
+        loader: Loader = Loader(name, externals=externals)
+        # NOTE: Validate the config type match with current connection model
+        if loader.type != cls:
+            raise ValueError(f"Type {loader.type} does not match with {cls}")
+        # NOTE: An ``interval`` config takes priority over a raw ``cronjob``.
+        if "interval" in loader.data:
+            return cls.from_value(loader.data, externals=externals)
+        if "cronjob" not in loader.data:
+            raise ValueError("Config does not set ``cronjob`` value")
+        if "timezone" in loader.data:
+            return cls(
+                cronjob=loader.data["cronjob"],
+                tz=loader.data["timezone"],
+                extras=externals,
+            )
+        return cls(cronjob=loader.data["cronjob"], extras=externals)
+    @field_validator("tz")
+    def __validate_tz(cls, value: str):
+        """Validate that the timezone value is able to initialize a ZoneInfo
+        object; the value itself is returned unchanged.
+        :raises ValueError: If the timezone key is not found.
+        """
+        try:
+            _ = ZoneInfo(value)
+            return value
+        except ZoneInfoNotFoundError as err:
+            raise ValueError(f"Invalid timezone: {value}") from err
+    @field_validator("cronjob", mode="before")
+    def __prepare_cronjob(cls, value: str | CronJob) -> CronJob:
+        """Prepare crontab value that able to receive with string type."""
+        return CronJob(value) if isinstance(value, str) else value
+    def generate(self, start: str | datetime) -> CronRunner:
+        """Return Cron runner object.
+        :param start: A start datetime, or an ISO format string that is parsed
+            with ``datetime.fromisoformat``. It is converted to this
+            schedule's timezone before it is passed to the cron job.
+        """
+        if not isinstance(start, datetime):
+            start: datetime = datetime.fromisoformat(start)
+        return self.cronjob.schedule(date=(start.astimezone(ZoneInfo(self.tz))))
+class AwsSchedule(Schedule):
+    """Implement Schedule for AWS Service. It currently adds no fields or
+    methods on top of ``Schedule``."""

ddeutil/workflow/pipeline.py CHANGED Viewed

@@ -5,18 +5,21 @@
 # ------------------------------------------------------------------------------
 from __future__ import annotations
+import contextlib
 import inspect
 import itertools
 import logging
 import subprocess
+import sys
 import time
+import uuid
 from abc import ABC, abstractmethod
 from inspect import Parameter
+from pathlib import Path
 from queue import Queue
 from subprocess import CompletedProcess
 from typing import Any, Callable, Optional, Union
-import msgspec as spec
 from pydantic import BaseModel, Field
 from pydantic.functional_validators import model_validator
 from typing_extensions import Self
@@ -25,11 +28,14 @@ from .__regex import RegexConf
 from .__types import DictData, DictStr
 from .exceptions import TaskException
 from .loader import Loader, map_params
-from .utils import Params, make_registry
+from .utils import Params, TaskSearch, make_exec, make_registry
 class BaseStage(BaseModel, ABC):
-    """Base Stage Model that keep only id and name fields."""
+    """Base Stage Model that keep only id and name fields for the stage
+    metadata. If you want to implement any custom stage, you can use this class
+    to parent and implement ``self.execute()`` method only.
+    """
     id: Optional[str] = Field(
         default=None,
@@ -45,13 +51,20 @@ class BaseStage(BaseModel, ABC):
     @abstractmethod
     def execute(self, params: DictData) -> DictData:
         """Execute abstraction method that action something by sub-model class.
+        This is important method that make this class is able to be the stage.
         :param params: A parameter data that want to use in this execution.
+        :rtype: DictData
         """
         raise NotImplementedError("Stage should implement ``execute`` method.")
     def set_outputs(self, rs: DictData, params: DictData) -> DictData:
-        """Set an outputs from execution process to an input params."""
+        """Set an outputs from execution process to an input params.
+        :param rs: A result data that want to extract to an output key.
+        :param params: A context data that want to add output result.
+        :rtype: DictData
+        """
         if self.id is None:
             return params
@@ -63,24 +76,57 @@ class BaseStage(BaseModel, ABC):
 class EmptyStage(BaseStage):
-    """Empty stage that is doing nothing and logging the name of stage only."""
+    """Empty stage that do nothing (context equal empty stage) and logging the
+    name of stage only to stdout.
+    """
     def execute(self, params: DictData) -> DictData:
-        """Execute for the Empty stage that do only logging out."""
-        logging.info(f"Execute: {self.name!r}")
+        """Execution method for the Empty stage that do only logging out to
+        stdout.
+        :param params: A context data that want to add output result. But this
+            stage does not pass any output.
+        """
+        logging.info(f"[STAGE]: Empty-Execute: {self.name!r}")
         return params
 class ShellStage(BaseStage):
-    """Shell statement stage."""
+    """Shell stage that execute bash script on the current OS. That mean if your
+    current OS is Windows, it will running bash in the WSL.
+    """
-    shell: str
-    env: DictStr = Field(default_factory=dict)
+    shell: str = Field(description="A shell statement that want to execute.")
+    env: DictStr = Field(
+        default_factory=dict,
+        description=(
+            "An environment variable mapping that want to set before execute "
+            "this shell statement."
+        ),
+    )
-    @staticmethod
-    def __prepare_shell(shell: str):
-        """Prepare shell statement string that include newline"""
-        return shell.replace("\n", ";")
+    @contextlib.contextmanager
+    def __prepare_shell(self):
+        """Return context of prepared shell statement that want to execute. This
+        step will write a uniquely named `.sh` file into the current working
+        directory, mark it executable, and yield the command list
+        ``[interpreter, file name]``. The file is deleted when the context
+        exits, including when the body of the ``with`` block raises.
+        """
+        f_name: str = f"{uuid.uuid4()}.sh"
+        f_shebang: str = "bash" if sys.platform.startswith("win") else "sh"
+        try:
+            with open(f"./{f_name}", mode="w", newline="\n") as f:
+                f.write(f"#!/bin/{f_shebang}\n")
+                # NOTE: env values are written as plain shell assignments at
+                #   the top of the script.
+                for k in self.env:
+                    f.write(f"{k}='{self.env[k]}';\n")
+                # NOTE: make sure that shell script file does not have `\r`
+                #   char.
+                f.write(self.shell.replace("\r\n", "\n"))
+            make_exec(f"./{f_name}")
+            yield [f_shebang, f_name]
+        finally:
+            # NOTE: always clean up, even if writing, chmod, or the caller's
+            #   block failed; ``missing_ok`` keeps cleanup from masking the
+            #   original error when the file was never created.
+            Path(f_name).unlink(missing_ok=True)
     def set_outputs(self, rs: CompletedProcess, params: DictData) -> DictData:
         """Set outputs to params"""
@@ -95,8 +141,7 @@ class ShellStage(BaseStage):
             # NOTE: The output will fileter unnecessary keys from ``_locals``.
             "outputs": {
                 "return_code": rs.returncode,
-                "stdout": rs.stdout,
-                "stderr": rs.stderr,
+                "stdout": rs.stdout.rstrip("\n"),
             },
         }
         return params
@@ -105,19 +150,22 @@ class ShellStage(BaseStage):
         """Execute the Shell & Powershell statement with the Python build-in
         ``subprocess`` package.
         """
-        rs: CompletedProcess = subprocess.run(
-            self.__prepare_shell(self.shell),
-            capture_output=True,
-            text=True,
-            shell=True,
-        )
+        with self.__prepare_shell() as sh:
+            logging.info(f"[STAGE]: Shell-Execute: {sh}")
+            rs: CompletedProcess = subprocess.run(
+                sh,
+                shell=False,
+                capture_output=True,
+                text=True,
+            )
         if rs.returncode > 0:
-            print(f"{rs.stderr}\nRunning Statement:\n---\n{self.shell}")
-            # FIXME: raise err for this execution.
-            # raise TaskException(
-            #     f"{rs.stderr}\nRunning Statement:\n---\n"
-            #     f"{self.shell}"
-            # )
+            err: str = (
+                rs.stderr.encode("utf-8").decode("utf-16")
+                if "\\x00" in rs.stderr
+                else rs.stderr
+            )
+            logging.error(f"{err}\nRunning Statement:\n---\n{self.shell}")
+            raise TaskException(f"{err}\nRunning Statement:\n---\n{self.shell}")
         self.set_outputs(rs, params)
         return params
@@ -138,7 +186,12 @@ class PyStage(BaseStage):
         return rs
     def set_outputs(self, rs: DictData, params: DictData) -> DictData:
-        """Set outputs to params"""
+        """Set an outputs from execution process to an input params.
+        :param rs: A result data that want to extract to an output key.
+        :param params: A context data that want to add output result.
+        :rtype: DictData
+        """
         # NOTE: skipping set outputs of stage execution when id does not set.
         if self.id is None:
             return params
@@ -178,18 +231,6 @@ class PyStage(BaseStage):
         return params | {k: _globals[k] for k in params if k in _globals}
-class TaskSearch(spec.Struct, kw_only=True, tag="task"):
-    """Task Search Struct that use the `msgspec` for the best performance."""
-    path: str
-    func: str
-    tag: str
-    def to_dict(self) -> DictData:
-        """Return dict data from struct fields."""
-        return {f: getattr(self, f) for f in self.__struct_fields__}
 class TaskStage(BaseStage):
     """Task executor stage that running the Python function."""
@@ -286,11 +327,14 @@ class Strategy(BaseModel):
 class Job(BaseModel):
-    """Job Model"""
+    """Job Model that is able to call a group of stages."""
     runs_on: Optional[str] = Field(default=None)
     stages: list[Stage] = Field(default_factory=list)
-    needs: list[str] = Field(default_factory=list)
+    needs: list[str] = Field(
+        default_factory=list,
+        description="A list of the job ID that want to run before this job.",
+    )
     strategy: Strategy = Field(default_factory=Strategy)
     @model_validator(mode="before")
@@ -365,11 +409,14 @@ class Pipeline(BaseModel):
     coding line to execute it.
     """
+    desc: Optional[str] = Field(default=None)
     params: dict[str, Params] = Field(default_factory=dict)
+    on: dict[str, DictStr] = Field(default_factory=dict)
     jobs: dict[str, Job]
     @model_validator(mode="before")
     def __prepare_params(cls, values: DictData) -> DictData:
+        # NOTE: Prepare params type if it passing with only type value.
         if params := values.pop("params", {}):
             values["params"] = {
                 p: (
@@ -385,8 +432,9 @@ class Pipeline(BaseModel):
     def from_loader(
         cls,
         name: str,
-        externals: Optional[DictData] = None,
+        externals: DictData | None = None,
     ) -> Self:
+        """Create Pipeline instance from the Loader object."""
         loader: Loader = Loader(name, externals=(externals or {}))
         if "jobs" not in loader.data:
             raise ValueError("Config does not set ``jobs`` value")
@@ -421,8 +469,10 @@ class Pipeline(BaseModel):
         included in the pipeline.
         :param params: An input parameters that use on pipeline execution.
-        :param time_out: A time out second value for limit time of this
-            execution.
+        :param time_out: A time out in second unit that use for limit time of
+            this pipeline execution.
+        ---
         See Also:
@@ -446,6 +496,7 @@ class Pipeline(BaseModel):
         if any(p not in params for p in self.params if self.params[p].required):
             raise ValueError("Required parameter does not pass")
+        # NOTE: mapping type of param before adding it to params variable.
         params: DictData = {
             "params": (
                 params
@@ -458,6 +509,8 @@ class Pipeline(BaseModel):
             "jobs": {},
         }
+        # NOTE: create a job queue that keep the job that want to running after
+        #   it dependency condition.
         jq = Queue()
         for job_id in self.jobs:
             jq.put(job_id)
@@ -472,6 +525,7 @@ class Pipeline(BaseModel):
             job_id: str = jq.get()
             logging.info(f"[PIPELINE]: Start execute the job: {job_id!r}")
             job: Job = self.jobs[job_id]
             # TODO: Condition on ``needs`` of this job was set. It should create
             #   multithreading process on this step.
             #   But, I don't know how to handle changes params between each job
@@ -480,8 +534,10 @@ class Pipeline(BaseModel):
             #   >>> import multiprocessing
             #   >>> with multiprocessing.Pool(processes=3) as pool:
             #   ...     results = pool.starmap(merge_names, ('', '', ...))
+            #
             if any(params["jobs"].get(need) for need in job.needs):
                 jq.put(job_id)
             job.execute(params=params)
             params["jobs"][job_id] = {
                 "stages": params.pop("stages", {}),

ddeutil/workflow/tasks/__init__.py CHANGED Viewed

@@ -3,4 +3,4 @@
 # Licensed under the MIT License. See LICENSE in the project root for
 # license information.
 # ------------------------------------------------------------------------------
-from ._polars import *
+from .dummy import *

ddeutil/workflow/tasks/dummy.py ADDED Viewed

@@ -0,0 +1,52 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+# Licensed under the MIT License. See LICENSE in the project root for
+# license information.
+# ------------------------------------------------------------------------------
+from __future__ import annotations
+from typing import Any
+from ddeutil.workflow.utils import tag
+@tag("polars-dir", name="el-csv-to-parquet")
+def dummy_task_1(
+    source: str,
+    sink: str,
+    conversion: dict[str, Any] | None = None,
+) -> dict[str, int]:
+    """Dummy Extract-Load task from CSV to Parquet file. It only prints the
+    progress messages and does not read or write any data.
+    :param source: A source value that is shown in the reading message.
+    :param sink: A sink value that is shown in the writing message.
+    :param conversion: An optional conversion mapping; when it is not empty,
+        the schema conversion message is printed.
+    :return: A fixed mapping ``{"records": 1}``.
+    """
+    for message in (
+        "Start EL for CSV to Parquet with Polars Engine",
+        "---",
+        f"Reading data from {source}",
+    ):
+        print(message)
+    if conversion:
+        print("Start Schema Conversion ...")
+    print(f"Writing data to {sink}")
+    return {"records": 1}
+@tag("polars-dir-scan", name="el-csv-to-parquet")
+def dummy_task_2(
+    source: str,
+    sink: str,
+    conversion: dict[str, Any] | None = None,
+) -> dict[str, int]:
+    """Dummy Extract-Load task from CSV to Parquet file that is registered
+    with the ``polars-dir-scan`` tag. It only prints the progress messages.
+    :param source: A source value that is shown in the reading message.
+    :param sink: A sink value that is shown in the writing message.
+    :param conversion: An optional conversion mapping; when it is not empty,
+        the schema conversion message is printed.
+    :return: A fixed mapping ``{"records": 1}``.
+    """
+    messages: list[str] = [
+        "Start EL for CSV to Parquet with Polars Engine",
+        "---",
+        f"Reading data from {source}",
+    ]
+    if conversion:
+        messages.append("Start Schema Conversion ...")
+    messages.append(f"Writing data to {sink}")
+    print("\n".join(messages))
+    return {"records": 1}

ddeutil/workflow/utils.py CHANGED Viewed

@@ -6,18 +6,23 @@
 from __future__ import annotations
 import inspect
+import stat
 from abc import ABC, abstractmethod
 from datetime import date, datetime
 from functools import wraps
 from importlib import import_module
+from pathlib import Path
 from typing import Any, Callable, Literal, Optional, Protocol, Union
+import msgspec as spec
 from ddeutil.core import lazy
 from ddeutil.io.models.lineage import dt_now
 from pydantic import BaseModel, Field
 from pydantic.functional_validators import model_validator
 from typing_extensions import Self
+from .__types import DictData
 class TagFunc(Protocol):
     """Tag Function Protocol"""
@@ -28,16 +33,16 @@ class TagFunc(Protocol):
     def __call__(self, *args, **kwargs): ...
-def tag(tag_value: str, name: str | None = None):
+def tag(value: str, name: str | None = None):
     """Tag decorator function that set function attributes, ``tag`` and ``name``
     for making registries variable.
-    :param: tag_value: A tag value for make different use-case of a function.
+    :param: value: A tag value for make different use-case of a function.
     :param: name: A name that keeping in registries.
     """
-    def func_internal(func: TagFunc):
-        func.tag = tag_value
+    def func_internal(func: callable) -> TagFunc:
+        func.tag = value
         func.name = name or func.__name__.replace("_", "-")
         @wraps(func)
@@ -50,7 +55,10 @@ def tag(tag_value: str, name: str | None = None):
 def make_registry(module: str) -> dict[str, dict[str, Callable[[], TagFunc]]]:
-    """Return registries of all functions that able to called with task."""
+    """Return registries of all functions that able to called with task.
+    :param module: A module prefix that want to import registry.
+    """
     rs: dict[str, dict[str, Callable[[], Callable]]] = {}
     for fstr, func in inspect.getmembers(
         import_module(module), inspect.isfunction
@@ -178,3 +186,23 @@ Params = Union[
     DatetimeParams,
     StrParams,
 ]
+def make_exec(path: str | Path):
+    """Add the owner-execute permission bit to the file at ``path``.
+    :param path: A file path as a string or a ``Path`` object.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+    current_mode: int = path.stat().st_mode
+    path.chmod(current_mode | stat.S_IEXEC)
+class TaskSearch(spec.Struct, kw_only=True, tag="task"):
+    """Task Search Struct built on `msgspec` with three string fields:
+    ``path``, ``func``, and ``tag``.
+    """
+    path: str
+    func: str
+    tag: str
+    def to_dict(self) -> DictData:
+        """Return a mapping of every struct field name to its current value."""
+        names = self.__struct_fields__
+        return dict(zip(names, (getattr(self, name) for name in names)))

ddeutil-workflow 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

ddeutil-workflow 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl