ddeutil-workflow 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/__init__.py +9 -0
- ddeutil/workflow/__types.py +43 -1
- ddeutil/workflow/exceptions.py +13 -1
- ddeutil/workflow/loader.py +16 -110
- ddeutil/workflow/on.py +195 -0
- ddeutil/workflow/pipeline.py +351 -371
- ddeutil/workflow/{vendors/__schedule.py → scheduler.py} +222 -176
- ddeutil/workflow/stage.py +402 -0
- ddeutil/workflow/utils.py +219 -28
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/METADATA +118 -90
- ddeutil_workflow-0.0.6.dist-info/RECORD +15 -0
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/WHEEL +1 -1
- ddeutil/workflow/__regex.py +0 -44
- ddeutil/workflow/conn.py +0 -240
- ddeutil/workflow/schedule.py +0 -82
- ddeutil/workflow/tasks/__init__.py +0 -6
- ddeutil/workflow/tasks/_pandas.py +0 -54
- ddeutil/workflow/tasks/_polars.py +0 -92
- ddeutil/workflow/vendors/__dataset.py +0 -127
- ddeutil/workflow/vendors/__dict.py +0 -333
- ddeutil/workflow/vendors/__init__.py +0 -0
- ddeutil/workflow/vendors/aws.py +0 -185
- ddeutil/workflow/vendors/az.py +0 -0
- ddeutil/workflow/vendors/minio.py +0 -11
- ddeutil/workflow/vendors/pd.py +0 -13
- ddeutil/workflow/vendors/pg.py +0 -11
- ddeutil/workflow/vendors/pl.py +0 -172
- ddeutil/workflow/vendors/sftp.py +0 -209
- ddeutil_workflow-0.0.4.dist-info/RECORD +0 -29
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/LICENSE +0 -0
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/top_level.txt +0 -0
ddeutil/workflow/utils.py
CHANGED
@@ -6,20 +6,105 @@
 from __future__ import annotations
 
 import inspect
+import os
 import stat
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from dataclasses import dataclass, field
 from datetime import date, datetime
 from functools import wraps
+from hashlib import md5
 from importlib import import_module
+from itertools import product
 from pathlib import Path
 from typing import Any, Callable, Literal, Optional, Protocol, Union
+from zoneinfo import ZoneInfo
 
-from ddeutil.core import lazy
+from ddeutil.core import getdot, hasdot, lazy
+from ddeutil.io import PathData
 from ddeutil.io.models.lineage import dt_now
 from pydantic import BaseModel, Field
 from pydantic.functional_validators import model_validator
 from typing_extensions import Self
 
+from .__types import DictData, Matrix, Re
+
+
+class Engine(BaseModel):
+    """Engine Model"""
+
+    paths: PathData = Field(default_factory=PathData)
+    registry: list[str] = Field(
+        default_factory=lambda: [
+            "ddeutil.workflow",
+        ],
+    )
+
+    @model_validator(mode="before")
+    def __prepare_registry(cls, values: DictData) -> DictData:
+        """Prepare registry value that passing with string type. It convert the
+        string type to list of string.
+        """
+        if (_regis := values.get("registry")) and isinstance(_regis, str):
+            values["registry"] = [_regis]
+        return values
+
+
+class ConfParams(BaseModel):
+    """Params Model"""
+
+    engine: Engine = Field(
+        default_factory=Engine,
+        description="A engine mapping values.",
+    )
+
+
+def config() -> ConfParams:
+    """Load Config data from ``workflows-conf.yaml`` file."""
+    root_path: str = os.getenv("WORKFLOW_ROOT_PATH", ".")
+
+    regis: list[str] = []
+    if regis_env := os.getenv("WORKFLOW_CORE_REGISTRY"):
+        regis = [r.strip() for r in regis_env.split(",")]
+
+    conf_path: str = (
+        f"{root_path}/{conf_env}"
+        if (conf_env := os.getenv("WORKFLOW_CORE_PATH_CONF"))
+        else None
+    )
+    return ConfParams.model_validate(
+        obj={
+            "engine": {
+                "registry": regis,
+                "paths": {
+                    "root": root_path,
+                    "conf": conf_path,
+                },
+            },
+        }
+    )
+
+
+def gen_id(value: Any, *, sensitive: bool = True, unique: bool = False) -> str:
+    """Generate running ID for able to tracking. This generate process use `md5`
+    function.
+
+    :param value:
+    :param sensitive:
+    :param unique:
+    :rtype: str
+    """
+    if not isinstance(value, str):
+        value: str = str(value)
+
+    tz: ZoneInfo = ZoneInfo(os.getenv("WORKFLOW_CORE_TIMEZONE", "UTC"))
+    return md5(
+        (
+            f"{(value if sensitive else value.lower())}"
+            + (f"{datetime.now(tz=tz):%Y%m%d%H%M%S%f}" if unique else "")
+        ).encode()
+    ).hexdigest()
+
 
 class TagFunc(Protocol):
     """Tag Function Protocol"""
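The new configuration and ID helpers added above are driven entirely by environment variables. A minimal sketch of how they compose, assuming the package is installed; the environment values here are illustrative, not package defaults beyond what the code shows:

```python
import os

# Illustrative settings; config() falls back to "." when
# WORKFLOW_ROOT_PATH is absent.
os.environ["WORKFLOW_ROOT_PATH"] = "."
os.environ["WORKFLOW_CORE_REGISTRY"] = "ddeutil.workflow,tests.utils"

from ddeutil.workflow.utils import config, gen_id

conf = config()
print(conf.engine.registry)  # ['ddeutil.workflow', 'tests.utils']

# gen_id md5-hashes the stringified value (lowercased when sensitive=False);
# unique=True appends a timestamp in WORKFLOW_CORE_TIMEZONE so calls differ.
print(gen_id("my-pipeline"))
print(gen_id("my-pipeline", unique=True))
```
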
@@ -30,50 +115,68 @@ class TagFunc(Protocol):
     def __call__(self, *args, **kwargs): ...
 
 
-def tag(
+def tag(value: str, name: str | None = None):
     """Tag decorator function that set function attributes, ``tag`` and ``name``
     for making registries variable.
 
-    :param:
+    :param: value: A tag value for make different use-case of a function.
     :param: name: A name that keeping in registries.
     """
 
-    def func_internal(func: TagFunc
-        func.tag =
+    def func_internal(func: callable) -> TagFunc:
+        func.tag = value
         func.name = name or func.__name__.replace("_", "-")
 
         @wraps(func)
        def wrapped(*args, **kwargs):
            return func(*args, **kwargs)
 
+        # TODO: pass result from a wrapped to Result model
+        # >>> return Result.model_validate(obj=wrapped)
        return wrapped
 
    return func_internal
 
 
-
-
-
-
-
-
-
+Registry = dict[str, Callable[[], TagFunc]]
+
+
+def make_registry(submodule: str) -> dict[str, Registry]:
+    """Return registries of all functions that able to called with task.
+
+    :param submodule: A module prefix that want to import registry.
+    """
+    rs: dict[str, Registry] = {}
+    for module in config().engine.registry:
+        # NOTE: try to sequential import task functions
+        try:
+            importer = import_module(f"{module}.{submodule}")
+        except ModuleNotFoundError:
             continue
 
-
+        for fstr, func in inspect.getmembers(importer, inspect.isfunction):
+            # NOTE: check function attribute that already set tag by
+            # ``utils.tag`` decorator.
+            if not hasattr(func, "tag"):
+                continue
+
+            # NOTE: Create new register name if it not exists
+            if func.name not in rs:
+                rs[func.name] = {func.tag: lazy(f"{module}.{submodule}.{fstr}")}
+                continue
+
             if func.tag in rs[func.name]:
                 raise ValueError(
-                    f"The tag {func.tag!r} already exists on
+                    f"The tag {func.tag!r} already exists on "
+                    f"{module}.{submodule}, you should change this tag name or "
+                    f"change it func name."
                 )
-            rs[func.name][func.tag] = lazy(f"{module}.{fstr}")
-            continue
+            rs[func.name][func.tag] = lazy(f"{module}.{submodule}.{fstr}")
 
-        # NOTE: Create new register name if it not exists
-        rs[func.name] = {func.tag: lazy(f"{module}.{fstr}")}
     return rs
 
 
-class
+class BaseParam(BaseModel, ABC):
     """Base Parameter that use to make Params Model."""
 
     desc: Optional[str] = None
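The `tag`/`make_registry` pair is what a pipeline's hook lookup resolves against: functions register under a dash-cased name, keyed by tag, and import lazily. A minimal sketch, assuming a hypothetical `tasks.py` module placed inside one of the packages listed in `WORKFLOW_CORE_REGISTRY`:

```python
# tasks.py -- hypothetical module inside a registered package
from ddeutil.workflow.utils import tag

@tag("polars", name="postgres-to-delta")
def postgres_to_delta(source: dict, sink: dict):
    """Body omitted; @tag only attaches .tag and .name attributes."""

# Consumer side: scan every registered package for a ``tasks`` submodule.
from ddeutil.workflow.utils import make_registry

registry = make_registry("tasks")
# Keys follow <name> then <tag>; the value is a lazy import wrapper,
# so calling it yields the real function.
func = registry["postgres-to-delta"]["polars"]()
```

This is the mechanism behind hook strings such as `tasks/postgres-to-delta@polars` in the README examples further down.
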
@@ -87,7 +190,7 @@ class BaseParams(BaseModel, ABC):
     )
 
 
-class
+class DefaultParam(BaseParam):
     """Default Parameter that will check default if it required"""
 
     default: Optional[str] = None
@@ -107,7 +210,7 @@ class DefaultParams(BaseParams):
         return self
 
 
-class
+class DatetimeParam(DefaultParam):
     """Datetime parameter."""
 
     type: Literal["datetime"] = "datetime"
@@ -130,7 +233,7 @@ class DatetimeParams(DefaultParams):
         return datetime.fromisoformat(value)
 
 
-class
+class StrParam(DefaultParam):
     """String parameter."""
 
     type: Literal["str"] = "str"
@@ -141,7 +244,7 @@ class StrParams(DefaultParams):
         return str(value)
 
 
-class
+class IntParam(DefaultParam):
     """Integer parameter."""
 
     type: Literal["int"] = "int"
@@ -160,7 +263,7 @@ class IntParams(DefaultParams):
         return value
 
 
-class
+class ChoiceParam(BaseParam):
     type: Literal["choice"] = "choice"
     options: list[str]
 
@@ -175,13 +278,101 @@ class ChoiceParams(BaseParams):
         return value
 
 
-
-
-
-
+Param = Union[
+    ChoiceParam,
+    DatetimeParam,
+    StrParam,
 ]
 
 
+@dataclass
+class Result:
+    """Result Dataclass object for passing parameter and receiving output from
+    the pipeline execution.
+    """
+
+    status: int = field(default=2)
+    context: DictData = field(default_factory=dict)
+
+
 def make_exec(path: str | Path):
+    """Change mode of file to be executable file."""
     f: Path = Path(path) if isinstance(path, str) else path
     f.chmod(f.stat().st_mode | stat.S_IEXEC)
+
+
+def param2template(
+    value: Any,
+    params: dict[str, Any],
+    *,
+    repr_flag: bool = False,
+) -> Any:
+    """Pass param to template string that can search by ``RE_CALLER`` regular
+    expression.
+
+    :param value: A value that want to mapped with an params
+    :param params: A parameter value that getting with matched regular
+        expression.
+    :param repr_flag: A repr flag for using repr instead of str if it set be
+        true.
+
+    :rtype: Any
+    :returns: An any getter value from the params input.
+    """
+    if isinstance(value, dict):
+        return {k: param2template(value[k], params) for k in value}
+    elif isinstance(value, (list, tuple, set)):
+        return type(value)([param2template(i, params) for i in value])
+    elif not isinstance(value, str):
+        return value
+
+    if not Re.RE_CALLER.search(value):
+        return value
+
+    for found in Re.RE_CALLER.finditer(value):
+
+        # NOTE: get caller value that setting inside; ``${{ <caller-value> }}``
+        caller: str = found.group("caller")
+        if not hasdot(caller, params):
+            raise ValueError(f"params does not set caller: {caller!r}")
+
+        getter: Any = getdot(caller, params)
+
+        # NOTE: check type of vars
+        if isinstance(getter, (str, int)):
+            value: str = value.replace(
+                found.group(0), (repr(getter) if repr_flag else str(getter)), 1
+            )
+            continue
+
+        # NOTE:
+        # If type of getter caller does not formatting, it will return origin
+        # value from the ``getdot`` function.
+        if value.replace(found.group(0), "", 1) != "":
+            raise ValueError(
+                "Callable variable should not pass other outside ${{ ... }}"
+            )
+        return getter
+    return value
+
+
+def dash2underscore(
+    key: str,
+    values: DictData,
+    *,
+    fixed: str | None = None,
+) -> DictData:
+    """Change key name that has dash to underscore."""
+    if key in values:
+        values[(fixed or key.replace("-", "_"))] = values.pop(key)
+    return values
+
+
+def cross_product(matrix: Matrix) -> Iterator:
+    """Iterator of products value from matrix."""
+    yield from (
+        {_k: _v for e in mapped for _k, _v in e.items()}
+        for mapped in product(
+            *[[{k: v} for v in vs] for k, vs in matrix.items()]
+        )
+    )
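`param2template` and `cross_product` are the template and matrix primitives that the new `stage.py` and `pipeline.py` modules build on. A short sketch of their behaviour as implemented above; the outputs shown in comments are inferred from the code, not captured from a run:

```python
from ddeutil.workflow.utils import cross_product, param2template

# Substitute a ``${{ ... }}`` caller from a nested params mapping; str and
# int getters are spliced into the surrounding string.
value = param2template(
    "Hello ${{ params.author }}",
    params={"params": {"author": "Local Workflow"}},
)
# value == 'Hello Local Workflow'

# Expand a strategy matrix into every key/value combination.
for combo in cross_product({"os": ["linux", "mac"], "py": ["3.9", "3.10"]}):
    print(combo)  # e.g. {'os': 'linux', 'py': '3.9'}
```
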
{ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddeutil-workflow
-Version: 0.0.4
+Version: 0.0.6
 Summary: Data Developer & Engineer Workflow Utility Objects
 Author-email: ddeutils <korawich.anu@gmail.com>
 License: MIT
@@ -9,7 +9,7 @@ Project-URL: Source Code, https://github.com/ddeutils/ddeutil-workflow/
 Keywords: data,workflow,utility,pipeline
 Classifier: Topic :: Utilities
 Classifier: Natural Language :: English
-Classifier: Development Status ::
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
@@ -23,35 +23,33 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: fmtutil
 Requires-Dist: ddeutil-io
-Requires-Dist: python-dotenv
-Provides-Extra:
-Requires-Dist:
-Requires-Dist:
-
-
-Requires-Dist: fsspec ==2024.5.0 ; extra == 'test'
-Requires-Dist: polars ==0.20.31 ; extra == 'test'
-Requires-Dist: pyarrow ==16.1.0 ; extra == 'test'
-
-# Data Utility: _Workflow_
+Requires-Dist: python-dotenv ==1.0.1
+Provides-Extra: app
+Requires-Dist: fastapi ==0.112.0 ; extra == 'app'
+Requires-Dist: apscheduler[sqlalchemy] ==3.10.4 ; extra == 'app'
+
+# Workflow
 
 [](https://github.com/ddeutils/ddeutil-workflow/actions/workflows/tests.yml)
 [](https://pypi.org/project/ddeutil-workflow/)
 [](https://github.com/ddeutils/ddeutil-workflow)
+[](https://github.com/ddeutils/ddeutil-workflow/blob/main/LICENSE)
 
 **Table of Contents**:
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-
-  - [
-  - [
-  - [
-  - [Python](#python)
-  - [
-  - [
-
-
+  - [Core Features](#core-features)
+    - [On](#on)
+    - [Pipeline](#pipeline)
+- [Usage](#usage)
+  - [Python & Bash](#python--bash)
+  - [Hook (EL)](#hook-extract--load)
+  - [Hook (T)](#hook-transform)
+- [Configuration](#configuration)
+- [Deployment](#deployment)
+
+This **Workflow** objects was created for easy to make a simple metadata
 driven pipeline that able to **ETL, T, EL, or ELT** by `.yaml` file.
 
 I think we should not create the multiple pipeline per use-case if we able to
@@ -74,13 +72,18 @@ pipeline.
 pip install ddeutil-workflow
 ```
 
-This project need `ddeutil-io`
+This project need `ddeutil-io` extension namespace packages. If you want to install
+this package with application add-ons, you should add `app` in installation;
+
+```shell
+pip install ddeutil-workflow[app]
+```
 
 ## Getting Started
 
 The first step, you should start create the connections and datasets for In and
 Out of you data that want to use in pipeline of workflow. Some of this component
-is similar component of the **Airflow** because I like it concepts.
+is similar component of the **Airflow** because I like it orchestration concepts.
 
 The main feature of this project is the `Pipeline` object that can call any
 registries function. The pipeline can handle everything that you want to do, it
@@ -91,88 +94,84 @@ will passing parameters and catching the output for re-use it to next step.
 > dynamic registries instead of main features because it have a lot of maintain
 > vendor codes and deps. (I do not have time to handle this features)
 
-###
+### On
 
-The
+The **On** is schedule object.
 
 ```yaml
-
-type:
-
+on_every_5_min:
+  type: on.On
+  cron: "*/5 * * * *"
 ```
 
 ```python
-from ddeutil.workflow.
+from ddeutil.workflow.on import On
 
-
-assert
-```
+schedule = On.from_loader(name='on_every_5_min', externals={})
+assert '*/5 * * * *' == str(schedule.cronjob)
 
-
-
-
-
-
-
-```yaml
-ds_postgres_customer_tbl:
-  type: dataset.PostgresTbl
-  conn: 'conn_postgres_data'
-  features:
-    id: serial primary key
-    name: varchar( 100 ) not null
+cron_iter = schedule.generate('2022-01-01 00:00:00')
+assert '2022-01-01 00:05:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:10:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:15:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:20:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
 ```
 
-
-from ddeutil.workflow.vendors.pg import PostgresTbl
-
-dataset = PostgresTbl.from_loader(name='ds_postgres_customer_tbl', externals={})
-assert dataset.exists()
-```
+### Pipeline
 
-
+The **Pipeline** object that is the core feature of this project.
 
 ```yaml
-
-type:
-
+run_py_local:
+  type: ddeutil.workflow.pipeline.Pipeline
+  on: 'on_every_5_min'
+  params:
+    author-run:
+      type: str
+    run-date:
+      type: datetime
 ```
 
 ```python
-from ddeutil.workflow.
-
-scdl = Schedule.from_loader(name='schd_for_node', externals={})
-assert '*/5 * * * *' == str(scdl.cronjob)
+from ddeutil.workflow.pipeline import Pipeline
 
-
-
-assert '2022-01-01 00:10:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:15:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:20:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:25:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
+pipe = Pipeline.from_loader(name='run_py_local', externals={})
+pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 ```
 
-
+> [!NOTE]
+> The above parameter use short declarative statement. You can pass a parameter
+> type to the key of a parameter name.
+> ```yaml
+> params:
+>   author-run: str
+>   run-date: datetime
+> ```
+>
+> And for the type, you can remove `ddeutil.workflow` prefix because we can find
+> it by looping search from `WORKFLOW_CORE_REGISTRY` value.
+
+## Usage
 
 This is examples that use workflow file for running common Data Engineering
 use-case.
 
-
+> [!IMPORTANT]
+> I recommend you to use `task` stage for all actions that you want to do with
+> pipeline object.
 
-
+### Python & Bash
 
 ```yaml
 run_py_local:
-  type:
+  type: pipeline.Pipeline
   params:
-    author-run:
-
-    run-date:
-      type: datetime
+    author-run: str
+    run-date: datetime
   jobs:
     first-job:
       stages:
-        - name: Printing Information
+        - name: "Printing Information"
          id: define-func
          run: |
            x = '${{ params.author-run }}'
@@ -181,7 +180,7 @@ run_py_local:
            def echo(name: str):
              print(f'Hello {name}')
 
-        - name: Run Sequence and use var from Above
+        - name: "Run Sequence and use var from Above"
          vars:
            x: ${{ params.author-run }}
          run: |
@@ -189,11 +188,17 @@ run_py_local:
            # Change x value
            x: int = 1
 
-        - name: Call Function
+        - name: "Call Function"
          vars:
            echo: ${{ stages.define-func.outputs.echo }}
          run: |
            echo('Caller')
+    second-job:
+      stages:
+        - name: "Echo Bash Script"
+          id: shell-echo
+          bash: |
+            echo "Hello World from Shell"
 ```
 
 ```python
@@ -207,24 +212,23 @@ pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 > Hello Local Workflow
 > Receive x from above with Local Workflow
 > Hello Caller
+> Hello World from Shell
 ```
 
-###
+### Hook (Extract & Load)
 
 ```yaml
 pipe_el_pg_to_lake:
-  type:
+  type: pipeline.Pipeline
   params:
-    run-date:
-
-    author-email:
-      type: str
+    run-date: datetime
+    author-email: str
   jobs:
     extract-load:
      stages:
        - name: "Extract Load from Postgres to Lake"
          id: extract-load
-
+          uses: tasks/postgres-to-delta@polars
          with:
            source:
              conn: conn_postgres_url
@@ -236,11 +240,11 @@ pipe_el_pg_to_lake:
              endpoint: "/${{ params.name }}"
 ```
 
-###
+### Hook (Transform)
 
 ```yaml
-
-type:
+pipeline_hook_mssql_proc:
+  type: pipeline.Pipeline
   params:
    run_date: datetime
    sp_name: str
@@ -251,7 +255,7 @@ pipe_hook_mssql_proc:
      stages:
        - name: "Transform Data in MS SQL Server"
          id: transform
-
+          uses: tasks/mssql-proc@odbc
          with:
            exec: ${{ params.sp_name }}
            params:
@@ -261,6 +265,30 @@ pipe_hook_mssql_proc:
              target: ${{ params.target_name }}
 ```
 
-##
+## Configuration
 
-
+```bash
+export WORKFLOW_ROOT_PATH=.
+export WORKFLOW_CORE_REGISTRY=ddeutil.workflow,tests.utils
+export WORKFLOW_CORE_PATH_CONF=conf
+```
+
+Application config:
+
+```bash
+export WORKFLOW_APP_DB_URL=postgresql+asyncpg://user:pass@localhost:5432/schedule
+export WORKFLOW_APP_INTERVAL=10
+```
+
+## Deployment
+
+This package able to run as a application service for receive manual trigger
+from the master node via RestAPI.
+
+> [!WARNING]
+> This feature do not start yet because I still research and find the best tool
+> to use it provision an app service, like `starlette`, `fastapi`, `apscheduler`.
+
+```shell
+(venv) $ workflow start -p 7070
+```