ddeutil-workflow 0.0.5__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddeutil_workflow-0.0.5/src/ddeutil_workflow.egg-info → ddeutil_workflow-0.0.7}/PKG-INFO +144 -68
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/README.md +137 -66
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/pyproject.toml +18 -5
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/__about__.py +1 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/__init__.py +31 -0
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/__regex.py → ddeutil_workflow-0.0.7/src/ddeutil/workflow/__types.py +24 -4
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/api.py +120 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/app.py +41 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/src/ddeutil/workflow/exceptions.py +16 -1
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/loader.py +80 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/log.py +30 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/src/ddeutil/workflow/on.py +78 -26
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/pipeline.py +733 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/repeat.py +134 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/route.py +78 -0
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/__scheduler.py → ddeutil_workflow-0.0.7/src/ddeutil/workflow/scheduler.py +73 -45
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/stage.py +431 -0
- ddeutil_workflow-0.0.7/src/ddeutil/workflow/utils.py +602 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7/src/ddeutil_workflow.egg-info}/PKG-INFO +144 -68
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/src/ddeutil_workflow.egg-info/SOURCES.txt +19 -10
- ddeutil_workflow-0.0.7/src/ddeutil_workflow.egg-info/requires.txt +11 -0
- ddeutil_workflow-0.0.7/tests/test__conf_exist.py +11 -0
- ddeutil_workflow-0.0.7/tests/test__regex.py +90 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/tests/test_on.py +14 -5
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/tests/test_pipeline.py +3 -9
- ddeutil_workflow-0.0.7/tests/test_pipeline_desc.py +11 -0
- ddeutil_workflow-0.0.7/tests/test_pipeline_if.py +28 -0
- ddeutil_workflow-0.0.7/tests/test_pipeline_matrix.py +159 -0
- ddeutil_workflow-0.0.7/tests/test_pipeline_on.py +12 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/tests/test_pipeline_params.py +1 -1
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/tests/test_pipeline_run.py +40 -45
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/tests/test_pipeline_task.py +26 -12
- ddeutil_workflow-0.0.5/tests/test_base_schedule.py → ddeutil_workflow-0.0.7/tests/test_scheduler.py +61 -18
- ddeutil_workflow-0.0.7/tests/test_stage_trigger.py +32 -0
- ddeutil_workflow-0.0.7/tests/test_utils.py +8 -0
- ddeutil_workflow-0.0.7/tests/test_utils_param2template.py +71 -0
- ddeutil_workflow-0.0.7/tests/test_utils_result.py +22 -0
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/__about__.py +0 -1
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/__init__.py +0 -0
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/__types.py +0 -12
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/loader.py +0 -182
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/pipeline.py +0 -548
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/tasks/__init__.py +0 -6
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/tasks/dummy.py +0 -52
- ddeutil_workflow-0.0.5/src/ddeutil/workflow/utils.py +0 -208
- ddeutil_workflow-0.0.5/src/ddeutil_workflow.egg-info/requires.txt +0 -4
- ddeutil_workflow-0.0.5/tests/test_base_data.py +0 -13
- ddeutil_workflow-0.0.5/tests/test_base_regex.py +0 -46
- ddeutil_workflow-0.0.5/tests/test_loader.py +0 -6
- ddeutil_workflow-0.0.5/tests/test_pipeline_matrix.py +0 -29
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/LICENSE +0 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/setup.cfg +0 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/src/ddeutil_workflow.egg-info/dependency_links.txt +0 -0
- {ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/src/ddeutil_workflow.egg-info/top_level.txt +0 -0
- /ddeutil_workflow-0.0.5/tests/test_base_local_and_global.py → /ddeutil_workflow-0.0.7/tests/test__local_and_global.py +0 -0

{ddeutil_workflow-0.0.5/src/ddeutil_workflow.egg-info → ddeutil_workflow-0.0.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddeutil-workflow
-Version: 0.0.5
+Version: 0.0.7
 Summary: Data Developer & Engineer Workflow Utility Objects
 Author-email: ddeutils <korawich.anu@gmail.com>
 License: MIT
@@ -24,36 +24,42 @@ License-File: LICENSE
 Requires-Dist: fmtutil
 Requires-Dist: ddeutil-io
 Requires-Dist: python-dotenv==1.0.1
-
+Provides-Extra: app
+Requires-Dist: schedule<2.0.0,==1.2.2; extra == "app"
+Provides-Extra: api
+Requires-Dist: fastapi[standard]==0.112.0; extra == "api"
+Requires-Dist: apscheduler[sqlalchemy]<4.0.0,==3.10.4; extra == "api"
+Requires-Dist: croniter==3.0.3; extra == "api"
 
-#
+# Workflow
 
 [](https://github.com/ddeutils/ddeutil-workflow/actions/workflows/tests.yml)
 [](https://pypi.org/project/ddeutil-workflow/)
 [](https://github.com/ddeutils/ddeutil-workflow)
 [](https://github.com/ddeutils/ddeutil-workflow/blob/main/LICENSE)
 
-
 **Table of Contents**:
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-  - [
-  - [
-
-  - [
-  - [
-  - [
-  - [Hooks (T)](#tasks-transform)
+  - [On](#on)
+  - [Pipeline](#pipeline)
+- [Usage](#usage)
+  - [Python & Bash](#python--bash)
+  - [Hook (EL)](#hook-extract--load)
+  - [Hook (T)](#hook-transform)
 - [Configuration](#configuration)
+- [Deployment](#deployment)
 
-This **
-driven pipeline that able to **ETL, T, EL, or
+This **Workflow** objects was created for easy to make a simple metadata
+driven for data pipeline orchestration that able to use for **ETL, T, EL, or
+ELT** by a `.yaml` file template.
 
-I think
-write
-
-
+In my opinion, I think it should not create duplicate pipeline codes if I can
+write with dynamic input parameters on the one template pipeline that just change
+the input parameters per use-case instead.
+This way I can handle a lot of logical pipelines in our orgs with only metadata
+configuration. It called **Metadata Driven Data Pipeline**.
 
 Next, we should get some monitoring tools for manage logging that return from
 pipeline running. Because it not show us what is a use-case that running data
@@ -70,7 +76,16 @@ pipeline.
 pip install ddeutil-workflow
 ```
 
-This project need `ddeutil-io
+This project need `ddeutil-io` extension namespace packages. If you want to install
+this package with application add-ons, you should add `app` in installation;
+
+```shell
+pip install ddeutil-workflow[app]
+```
+
+```shell
+pip install ddeutil-workflow[api]
+```
 
 ## Getting Started
 
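
The `app` and `api` extras introduced above keep the scheduler and REST dependencies optional at install time. A minimal sketch of guarding on the `api` extra before importing the new API module; `has_api_extra` is a hypothetical helper for illustration, not part of ddeutil-workflow:

```python
# Hypothetical guard, not shipped by ddeutil-workflow: check that the deps
# behind the "api" extra are importable before touching the API module.
from importlib.util import find_spec

def has_api_extra() -> bool:
    # fastapi, apscheduler, and croniter are the packages that 0.0.7
    # declares under `extra == "api"` in its PKG-INFO.
    return all(
        find_spec(mod) is not None
        for mod in ("fastapi", "apscheduler", "croniter")
    )

if has_api_extra():
    from ddeutil.workflow.api import app  # safe to import the FastAPI app
```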
@@ -87,38 +102,42 @@ will passing parameters and catching the output for re-use it to next step.
 > dynamic registries instead of main features because it have a lot of maintain
 > vendor codes and deps. (I do not have time to handle this features)
 
-
+### On
 
-
+The **On** is schedule object.
 
 ```yaml
-
-  type:
+on_every_5_min:
+  type: on.On
   cron: "*/5 * * * *"
 ```
 
 ```python
-from ddeutil.workflow.on import
+from ddeutil.workflow.on import On
 
-
-assert '*/5 * * * *' == str(
+schedule = On.from_loader(name='on_every_5_min', externals={})
+assert '*/5 * * * *' == str(schedule.cronjob)
 
-
-assert '2022-01-01 00:05:00' f"{
-assert '2022-01-01 00:10:00' f"{
-assert '2022-01-01 00:15:00' f"{
-assert '2022-01-01 00:20:00' f"{
-assert '2022-01-01 00:25:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
+cron_iter = schedule.generate('2022-01-01 00:00:00')
+assert '2022-01-01 00:05:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:10:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:15:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:20:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
 ```
 
----
-
 ### Pipeline
 
+The **Pipeline** object that is the core feature of this project.
+
 ```yaml
 run_py_local:
   type: ddeutil.workflow.pipeline.Pipeline
-
+  on: 'on_every_5_min'
+  params:
+    author-run:
+      type: str
+    run-date:
+      type: datetime
 ```
 
 ```python
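
The `On` example above shows the whole public surface this hunk relies on: `from_loader`, `generate`, and the `.next` property that, per the asserts, advances the cron iterator one tick per access. A short driving loop under those same assumptions (the loop itself is illustrative, not taken from the README):

```python
from ddeutil.workflow.on import On

# Load the schedule declared as `on_every_5_min` in the YAML above.
schedule = On.from_loader(name='on_every_5_min', externals={})

# `generate` seeds a cron iterator; each `.next` access advances it one
# tick from the seed datetime: 00:05, 00:10, 00:15, ...
runner = schedule.generate('2022-01-01 00:00:00')
for _ in range(3):
    print(f"{runner.next:%Y-%m-%d %H:%M:%S}")
```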
@@ -128,27 +147,39 @@ pipe = Pipeline.from_loader(name='run_py_local', externals={})
 pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 ```
 
-
+> [!NOTE]
+> The above parameter use short declarative statement. You can pass a parameter
+> type to the key of a parameter name.
+> ```yaml
+> params:
+>   author-run: str
+>   run-date: datetime
+> ```
+>
+> And for the type, you can remove `ddeutil.workflow` prefix because we can find
+> it by looping search from `WORKFLOW_CORE_REGISTRY` value.
+
+## Usage
 
 This is examples that use workflow file for running common Data Engineering
 use-case.
 
-
+> [!IMPORTANT]
+> I recommend you to use `task` stage for all actions that you want to do with
+> pipeline object.
 
-
+### Python & Bash
 
 ```yaml
 run_py_local:
-  type:
+  type: pipeline.Pipeline
   params:
-    author-run:
-      type: str
-    run-date:
-      type: datetime
+    author-run: str
+    run-date: datetime
   jobs:
     first-job:
       stages:
-        - name: Printing Information
+        - name: "Printing Information"
          id: define-func
          run: |
            x = '${{ params.author-run }}'
@@ -157,7 +188,7 @@ run_py_local:
            def echo(name: str):
              print(f'Hello {name}')
 
-        - name: Run Sequence and use var from Above
+        - name: "Run Sequence and use var from Above"
          vars:
            x: ${{ params.author-run }}
          run: |
@@ -165,16 +196,16 @@ run_py_local:
            # Change x value
            x: int = 1
 
-        - name: Call Function
+        - name: "Call Function"
          vars:
            echo: ${{ stages.define-func.outputs.echo }}
          run: |
            echo('Caller')
    second-job:
      stages:
-        - name: Echo
+        - name: "Echo Bash Script"
          id: shell-echo
-
+          bash: |
            echo "Hello World from Shell"
 ```
 
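
Stages throughout these templates reference values with `${{ ... }}`, and the new test files in this release include `tests/test_utils_param2template.py` and `tests/test__regex.py` for exactly this. A rough sketch of such a resolver, assuming a regex-based caller pattern; the real pattern and implementation live in `ddeutil.workflow.utils` and are not shown in this diff:

```python
import re

# Assumed pattern for `${{ params.author-run }}`-style callers.
CALLER = re.compile(r"\$\{\{\s*(?P<caller>[^}]+?)\s*\}\}")

def param2template(value: str, params: dict) -> str:
    """Resolve every ${{ ... }} reference in `value` against `params`."""
    def _replace(match: re.Match) -> str:
        data: object = {"params": params}
        for key in match.group("caller").split("."):
            data = data[key]  # walk dotted keys, e.g. params -> author-run
        return str(data)
    return CALLER.sub(_replace, value)

# e.g. the stage template shown above:
assert param2template(
    "x = '${{ params.author-run }}'", {"author-run": "Local Workflow"}
) == "x = 'Local Workflow'"
```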
@@ -192,24 +223,20 @@ pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 > Hello World from Shell
 ```
 
-
-
-### Tasks (Extract & Load)
+### Hook (Extract & Load)
 
 ```yaml
 pipe_el_pg_to_lake:
-  type:
+  type: pipeline.Pipeline
   params:
-    run-date:
-      type: datetime
-    author-email:
-      type: str
+    run-date: datetime
+    author-email: str
   jobs:
     extract-load:
       stages:
        - name: "Extract Load from Postgres to Lake"
          id: extract-load
-
+          uses: tasks/postgres-to-delta@polars
          with:
            source:
              conn: conn_postgres_url
@@ -221,15 +248,23 @@ pipe_el_pg_to_lake:
            endpoint: "/${{ params.name }}"
 ```
 
-
+Implement hook:
 
-
+```python
+from ddeutil.workflow.utils import tag
 
-
+@tag('polars', alias='postgres-to-delta')
+def postgres_to_delta(source, sink):
+    return {
+        "source": source, "sink": sink
+    }
+```
+
+### Hook (Transform)
 
 ```yaml
-pipe_hook_mssql_proc:
-  type:
+pipeline_hook_mssql_proc:
+  type: pipeline.Pipeline
   params:
     run_date: datetime
     sp_name: str
@@ -240,7 +275,7 @@ pipe_hook_mssql_proc:
      stages:
        - name: "Transform Data in MS SQL Server"
          id: transform
-
+          uses: tasks/mssql-proc@odbc
          with:
            exec: ${{ params.sp_name }}
            params:
@@ -250,16 +285,57 @@ pipe_hook_mssql_proc:
            target: ${{ params.target_name }}
 ```
 
-
-
-
+Implement hook:
+
+```python
+from ddeutil.workflow.utils import tag
+
+@tag('odbc', alias='mssql-proc')
+def odbc_mssql_procedure(_exec: str, params: dict):
+    return {
+        "exec": _exec, "params": params
+    }
+```
 
 ## Configuration
 
-```
+```bash
+export WORKFLOW_ROOT_PATH=.
+export WORKFLOW_CORE_REGISTRY=ddeutil.workflow,tests.utils
+export WORKFLOW_CORE_REGISTRY_FILTER=ddeutil.workflow.utils
+export WORKFLOW_CORE_PATH_CONF=conf
+export WORKFLOW_CORE_TIMEZONE=Asia/Bangkok
+export WORKFLOW_CORE_DEFAULT_STAGE_ID=true
 
+export WORKFLOW_CORE_MAX_PIPELINE_POKING=4
+export WORKFLOW_CORE_MAX_JOB_PARALLEL=2
 ```
 
-
+Application config:
 
-
+```bash
+export WORKFLOW_APP_DB_URL=postgresql+asyncpg://user:pass@localhost:5432/schedule
+export WORKFLOW_APP_INTERVAL=10
+```
+
+## Deployment
+
+This package able to run as a application service for receive manual trigger
+from the master node via RestAPI or use to be Scheduler background service
+like crontab job but via Python API.
+
+### Schedule Service
+
+```shell
+(venv) $ python src.ddeutil.workflow.app
+```
+
+### API Server
+
+```shell
+(venv) $ uvicorn src.ddeutil.workflow.api:app --host 0.0.0.0 --port 80 --reload
+```
+
+> [!NOTE]
+> If this package already deploy, it able to use
+> `uvicorn ddeutil.workflow.api:app --host 0.0.0.0 --port 80`
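
Both hook examples in this diff register plain functions through `@tag(name, alias=...)` and then address them from YAML as `uses: tasks/<alias>@<tag>`. A rough re-implementation of what the decorator appears to do, for illustration only; the actual `ddeutil.workflow.utils.tag` may differ:

```python
from functools import wraps

def tag(name: str, alias=None):
    """Stamp a function with a tag and alias (illustrative sketch)."""
    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            return func(*args, **kwargs)
        # Metadata a registry scanner could match against a `uses:` string.
        wrapped.tag = name
        wrapped.name = alias or func.__name__
        return wrapped
    return decorator

@tag('polars', alias='postgres-to-delta')
def postgres_to_delta(source, sink):
    return {"source": source, "sink": sink}

# A stage with `uses: tasks/postgres-to-delta@polars` would be matched by
# alias ("postgres-to-delta") plus tag ("polars") while scanning the modules
# listed in WORKFLOW_CORE_REGISTRY.
assert (postgres_to_delta.tag, postgres_to_delta.name) == ('polars', 'postgres-to-delta')
```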

{ddeutil_workflow-0.0.5 → ddeutil_workflow-0.0.7}/README.md

@@ -1,31 +1,32 @@
-#
+# Workflow
 
 [](https://github.com/ddeutils/ddeutil-workflow/actions/workflows/tests.yml)
 [](https://pypi.org/project/ddeutil-workflow/)
 [](https://github.com/ddeutils/ddeutil-workflow)
 [](https://github.com/ddeutils/ddeutil-workflow/blob/main/LICENSE)
 
-
 **Table of Contents**:
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-  - [
-  - [
-
-  - [
-  - [
-  - [
-  - [Hooks (T)](#tasks-transform)
+  - [On](#on)
+  - [Pipeline](#pipeline)
+- [Usage](#usage)
+  - [Python & Bash](#python--bash)
+  - [Hook (EL)](#hook-extract--load)
+  - [Hook (T)](#hook-transform)
 - [Configuration](#configuration)
+- [Deployment](#deployment)
 
-This **
-driven pipeline that able to **ETL, T, EL, or
+This **Workflow** objects was created for easy to make a simple metadata
+driven for data pipeline orchestration that able to use for **ETL, T, EL, or
+ELT** by a `.yaml` file template.
 
-I think
-write
-
-
+In my opinion, I think it should not create duplicate pipeline codes if I can
+write with dynamic input parameters on the one template pipeline that just change
+the input parameters per use-case instead.
+This way I can handle a lot of logical pipelines in our orgs with only metadata
+configuration. It called **Metadata Driven Data Pipeline**.
 
 Next, we should get some monitoring tools for manage logging that return from
 pipeline running. Because it not show us what is a use-case that running data
@@ -42,7 +43,16 @@ pipeline.
 pip install ddeutil-workflow
 ```
 
-This project need `ddeutil-io
+This project need `ddeutil-io` extension namespace packages. If you want to install
+this package with application add-ons, you should add `app` in installation;
+
+```shell
+pip install ddeutil-workflow[app]
+```
+
+```shell
+pip install ddeutil-workflow[api]
+```
 
 ## Getting Started
 
@@ -59,38 +69,42 @@ will passing parameters and catching the output for re-use it to next step.
 > dynamic registries instead of main features because it have a lot of maintain
 > vendor codes and deps. (I do not have time to handle this features)
 
-
+### On
 
-
+The **On** is schedule object.
 
 ```yaml
-
-  type:
+on_every_5_min:
+  type: on.On
   cron: "*/5 * * * *"
 ```
 
 ```python
-from ddeutil.workflow.on import
+from ddeutil.workflow.on import On
 
-
-assert '*/5 * * * *' == str(
+schedule = On.from_loader(name='on_every_5_min', externals={})
+assert '*/5 * * * *' == str(schedule.cronjob)
 
-
-assert '2022-01-01 00:05:00' f"{
-assert '2022-01-01 00:10:00' f"{
-assert '2022-01-01 00:15:00' f"{
-assert '2022-01-01 00:20:00' f"{
-assert '2022-01-01 00:25:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
+cron_iter = schedule.generate('2022-01-01 00:00:00')
+assert '2022-01-01 00:05:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:10:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:15:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:20:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
 ```
 
----
-
 ### Pipeline
 
+The **Pipeline** object that is the core feature of this project.
+
 ```yaml
 run_py_local:
   type: ddeutil.workflow.pipeline.Pipeline
-
+  on: 'on_every_5_min'
+  params:
+    author-run:
+      type: str
+    run-date:
+      type: datetime
 ```
 
 ```python
@@ -100,27 +114,39 @@ pipe = Pipeline.from_loader(name='run_py_local', externals={})
 pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 ```
 
-
+> [!NOTE]
+> The above parameter use short declarative statement. You can pass a parameter
+> type to the key of a parameter name.
+> ```yaml
+> params:
+>   author-run: str
+>   run-date: datetime
+> ```
+>
+> And for the type, you can remove `ddeutil.workflow` prefix because we can find
+> it by looping search from `WORKFLOW_CORE_REGISTRY` value.
+
+## Usage
 
 This is examples that use workflow file for running common Data Engineering
 use-case.
 
-
+> [!IMPORTANT]
+> I recommend you to use `task` stage for all actions that you want to do with
+> pipeline object.
 
-
+### Python & Bash
 
 ```yaml
 run_py_local:
-  type:
+  type: pipeline.Pipeline
   params:
-    author-run:
-      type: str
-    run-date:
-      type: datetime
+    author-run: str
+    run-date: datetime
   jobs:
     first-job:
       stages:
-        - name: Printing Information
+        - name: "Printing Information"
          id: define-func
          run: |
            x = '${{ params.author-run }}'
@@ -129,7 +155,7 @@ run_py_local:
            def echo(name: str):
              print(f'Hello {name}')
 
-        - name: Run Sequence and use var from Above
+        - name: "Run Sequence and use var from Above"
          vars:
            x: ${{ params.author-run }}
          run: |
@@ -137,16 +163,16 @@ run_py_local:
            # Change x value
            x: int = 1
 
-        - name: Call Function
+        - name: "Call Function"
          vars:
            echo: ${{ stages.define-func.outputs.echo }}
          run: |
            echo('Caller')
    second-job:
      stages:
-        - name: Echo
+        - name: "Echo Bash Script"
          id: shell-echo
-
+          bash: |
            echo "Hello World from Shell"
 ```
 
@@ -164,24 +190,20 @@ pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 > Hello World from Shell
 ```
 
-
-
-### Tasks (Extract & Load)
+### Hook (Extract & Load)
 
 ```yaml
 pipe_el_pg_to_lake:
-  type:
+  type: pipeline.Pipeline
   params:
-    run-date:
-      type: datetime
-    author-email:
-      type: str
+    run-date: datetime
+    author-email: str
   jobs:
     extract-load:
       stages:
        - name: "Extract Load from Postgres to Lake"
          id: extract-load
-
+          uses: tasks/postgres-to-delta@polars
          with:
            source:
              conn: conn_postgres_url
@@ -193,15 +215,23 @@ pipe_el_pg_to_lake:
            endpoint: "/${{ params.name }}"
 ```
 
-
+Implement hook:
 
-
+```python
+from ddeutil.workflow.utils import tag
 
-
+@tag('polars', alias='postgres-to-delta')
+def postgres_to_delta(source, sink):
+    return {
+        "source": source, "sink": sink
+    }
+```
+
+### Hook (Transform)
 
 ```yaml
-pipe_hook_mssql_proc:
-  type:
+pipeline_hook_mssql_proc:
+  type: pipeline.Pipeline
   params:
     run_date: datetime
     sp_name: str
@@ -212,7 +242,7 @@ pipe_hook_mssql_proc:
      stages:
        - name: "Transform Data in MS SQL Server"
          id: transform
-
+          uses: tasks/mssql-proc@odbc
          with:
            exec: ${{ params.sp_name }}
            params:
@@ -222,16 +252,57 @@ pipe_hook_mssql_proc:
            target: ${{ params.target_name }}
 ```
 
-
-
-
+Implement hook:
+
+```python
+from ddeutil.workflow.utils import tag
+
+@tag('odbc', alias='mssql-proc')
+def odbc_mssql_procedure(_exec: str, params: dict):
+    return {
+        "exec": _exec, "params": params
+    }
+```
 
 ## Configuration
 
-```
+```bash
+export WORKFLOW_ROOT_PATH=.
+export WORKFLOW_CORE_REGISTRY=ddeutil.workflow,tests.utils
+export WORKFLOW_CORE_REGISTRY_FILTER=ddeutil.workflow.utils
+export WORKFLOW_CORE_PATH_CONF=conf
+export WORKFLOW_CORE_TIMEZONE=Asia/Bangkok
+export WORKFLOW_CORE_DEFAULT_STAGE_ID=true
 
+export WORKFLOW_CORE_MAX_PIPELINE_POKING=4
+export WORKFLOW_CORE_MAX_JOB_PARALLEL=2
 ```
 
-
+Application config:
 
-
+```bash
+export WORKFLOW_APP_DB_URL=postgresql+asyncpg://user:pass@localhost:5432/schedule
+export WORKFLOW_APP_INTERVAL=10
+```
+
+## Deployment
+
+This package able to run as a application service for receive manual trigger
+from the master node via RestAPI or use to be Scheduler background service
+like crontab job but via Python API.
+
+### Schedule Service
+
+```shell
+(venv) $ python src.ddeutil.workflow.app
+```
+
+### API Server
+
+```shell
+(venv) $ uvicorn src.ddeutil.workflow.api:app --host 0.0.0.0 --port 80 --reload
+```
+
+> [!NOTE]
+> If this package already deploy, it able to use
+> `uvicorn ddeutil.workflow.api:app --host 0.0.0.0 --port 80`