ddeutil-workflow 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddeutil_workflow-0.0.4/src/ddeutil_workflow.egg-info → ddeutil_workflow-0.0.6}/PKG-INFO +118 -90
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/README.md +111 -78
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/pyproject.toml +8 -18
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/__about__.py +1 -0
- {ddeutil_workflow-0.0.4/src/ddeutil/workflow/tasks → ddeutil_workflow-0.0.6/src/ddeutil/workflow}/__init__.py +4 -1
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/__regex.py → ddeutil_workflow-0.0.6/src/ddeutil/workflow/__types.py +13 -3
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/src/ddeutil/workflow/exceptions.py +13 -1
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/loader.py +80 -0
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/on.py +195 -0
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/pipeline.py +497 -0
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/__schedule.py → ddeutil_workflow-0.0.6/src/ddeutil/workflow/scheduler.py +222 -176
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/stage.py +402 -0
- ddeutil_workflow-0.0.6/src/ddeutil/workflow/utils.py +378 -0
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6/src/ddeutil_workflow.egg-info}/PKG-INFO +118 -90
- ddeutil_workflow-0.0.6/src/ddeutil_workflow.egg-info/SOURCES.txt +34 -0
- ddeutil_workflow-0.0.6/src/ddeutil_workflow.egg-info/requires.txt +7 -0
- ddeutil_workflow-0.0.6/tests/test__conf_exist.py +11 -0
- ddeutil_workflow-0.0.4/tests/test_base_local_and_global.py → ddeutil_workflow-0.0.6/tests/test__local_and_global.py +4 -4
- ddeutil_workflow-0.0.4/tests/test_base_regex.py → ddeutil_workflow-0.0.6/tests/test__regex.py +11 -3
- ddeutil_workflow-0.0.4/tests/test_schedule.py → ddeutil_workflow-0.0.6/tests/test_on.py +22 -4
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/tests/test_pipeline.py +3 -9
- ddeutil_workflow-0.0.6/tests/test_pipeline_desc.py +11 -0
- ddeutil_workflow-0.0.6/tests/test_pipeline_if.py +28 -0
- ddeutil_workflow-0.0.6/tests/test_pipeline_matrix.py +87 -0
- ddeutil_workflow-0.0.6/tests/test_pipeline_on.py +12 -0
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/tests/test_pipeline_params.py +1 -1
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/tests/test_pipeline_run.py +44 -33
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/tests/test_pipeline_task.py +11 -13
- ddeutil_workflow-0.0.6/tests/test_scheduler.py +118 -0
- ddeutil_workflow-0.0.6/tests/test_stage_trigger.py +10 -0
- ddeutil_workflow-0.0.6/tests/test_utils.py +8 -0
- ddeutil_workflow-0.0.6/tests/test_utils_result.py +22 -0
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/__about__.py +0 -1
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/__init__.py +0 -0
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/__types.py +0 -12
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/conn.py +0 -240
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/loader.py +0 -174
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/pipeline.py +0 -517
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/schedule.py +0 -82
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/tasks/_pandas.py +0 -54
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/tasks/_polars.py +0 -92
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/utils.py +0 -187
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/__dataset.py +0 -127
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/__dict.py +0 -333
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/__init__.py +0 -0
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/aws.py +0 -185
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/az.py +0 -0
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/minio.py +0 -11
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/pd.py +0 -13
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/pg.py +0 -11
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/pl.py +0 -172
- ddeutil_workflow-0.0.4/src/ddeutil/workflow/vendors/sftp.py +0 -209
- ddeutil_workflow-0.0.4/src/ddeutil_workflow.egg-info/SOURCES.txt +0 -44
- ddeutil_workflow-0.0.4/src/ddeutil_workflow.egg-info/requires.txt +0 -12
- ddeutil_workflow-0.0.4/tests/test_base_data.py +0 -14
- ddeutil_workflow-0.0.4/tests/test_conn.py +0 -93
- ddeutil_workflow-0.0.4/tests/test_dataset.py +0 -90
- ddeutil_workflow-0.0.4/tests/test_loader.py +0 -6
- ddeutil_workflow-0.0.4/tests/test_pipeline_matrix.py +0 -29
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/LICENSE +0 -0
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/setup.cfg +0 -0
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/src/ddeutil_workflow.egg-info/dependency_links.txt +0 -0
- {ddeutil_workflow-0.0.4 → ddeutil_workflow-0.0.6}/src/ddeutil_workflow.egg-info/top_level.txt +0 -0
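The file list above amounts to a module re-layout: the `vendors/` connection and dataset code is dropped, `schedule.py` is replaced by `on.py`, `vendors/__schedule.py` becomes `scheduler.py`, and new `loader.py`, `stage.py`, and `utils.py` modules appear. A minimal, illustrative sketch of the import paths this implies for 0.0.6 (names taken from the README examples in the diff below; not an exhaustive API listing):

```python
# Illustrative sketch only: import paths implied by the 0.0.6 file layout and the
# README examples shown in the diff below.
from ddeutil.workflow.on import On              # schedule trigger object (new on.py)
from ddeutil.workflow.pipeline import Pipeline  # core pipeline object (pipeline.py)

# Removed in this release (0.0.4 API, per the old README):
#   Schedule.from_loader(...)          -> replaced by On.from_loader(...)
#   ddeutil.workflow.vendors.pg, etc.  -> vendor connection/dataset modules dropped
```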
--- ddeutil_workflow-0.0.4/src/ddeutil_workflow.egg-info/PKG-INFO
+++ ddeutil_workflow-0.0.6/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddeutil-workflow
-Version: 0.0.4
+Version: 0.0.6
 Summary: Data Developer & Engineer Workflow Utility Objects
 Author-email: ddeutils <korawich.anu@gmail.com>
 License: MIT
@@ -9,7 +9,7 @@ Project-URL: Source Code, https://github.com/ddeutils/ddeutil-workflow/
 Keywords: data,workflow,utility,pipeline
 Classifier: Topic :: Utilities
 Classifier: Natural Language :: English
-Classifier: Development Status ::
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
@@ -23,35 +23,33 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: fmtutil
 Requires-Dist: ddeutil-io
-Requires-Dist: python-dotenv
-Provides-Extra:
-Requires-Dist:
-Requires-Dist:
-
-
-Requires-Dist: fsspec==2024.5.0; extra == "test"
-Requires-Dist: polars==0.20.31; extra == "test"
-Requires-Dist: pyarrow==16.1.0; extra == "test"
-
-# Data Utility: _Workflow_
+Requires-Dist: python-dotenv==1.0.1
+Provides-Extra: app
+Requires-Dist: fastapi==0.112.0; extra == "app"
+Requires-Dist: apscheduler[sqlalchemy]==3.10.4; extra == "app"
+
+# Workflow
 
 [](https://github.com/ddeutils/ddeutil-workflow/actions/workflows/tests.yml)
 [](https://pypi.org/project/ddeutil-workflow/)
 [](https://github.com/ddeutils/ddeutil-workflow)
+[](https://github.com/ddeutils/ddeutil-workflow/blob/main/LICENSE)
 
 **Table of Contents**:
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-
-  - [
-  - [
-  - [
-  - [Python](#python)
-  - [
-  - [
-
-
+- [Core Features](#core-features)
+  - [On](#on)
+  - [Pipeline](#pipeline)
+- [Usage](#usage)
+  - [Python & Bash](#python--bash)
+  - [Hook (EL)](#hook-extract--load)
+  - [Hook (T)](#hook-transform)
+- [Configuration](#configuration)
+- [Deployment](#deployment)
+
+This **Workflow** objects was created for easy to make a simple metadata
 driven pipeline that able to **ETL, T, EL, or ELT** by `.yaml` file.
 
 I think we should not create the multiple pipeline per use-case if we able to
@@ -74,13 +72,18 @@ pipeline.
 pip install ddeutil-workflow
 ```
 
-This project need `ddeutil-io
+This project need `ddeutil-io` extension namespace packages. If you want to install
+this package with application add-ons, you should add `app` in installation;
+
+```shell
+pip install ddeutil-workflow[app]
+```
 
 ## Getting Started
 
 The first step, you should start create the connections and datasets for In and
 Out of you data that want to use in pipeline of workflow. Some of this component
-is similar component of the **Airflow** because I like it concepts.
+is similar component of the **Airflow** because I like it orchestration concepts.
 
 The main feature of this project is the `Pipeline` object that can call any
 registries function. The pipeline can handle everything that you want to do, it
@@ -91,88 +94,84 @@ will passing parameters and catching the output for re-use it to next step.
 > dynamic registries instead of main features because it have a lot of maintain
 > vendor codes and deps. (I do not have time to handle this features)
 
-###
+### On
 
-The
+The **On** is schedule object.
 
 ```yaml
-
-type:
-
+on_every_5_min:
+  type: on.On
+  cron: "*/5 * * * *"
 ```
 
 ```python
-from ddeutil.workflow.
+from ddeutil.workflow.on import On
 
-
-assert
-```
+schedule = On.from_loader(name='on_every_5_min', externals={})
+assert '*/5 * * * *' == str(schedule.cronjob)
 
-
-
-
-
-
-
-```yaml
-ds_postgres_customer_tbl:
-  type: dataset.PostgresTbl
-  conn: 'conn_postgres_data'
-  features:
-    id: serial primary key
-    name: varchar( 100 ) not null
+cron_iter = schedule.generate('2022-01-01 00:00:00')
+assert '2022-01-01 00:05:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:10:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:15:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:20:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
 ```
 
-
-from ddeutil.workflow.vendors.pg import PostgresTbl
-
-dataset = PostgresTbl.from_loader(name='ds_postgres_customer_tbl', externals={})
-assert dataset.exists()
-```
+### Pipeline
 
-
+The **Pipeline** object that is the core feature of this project.
 
 ```yaml
-
-type:
-
+run_py_local:
+  type: ddeutil.workflow.pipeline.Pipeline
+  on: 'on_every_5_min'
+  params:
+    author-run:
+      type: str
+    run-date:
+      type: datetime
 ```
 
 ```python
-from ddeutil.workflow.
-
-scdl = Schedule.from_loader(name='schd_for_node', externals={})
-assert '*/5 * * * *' == str(scdl.cronjob)
+from ddeutil.workflow.pipeline import Pipeline
 
-
-
-assert '2022-01-01 00:10:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:15:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:20:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:25:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
+pipe = Pipeline.from_loader(name='run_py_local', externals={})
+pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 ```
 
-
+> [!NOTE]
+> The above parameter use short declarative statement. You can pass a parameter
+> type to the key of a parameter name.
+> ```yaml
+>   params:
+>     author-run: str
+>     run-date: datetime
+> ```
+>
+> And for the type, you can remove `ddeutil.workflow` prefix because we can find
+> it by looping search from `WORKFLOW_CORE_REGISTRY` value.
+
+## Usage
 
 This is examples that use workflow file for running common Data Engineering
 use-case.
 
-
+> [!IMPORTANT]
+> I recommend you to use `task` stage for all actions that you want to do with
+> pipeline object.
 
-
+### Python & Bash
 
 ```yaml
 run_py_local:
-  type:
+  type: pipeline.Pipeline
   params:
-    author-run:
-
-    run-date:
-      type: datetime
+    author-run: str
+    run-date: datetime
   jobs:
     first-job:
      stages:
-        - name: Printing Information
+        - name: "Printing Information"
          id: define-func
          run: |
            x = '${{ params.author-run }}'
@@ -181,7 +180,7 @@ run_py_local:
            def echo(name: str):
              print(f'Hello {name}')
 
-        - name: Run Sequence and use var from Above
+        - name: "Run Sequence and use var from Above"
          vars:
            x: ${{ params.author-run }}
          run: |
@@ -189,11 +188,17 @@ run_py_local:
            # Change x value
            x: int = 1
 
-        - name: Call Function
+        - name: "Call Function"
          vars:
            echo: ${{ stages.define-func.outputs.echo }}
          run: |
            echo('Caller')
+    second-job:
+      stages:
+        - name: "Echo Bash Script"
+          id: shell-echo
+          bash: |
+            echo "Hello World from Shell"
 ```
 
 ```python
@@ -207,24 +212,23 @@ pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 > Hello Local Workflow
 > Receive x from above with Local Workflow
 > Hello Caller
+> Hello World from Shell
 ```
 
-###
+### Hook (Extract & Load)
 
 ```yaml
 pipe_el_pg_to_lake:
-  type:
+  type: pipeline.Pipeline
   params:
-    run-date:
-
-    author-email:
-      type: str
+    run-date: datetime
+    author-email: str
  jobs:
    extract-load:
      stages:
        - name: "Extract Load from Postgres to Lake"
          id: extract-load
-
+          uses: tasks/postgres-to-delta@polars
          with:
            source:
              conn: conn_postgres_url
@@ -236,11 +240,11 @@ pipe_el_pg_to_lake:
            endpoint: "/${{ params.name }}"
 ```
 
-###
+### Hook (Transform)
 
 ```yaml
-pipe_hook_mssql_proc:
-  type:
+pipeline_hook_mssql_proc:
+  type: pipeline.Pipeline
  params:
    run_date: datetime
    sp_name: str
@@ -251,7 +255,7 @@ pipe_hook_mssql_proc:
      stages:
        - name: "Transform Data in MS SQL Server"
          id: transform
-
+          uses: tasks/mssql-proc@odbc
          with:
            exec: ${{ params.sp_name }}
            params:
@@ -261,6 +265,30 @@ pipe_hook_mssql_proc:
            target: ${{ params.target_name }}
 ```
 
-##
+## Configuration
 
-
+```bash
+export WORKFLOW_ROOT_PATH=.
+export WORKFLOW_CORE_REGISTRY=ddeutil.workflow,tests.utils
+export WORKFLOW_CORE_PATH_CONF=conf
+```
+
+Application config:
+
+```bash
+export WORKFLOW_APP_DB_URL=postgresql+asyncpg://user:pass@localhost:5432/schedule
+export WORKFLOW_APP_INTERVAL=10
+```
+
+## Deployment
+
+This package able to run as a application service for receive manual trigger
+from the master node via RestAPI.
+
+> [!WARNING]
+> This feature do not start yet because I still research and find the best tool
+> to use it provision an app service, like `starlette`, `fastapi`, `apscheduler`.
+
+```shell
+(venv) $ workflow start -p 7070
+```
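Combining the two core objects documented in the new README content above, here is a short end-to-end sketch. It assumes the `on_every_5_min` and `run_py_local` YAML entries shown in the diff exist under the configured `conf` path, so treat it as illustrative rather than a verbatim recipe:

```python
from ddeutil.workflow.on import On
from ddeutil.workflow.pipeline import Pipeline

# Load the schedule entry defined in the YAML config shown in the README diff.
schedule = On.from_loader(name="on_every_5_min", externals={})
assert str(schedule.cronjob) == "*/5 * * * *"

# Walk the next few trigger datetimes from a fixed start point.
cron_iter = schedule.generate("2022-01-01 00:00:00")
print(f"{cron_iter.next:%Y-%m-%d %H:%M:%S}")  # 2022-01-01 00:05:00
print(f"{cron_iter.next:%Y-%m-%d %H:%M:%S}")  # 2022-01-01 00:10:00

# Load and run the pipeline entry; params follow the `params:` schema in the YAML.
pipe = Pipeline.from_loader(name="run_py_local", externals={})
pipe.execute(params={"author-run": "Local Workflow", "run-date": "2024-01-01"})
```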
--- ddeutil_workflow-0.0.4/README.md
+++ ddeutil_workflow-0.0.6/README.md
@@ -1,22 +1,25 @@
-#
+# Workflow
 
 [](https://github.com/ddeutils/ddeutil-workflow/actions/workflows/tests.yml)
 [](https://pypi.org/project/ddeutil-workflow/)
 [](https://github.com/ddeutils/ddeutil-workflow)
+[](https://github.com/ddeutils/ddeutil-workflow/blob/main/LICENSE)
 
 **Table of Contents**:
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-
-  - [
-  - [
-  - [
-  - [Python](#python)
-  - [
-  - [
-
-
+- [Core Features](#core-features)
+  - [On](#on)
+  - [Pipeline](#pipeline)
+- [Usage](#usage)
+  - [Python & Bash](#python--bash)
+  - [Hook (EL)](#hook-extract--load)
+  - [Hook (T)](#hook-transform)
+- [Configuration](#configuration)
+- [Deployment](#deployment)
+
+This **Workflow** objects was created for easy to make a simple metadata
 driven pipeline that able to **ETL, T, EL, or ELT** by `.yaml` file.
 
 I think we should not create the multiple pipeline per use-case if we able to
@@ -39,13 +42,18 @@ pipeline.
 pip install ddeutil-workflow
 ```
 
-This project need `ddeutil-io
+This project need `ddeutil-io` extension namespace packages. If you want to install
+this package with application add-ons, you should add `app` in installation;
+
+```shell
+pip install ddeutil-workflow[app]
+```
 
 ## Getting Started
 
 The first step, you should start create the connections and datasets for In and
 Out of you data that want to use in pipeline of workflow. Some of this component
-is similar component of the **Airflow** because I like it concepts.
+is similar component of the **Airflow** because I like it orchestration concepts.
 
 The main feature of this project is the `Pipeline` object that can call any
 registries function. The pipeline can handle everything that you want to do, it
@@ -56,88 +64,84 @@ will passing parameters and catching the output for re-use it to next step.
 > dynamic registries instead of main features because it have a lot of maintain
 > vendor codes and deps. (I do not have time to handle this features)
 
-###
+### On
 
-The
+The **On** is schedule object.
 
 ```yaml
-
-type:
-
+on_every_5_min:
+  type: on.On
+  cron: "*/5 * * * *"
 ```
 
 ```python
-from ddeutil.workflow.
-
-conn = Conn.from_loader(name='conn_postgres_data', externals={})
-assert conn.ping()
-```
+from ddeutil.workflow.on import On
 
-
+schedule = On.from_loader(name='on_every_5_min', externals={})
+assert '*/5 * * * *' == str(schedule.cronjob)
 
-
-
-
-
-
-ds_postgres_customer_tbl:
-  type: dataset.PostgresTbl
-  conn: 'conn_postgres_data'
-  features:
-    id: serial primary key
-    name: varchar( 100 ) not null
+cron_iter = schedule.generate('2022-01-01 00:00:00')
+assert '2022-01-01 00:05:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:10:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:15:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
+assert '2022-01-01 00:20:00' f"{cron_iter.next:%Y-%m-%d %H:%M:%S}"
 ```
 
-
-from ddeutil.workflow.vendors.pg import PostgresTbl
-
-dataset = PostgresTbl.from_loader(name='ds_postgres_customer_tbl', externals={})
-assert dataset.exists()
-```
+### Pipeline
 
-
+The **Pipeline** object that is the core feature of this project.
 
 ```yaml
-
-type:
-
+run_py_local:
+  type: ddeutil.workflow.pipeline.Pipeline
+  on: 'on_every_5_min'
+  params:
+    author-run:
+      type: str
+    run-date:
+      type: datetime
 ```
 
 ```python
-from ddeutil.workflow.
-
-scdl = Schedule.from_loader(name='schd_for_node', externals={})
-assert '*/5 * * * *' == str(scdl.cronjob)
+from ddeutil.workflow.pipeline import Pipeline
 
-
-
-assert '2022-01-01 00:10:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:15:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:20:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
-assert '2022-01-01 00:25:00' f"{cron_iterate.next:%Y-%m-%d %H:%M:%S}"
+pipe = Pipeline.from_loader(name='run_py_local', externals={})
+pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 ```
 
-
+> [!NOTE]
+> The above parameter use short declarative statement. You can pass a parameter
+> type to the key of a parameter name.
+> ```yaml
+>   params:
+>     author-run: str
+>     run-date: datetime
+> ```
+>
+> And for the type, you can remove `ddeutil.workflow` prefix because we can find
+> it by looping search from `WORKFLOW_CORE_REGISTRY` value.
+
+## Usage
 
 This is examples that use workflow file for running common Data Engineering
 use-case.
 
-
+> [!IMPORTANT]
+> I recommend you to use `task` stage for all actions that you want to do with
+> pipeline object.
 
-
+### Python & Bash
 
 ```yaml
 run_py_local:
-  type:
+  type: pipeline.Pipeline
   params:
-    author-run:
-
-    run-date:
-      type: datetime
+    author-run: str
+    run-date: datetime
  jobs:
    first-job:
      stages:
-        - name: Printing Information
+        - name: "Printing Information"
          id: define-func
          run: |
            x = '${{ params.author-run }}'
@@ -146,7 +150,7 @@ run_py_local:
            def echo(name: str):
              print(f'Hello {name}')
 
-        - name: Run Sequence and use var from Above
+        - name: "Run Sequence and use var from Above"
          vars:
            x: ${{ params.author-run }}
          run: |
@@ -154,11 +158,17 @@ run_py_local:
            # Change x value
            x: int = 1
 
-        - name: Call Function
+        - name: "Call Function"
          vars:
            echo: ${{ stages.define-func.outputs.echo }}
          run: |
            echo('Caller')
+    second-job:
+      stages:
+        - name: "Echo Bash Script"
+          id: shell-echo
+          bash: |
+            echo "Hello World from Shell"
 ```
 
 ```python
@@ -172,24 +182,23 @@ pipe.execute(params={'author-run': 'Local Workflow', 'run-date': '2024-01-01'})
 > Hello Local Workflow
 > Receive x from above with Local Workflow
 > Hello Caller
+> Hello World from Shell
 ```
 
-###
+### Hook (Extract & Load)
 
 ```yaml
 pipe_el_pg_to_lake:
-  type:
+  type: pipeline.Pipeline
   params:
-    run-date:
-
-    author-email:
-      type: str
+    run-date: datetime
+    author-email: str
  jobs:
    extract-load:
      stages:
        - name: "Extract Load from Postgres to Lake"
          id: extract-load
-
+          uses: tasks/postgres-to-delta@polars
          with:
            source:
              conn: conn_postgres_url
@@ -201,11 +210,11 @@ pipe_el_pg_to_lake:
            endpoint: "/${{ params.name }}"
 ```
 
-###
+### Hook (Transform)
 
 ```yaml
-pipe_hook_mssql_proc:
-  type:
+pipeline_hook_mssql_proc:
+  type: pipeline.Pipeline
  params:
    run_date: datetime
    sp_name: str
@@ -216,7 +225,7 @@ pipe_hook_mssql_proc:
      stages:
        - name: "Transform Data in MS SQL Server"
          id: transform
-
+          uses: tasks/mssql-proc@odbc
          with:
            exec: ${{ params.sp_name }}
            params:
@@ -226,6 +235,30 @@ pipe_hook_mssql_proc:
            target: ${{ params.target_name }}
 ```
 
-##
+## Configuration
 
-
+```bash
+export WORKFLOW_ROOT_PATH=.
+export WORKFLOW_CORE_REGISTRY=ddeutil.workflow,tests.utils
+export WORKFLOW_CORE_PATH_CONF=conf
+```
+
+Application config:
+
+```bash
+export WORKFLOW_APP_DB_URL=postgresql+asyncpg://user:pass@localhost:5432/schedule
+export WORKFLOW_APP_INTERVAL=10
+```
+
+## Deployment
+
+This package able to run as a application service for receive manual trigger
+from the master node via RestAPI.
+
+> [!WARNING]
+> This feature do not start yet because I still research and find the best tool
+> to use it provision an app service, like `starlette`, `fastapi`, `apscheduler`.
+
+```shell
+(venv) $ workflow start -p 7070
+```
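The Configuration section added in 0.0.6 is driven purely by environment variables. Below is a hedged sketch of setting them from Python before loading a pipeline; it assumes the loader reads `WORKFLOW_ROOT_PATH`, `WORKFLOW_CORE_PATH_CONF`, and `WORKFLOW_CORE_REGISTRY` at load time, which this diff implies but does not spell out:

```python
import os

# Assumption: these variables are consulted when a config entry is loaded.
os.environ["WORKFLOW_ROOT_PATH"] = "."          # project root that holds the conf/ folder
os.environ["WORKFLOW_CORE_PATH_CONF"] = "conf"  # directory containing the YAML entries
os.environ["WORKFLOW_CORE_REGISTRY"] = "ddeutil.workflow,tests.utils"  # hook search modules

from ddeutil.workflow.pipeline import Pipeline

# Load and run the `run_py_local` pipeline entry from the configured conf path.
pipe = Pipeline.from_loader(name="run_py_local", externals={})
pipe.execute(params={"author-run": "Local Workflow", "run-date": "2024-01-01"})
```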