ddeutil-workflow 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/conn.py +31 -29
- ddeutil/workflow/dataset.py +1 -5
- ddeutil/workflow/exceptions.py +0 -50
- ddeutil/workflow/loader.py +26 -190
- ddeutil/workflow/pipeline.py +201 -83
- ddeutil/workflow/schedule.py +3 -8
- ddeutil/workflow/tasks/__init__.py +6 -10
- ddeutil/workflow/tasks/_pandas.py +54 -0
- ddeutil/workflow/tasks/_polars.py +45 -2
- ddeutil/workflow/utils.py +65 -0
- {ddeutil_workflow-0.0.1.dist-info → ddeutil_workflow-0.0.2.dist-info}/METADATA +26 -15
- ddeutil_workflow-0.0.2.dist-info/RECORD +25 -0
- ddeutil/workflow/hooks/__init__.py +0 -9
- ddeutil/workflow/hooks/_postgres.py +0 -2
- ddeutil/workflow/utils/__init__.py +0 -0
- ddeutil/workflow/utils/receive.py +0 -33
- ddeutil/workflow/utils/selection.py +0 -2
- ddeutil_workflow-0.0.1.dist-info/RECORD +0 -28
- {ddeutil_workflow-0.0.1.dist-info → ddeutil_workflow-0.0.2.dist-info}/LICENSE +0 -0
- {ddeutil_workflow-0.0.1.dist-info → ddeutil_workflow-0.0.2.dist-info}/WHEEL +0 -0
- {ddeutil_workflow-0.0.1.dist-info → ddeutil_workflow-0.0.2.dist-info}/top_level.txt +0 -0
ddeutil/workflow/pipeline.py
CHANGED
@@ -6,40 +6,58 @@
 from __future__ import annotations
 
 import inspect
+import logging
 import subprocess
+from abc import ABC, abstractmethod
+from datetime import date, datetime
 from inspect import Parameter
 from subprocess import CompletedProcess
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Literal, Optional, Union
 
+from ddeutil.io.models.lineage import dt_now
 from pydantic import BaseModel, Field
+from pydantic.functional_validators import model_validator
 from typing_extensions import Self
 
 from .__regex import RegexConf
 from .__types import DictData
-from .exceptions import
-from .loader import Loader,
+from .exceptions import PyException, TaskException
+from .loader import Loader, map_params
+from .utils import make_registry
 
 
-class
+class BaseStage(BaseModel, ABC):
+    """Base Stage Model."""
 
+    id: Optional[str] = None
+    name: str
 
-
+    @abstractmethod
+    def execute(self, params: DictData) -> DictData:
+        raise NotImplementedError("Stage should implement ``execute`` method.")
 
+    def set_outputs(self, rs: DictData, params: DictData) -> DictData:
+        """Set outputs to params"""
+        if self.id is None:
+            return params
 
-
+        if "stages" not in params:
+            params["stages"] = {}
 
+        params["stages"][self.id] = {"outputs": rs}
+        return params
 
-class EmptyStage(BaseModel):
-    """Empty stage that is doing nothing and logging the name of stage only."""
 
-
-    name
+class EmptyStage(BaseStage):
+    """Empty stage that is doing nothing and logging the name of stage only."""
 
-    def execute(self, params:
+    def execute(self, params: DictData) -> DictData:
+        """Execute for the Empty stage that do only logging out."""
+        logging.info(f"Execute: {self.name!r}")
         return params
 
 
-class ShellStage(
+class ShellStage(BaseStage):
     """Shell statement stage."""
 
     shell: str
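The new BaseStage base class makes the stage contract explicit: concrete stages implement execute, and set_outputs namespaces results under params["stages"][<id>]["outputs"] only when the stage has an id. A minimal sketch of a custom stage written against this contract (LenStage and its source field are hypothetical, for illustration only):

from ddeutil.workflow.pipeline import BaseStage

class LenStage(BaseStage):  # hypothetical stage, not part of the package
    source: str

    def execute(self, params: dict) -> dict:
        # Compute a result, then let the base class map it into params.
        rs = {"length": len(params.get(self.source, ""))}
        return self.set_outputs(rs, params)

stage = LenStage(id="measure", name="Measure length", source="text")
print(stage.execute({"text": "hello"}))
# {'text': 'hello', 'stages': {'measure': {'outputs': {'length': 5}}}}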
@@ -50,9 +68,7 @@ class ShellStage(EmptyStage):
         """Prepare shell statement string that include newline"""
         return shell.replace("\n", ";")
 
-    def set_outputs(
-        self, rs: CompletedProcess, params: dict[str, Any]
-    ) -> dict[str, Any]:
+    def set_outputs(self, rs: CompletedProcess, params: DictData) -> DictData:
         """Set outputs to params"""
         # NOTE: skipping set outputs of stage execution when id does not set.
         if self.id is None:
@@ -71,7 +87,7 @@ class ShellStage(EmptyStage):
         }
         return params
 
-    def execute(self, params:
+    def execute(self, params: DictData) -> DictData:
         """Execute the Shell & Powershell statement with the Python build-in
         ``subprocess`` package.
         """
@@ -92,24 +108,22 @@ class ShellStage(EmptyStage):
         return params
 
 
-class PyStage(
+class PyStage(BaseStage):
     """Python executor stage that running the Python statement that receive
     globals nad additional variables.
     """
 
     run: str
-    vars:
+    vars: DictData = Field(default_factory=dict)
 
-    def get_var(self, params:
+    def get_var(self, params: DictData) -> DictData:
        """Return variables"""
        rs = self.vars.copy()
        for p, v in self.vars.items():
-            rs[p] =
+            rs[p] = map_params(v, params)
        return rs
 
-    def set_outputs(
-        self, lc: dict[str, Any], params: dict[str, Any]
-    ) -> dict[str, Any]:
+    def set_outputs(self, rs: DictData, params: DictData) -> DictData:
         """Set outputs to params"""
         # NOTE: skipping set outputs of stage execution when id does not set.
         if self.id is None:
@@ -120,25 +134,25 @@ class PyStage(EmptyStage):
 
         params["stages"][self.id] = {
             # NOTE: The output will fileter unnecessary keys from ``_locals``.
-            "outputs": {k:
+            "outputs": {k: rs[k] for k in rs if k != "__annotations__"},
         }
         return params
 
-    def execute(self, params:
+    def execute(self, params: DictData) -> DictData:
         """Execute the Python statement that pass all globals and input params
         to globals argument on ``exec`` build-in function.
 
         :param params: A parameter that want to pass before run any statement.
-        :type params:
+        :type params: DictData
 
-        :rtype:
+        :rtype: DictData
         :returns: A parameters from an input that was mapped output if the stage
             ID was set.
         """
-        _globals:
-        _locals:
+        _globals: DictData = globals() | params | self.get_var(params)
+        _locals: DictData = {}
         try:
-            exec(
+            exec(map_params(self.run, params), _globals, _locals)
         except Exception as err:
             raise PyException(
                 f"{err.__class__.__name__}: {err}\nRunning Statement:\n---\n"
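PyStage's execute boils down to exec with a merged globals mapping going in and assignments collected in _locals coming out; set_outputs then filters that mapping. The mechanism in isolation, as plain Python with illustrative values (the map_params templating step is elided):

params = {"x": 2}
run = "y = x * 10"  # in the package this string first passes through map_params

_globals = globals() | params  # the statement sees params as global names
_locals = {}                   # names the statement assigns land here
exec(run, _globals, _locals)

# Everything except ``__annotations__`` would become the stage outputs.
outputs = {k: _locals[k] for k in _locals if k != "__annotations__"}
print(outputs)  # {'y': 20}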
@@ -151,14 +165,18 @@ class PyStage(EmptyStage):
 
 
 class TaskSearch(BaseModel):
+    """Task Search Model"""
+
     path: str
     func: str
     tag: str
 
 
-class TaskStage(
+class TaskStage(BaseStage):
+    """Task executor stage that running the Python function."""
+
     task: str
-    args:
+    args: DictData
 
     @staticmethod
     def extract_task(task: str) -> Callable[[], Callable[[Any], Any]]:
@@ -167,27 +185,15 @@ class TaskStage(EmptyStage):
             raise ValueError("Task does not match with task format regex.")
         tasks = TaskSearch(**found.groupdict())
 
-
-
-
-
-
-
-
-
-        except ImportError:
-
-        # NOTE: Try to import this task function fom target module.
-        try:
-            return import_string(
-                f"ddeutil.workflow.{tasks.path}.{tasks.func}"
-            )
-        except ImportError:
-            raise NotImplementedError(
-                f"ddeutil.workflow.{tasks.path} does not implement "
-                f"registries or {tasks.func}."
-            ) from None
+        # NOTE: Registry object should implement on this package only.
+        # TODO: This prefix value to search registry should dynamic with
+        # config file.
+        rgt = make_registry(f"ddeutil.workflow.{tasks.path}")
+        if tasks.func not in rgt:
+            raise NotImplementedError(
+                f"ddeutil.workflow.{tasks.path}.registries does not "
+                f"implement registry: {tasks.func}."
+            )
 
         if tasks.tag not in rgt[tasks.func]:
             raise NotImplementedError(
@@ -197,7 +203,7 @@ class TaskStage(EmptyStage):
             )
         return rgt[tasks.func][tasks.tag]
 
-    def execute(self, params:
+    def execute(self, params: DictData) -> DictData:
         """Execute the Task function."""
         task_caller = self.extract_task(self.task)()
         if not callable(task_caller):
@@ -215,17 +221,11 @@ class TaskStage(EmptyStage):
                 f"does not set to args"
             )
         try:
-            rs = task_caller(**self.args)
+            rs = task_caller(**map_params(self.args, params))
         except Exception as err:
             raise TaskException(f"{err.__class__.__name__}: {err}") from err
-
-
-
-class HookStage(EmptyStage):
-    hook: str
-    args: dict[str, Any]
-
-    def execute(self, params: dict[str, Any]) -> dict[str, Any]: ...
+        self.set_outputs(rs, params)
+        return params
 
 
 # NOTE: Order of parsing stage data
@@ -233,14 +233,24 @@ Stage = Union[
     PyStage,
     ShellStage,
     TaskStage,
-    HookStage,
     EmptyStage,
 ]
 
 
+class Strategy(BaseModel):
+    """Strategy Model"""
+
+    matrix: list[str] = Field(default_factory=list)
+    include: list[str] = Field(default_factory=list)
+    exclude: list[str] = Field(default_factory=list)
+
+
 class Job(BaseModel):
+    """Job Model"""
+
     stages: list[Stage] = Field(default_factory=list)
     needs: list[str] = Field(default_factory=list)
+    strategy: Strategy = Field(default_factory=Strategy)
 
     def stage(self, stage_id: str) -> Stage:
         for stage in self.stages:
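Job now carries a Strategy alongside stages and needs; nothing in this diff consumes strategy yet, so in 0.0.2 the fields are declarative. Because Stage is a Union tried in the order noted above, dict payloads resolve to the most specific stage model that fits. A hedged construction sketch (all field values are illustrative):

from ddeutil.workflow.pipeline import Job, Strategy

job = Job(
    stages=[
        {"name": "Say hello"},                           # resolves to EmptyStage
        {"id": "calc", "name": "Calc", "run": "y = 1"},  # resolves to PyStage
    ],
    needs=["extract-job"],
    strategy=Strategy(matrix=["python-version"]),
)
print(type(job.stages[1]).__name__)  # PyStage
print(job.strategy.matrix)           # ['python-version']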
@@ -248,7 +258,8 @@ class Job(BaseModel):
             return stage
         raise ValueError(f"Stage ID {stage_id} does not exists")
 
-    def execute(self, params:
+    def execute(self, params: DictData | None = None) -> DictData:
+        """Execute job with passing dynamic parameters from the pipeline."""
         for stage in self.stages:
             # NOTE:
             # I do not use below syntax because `params` dict be the
@@ -259,45 +270,148 @@ class Job(BaseModel):
         return params
 
 
-class
-
-    include: list[str]
-    exclude: list[str]
+class BaseParams(BaseModel, ABC):
+    """Base Parameter that use to make Params Model."""
 
+    desc: Optional[str] = None
+    required: bool = True
+    type: str
 
-
-
+    @abstractmethod
+    def receive(self, value: Optional[Any] = None) -> Any:
+        raise ValueError(
+            "Receive value and validate typing before return valid value."
+        )
+
+
+class DefaultParams(BaseParams):
+    """Default Parameter that will check default if it required"""
+
+    default: Optional[str] = None
+
+    @abstractmethod
+    def receive(self, value: Optional[Any] = None) -> Any:
+        raise ValueError(
+            "Receive value and validate typing before return valid value."
+        )
+
+    @model_validator(mode="after")
+    def check_default(self) -> Self:
+        if not self.required and self.default is None:
+            raise ValueError(
+                "Default should set when this parameter does not required."
+            )
+        return self
+
+
+class DatetimeParams(DefaultParams):
+    """Datetime parameter."""
+
+    type: Literal["datetime"] = "datetime"
+    required: bool = False
+    default: datetime = Field(default_factory=dt_now)
+
+    def receive(self, value: str | datetime | date | None = None) -> datetime:
+        if value is None:
+            return self.default
+
+        if isinstance(value, datetime):
+            return value
+        elif isinstance(value, date):
+            return datetime(value.year, value.month, value.day)
+        elif not isinstance(value, str):
+            raise ValueError(
+                f"Value that want to convert to datetime does not support for "
+                f"type: {type(value)}"
+            )
+        return datetime.fromisoformat(value)
+
+
+class StrParams(DefaultParams):
+    """String parameter."""
+
+    type: Literal["str"] = "str"
 
-
+    def receive(self, value: Optional[str] = None) -> str | None:
+        if value is None:
+            return self.default
+        return str(value)
+
+
+class IntParams(DefaultParams):
+    """Integer parameter."""
+
+    type: Literal["int"] = "int"
+
+    def receive(self, value: Optional[int] = None) -> int | None:
+        if value is None:
+            return self.default
+        if not isinstance(value, int):
+            try:
+                return int(str(value))
+            except TypeError as err:
+                raise ValueError(
+                    f"Value that want to convert to integer does not support "
+                    f"for type: {type(value)}"
+                ) from err
+        return value
+
+
+class ChoiceParams(BaseParams):
+    type: Literal["choice"] = "choice"
+    options: list[str]
+
+    def receive(self, value: Optional[str] = None) -> str:
+        """Receive value that match with options."""
+        # NOTE:
+        # Return the first value in options if does not pass any input value
+        if value is None:
+            return self.options[0]
+        if any(value not in self.options):
+            raise ValueError(f"{value} does not match any value in options")
+        return value
+
+
+Params = Union[
+    ChoiceParams,
+    DatetimeParams,
+    StrParams,
+]
 
 
 class Pipeline(BaseModel):
     """Pipeline Model"""
 
-    params: dict[str,
+    params: dict[str, Params] = Field(default_factory=dict)
     jobs: dict[str, Job]
 
     @classmethod
     def from_loader(
         cls,
         name: str,
-        externals: DictData,
+        externals: Optional[DictData] = None,
     ) -> Self:
-        loader: Loader = Loader(name, externals=externals)
+        loader: Loader = Loader(name, externals=(externals or {}))
         if "jobs" not in loader.data:
-            raise
+            raise ValueError("Config does not set ``jobs`` value")
         return cls(
             jobs=loader.data["jobs"],
-            params=loader.params
+            params=loader.data["params"],
         )
 
     def job(self, name: str) -> Job:
-        """Return Job model that exists on this pipeline.
+        """Return Job model that exists on this pipeline.
+
+        :param name: A job name that want to get from a mapping of job models.
+        :type name: str
+
+        :rtype: Job
+        """
         if name not in self.jobs:
             raise ValueError(f"Job {name} does not exists")
         return self.jobs[name]
 
-    def execute(self, params:
+    def execute(self, params: DictData | None = None) -> DictData:
         """Execute pipeline with passing dynamic parameters.
 
         See Also:
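Each concrete parameter normalizes raw input through receive. Two things worth flagging from the code above: IntParams is defined but not yet part of the Params union, and ChoiceParams' mismatch branch calls any() on a plain bool, which would raise TypeError rather than the intended ValueError. Behavior on the happy paths (values are illustrative):

from ddeutil.workflow.pipeline import ChoiceParams, DatetimeParams, StrParams

dt_p = DatetimeParams()
print(dt_p.receive("2024-01-01"))      # datetime.datetime(2024, 1, 1, 0, 0)
print(dt_p.receive() == dt_p.default)  # True: falls back to the dt_now default

str_p = StrParams(required=False, default="dev")
print(str_p.receive())     # 'dev'
print(str_p.receive(100))  # '100' (coerced via str())

choice_p = ChoiceParams(options=["csv", "parquet"])
print(choice_p.receive())  # 'csv' (first option when no value is passed)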
@@ -312,18 +426,22 @@ class Pipeline(BaseModel):
         ... "<job-name>.stages.<stage-id>.outputs.<key>"
 
         """
-        params:
+        params: DictData = params or {}
         check_key = tuple(f"{k!r}" for k in self.params if k not in params)
         if check_key:
             raise ValueError(
                 f"Parameters that needed on pipeline does not pass: "
                 f"{', '.join(check_key)}."
             )
-
+
+        if any(p not in params for p in self.params if self.params[p].required):
+            raise ValueError("Required parameter does not pass")
+
+        params: DictData = {
             "params": (
                 params
                 | {
-                    k: self.params[k](params[k])
+                    k: self.params[k].receive(params[k])
                     for k in params
                     if k in self.params
                 }
@@ -331,7 +449,7 @@ class Pipeline(BaseModel):
         }
         for job_id in self.jobs:
             print(f"[PIPELINE]: Start execute the job: {job_id!r}")
-            job = self.jobs[job_id]
+            job: Job = self.jobs[job_id]
             # TODO: Condition on ``needs`` of this job was set. It should create
             # multithreading process on this step.
             job.execute(params=params)
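Taken together, Pipeline.execute checks that every declared parameter arrives, coerces each through receive, and nests the results under a top-level "params" key before running jobs in order. A hedged end-to-end sketch; the config values are invented, and the exact templating behavior of map_params is not shown in this diff:

from ddeutil.workflow.pipeline import Pipeline

# Hypothetical in-memory pipeline; from_loader would normally build this
# from a config file via Loader.
pipe = Pipeline(
    params={"run-date": {"type": "datetime"}, "name": {"type": "str"}},
    jobs={
        "demo-job": {
            "stages": [{"id": "hello", "name": "Hello", "run": "greet = 'hi'"}]
        }
    },
)
rs = pipe.execute(params={"run-date": "2024-01-01", "name": "foo"})
# rs["params"]["run-date"] is now a datetime, and the PyStage output should
# land at rs["stages"]["hello"]["outputs"]["greet"] via BaseStage.set_outputs.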
ddeutil/workflow/schedule.py
CHANGED
@@ -9,15 +9,13 @@ from datetime import datetime
 from typing import Annotated
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
-from ddeutil.io import Params
 from ddeutil.workflow.vendors.__schedule import CronJob, CronRunner
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.functional_validators import field_validator
 from typing_extensions import Self
 
 from .__types import DictData
-from .
-from .loader import SimLoad
+from .loader import Loader
 
 
 class BaseScdl(BaseModel):
@@ -37,14 +35,11 @@ class BaseScdl(BaseModel):
     def from_loader(
         cls,
         name: str,
-        params: Params,
         externals: DictData,
     ) -> Self:
-        loader:
+        loader: Loader = Loader(name, externals=externals)
         if "cronjob" not in loader.data:
-            raise
-                "cronjob", "Config does not set ``cronjob``"
-            )
+            raise ValueError("Config does not set ``cronjob`` value")
         return cls(cronjob=loader.data["cronjob"], extras=externals)
 
     @field_validator("tz")
ddeutil/workflow/tasks/__init__.py
CHANGED
@@ -1,10 +1,6 @@
-
-
-
-
-
-
-        "polars": lazy("ddeutil.workflow.tasks._polars.csv_to_parquet"),
-        "polars-dir": lazy("ddeutil.workflow.tasks._polars.csv_to_parquet_dir"),
-    },
-}
+# ------------------------------------------------------------------------------
+# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+# Licensed under the MIT License. See LICENSE in the project root for
+# license information.
+# ------------------------------------------------------------------------------
+from ._polars import *
ddeutil/workflow/tasks/_pandas.py
ADDED
@@ -0,0 +1,54 @@
+import logging
+import math
+
+try:
+    import pandas as pd
+
+    logging.debug(f"Polars version: {pd.__version__}")
+except ImportError as err:
+    raise ImportError(
+        "``split_iterable`` function want to use pandas package that does"
+        "not install on your interpreter."
+    ) from err
+
+
+def split_iterable(iterable, chunk_size=None, generator_flag: bool = True):
+    """
+    Split an iterable into mini batch with batch length of batch_number
+    supports batch of a pandas dataframe
+    usage:
+        >> for i in split_iterable([1,2,3,4,5], chunk_size=2):
+        >>    print(i)
+        [1, 2]
+        [3, 4]
+        [5]
+
+        for idx, mini_data in split_iterable(batch(df, chunk_size=10)):
+            print(idx)
+            print(mini_data)
+    """
+
+    chunk_size: int = chunk_size or 25000
+    num_chunks = math.ceil(len(iterable) / chunk_size)
+    if generator_flag:
+        for _ in range(num_chunks):
+            if isinstance(iterable, pd.DataFrame):
+                yield iterable.iloc[_ * chunk_size : (_ + 1) * chunk_size]
+            else:
+                yield iterable[_ * chunk_size : (_ + 1) * chunk_size]
+    else:
+        _chunks: list = []
+        for _ in range(num_chunks):
+            if isinstance(iterable, pd.DataFrame):
+                _chunks.append(
+                    iterable.iloc[_ * chunk_size : (_ + 1) * chunk_size]
+                )
+            else:
+                _chunks.append(iterable[_ * chunk_size : (_ + 1) * chunk_size])
+        return _chunks
+
+
+def chunks(dataframe: pd.DataFrame, n: int):
+    """Yield successive n-sized chunks from dataframe."""
+    for i in range(0, len(dataframe), n):
+        yield dataframe.iloc[i : i + n]
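One caveat worth noting: because split_iterable contains yield statements, calling it always returns a generator, so the generator_flag=False branch's return _chunks surfaces only as StopIteration.value rather than a list; the generator path is the usable one. A small usage sketch:

import pandas as pd
from ddeutil.workflow.tasks._pandas import chunks, split_iterable

df = pd.DataFrame({"a": range(5)})

# Generator path: yields DataFrame slices of up to chunk_size rows.
for batch in split_iterable(df, chunk_size=2):
    print(len(batch))  # 2, 2, 1

# chunks() is the simpler DataFrame-only equivalent.
for batch in chunks(df, n=2):
    print(batch["a"].tolist())  # [0, 1], [2, 3], [4]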
ddeutil/workflow/tasks/_polars.py
CHANGED
@@ -11,23 +11,41 @@ from uuid import uuid4
 import polars as pl
 import pyarrow.parquet as pq
 from ddeutil.workflow.dataset import PolarsCsv, PolarsParq
+from ddeutil.workflow.utils import tag
 
 
+def polars_dtype():
+    return {
+        "str": pl.Utf8,
+        "int": pl.Int32,
+    }
+
+
+@tag("polars-dir", name="el-csv-to-parquet")
 def csv_to_parquet_dir(
     source: str,
     sink: str,
     conversion: dict[str, Any] | None = None,
-):
+) -> dict[str, int]:
+    """Extract Load data from CSV to Parquet file.
+
+    :param source:
+    :param sink:
+    :param conversion:
+    """
     print("Start EL for CSV to Parquet with Polars Engine")
     print("---")
     # STEP 01: Read the source data to Polars.
     src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-    src_df = src_dataset.load()
+    src_df: pl.DataFrame = src_dataset.load()
     print(src_df)
 
     # STEP 02: Schema conversion on Polars DataFrame.
     conversion: dict[str, Any] = conversion or {}
     if conversion:
+        src_df = src_df.with_columns(
+            *[pl.col(c).cast(col.type).alias(col.name) for c, col in conversion]
+        )
         print("Start Schema Conversion ...")
 
     # STEP 03: Write data to parquet file format.
@@ -39,3 +57,28 @@ def csv_to_parquet_dir(
         basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
     )
     return {"records": src_df.select(pl.len()).item()}
+
+
+@tag("polars-dir-scan", name="el-csv-to-parquet")
+def csv_to_parquet_dir_scan(
+    source: str,
+    sink: str,
+    conversion: dict[str, Any] | None = None,
+) -> dict[str, int]:
+    print("Start EL for CSV to Parquet with Polars Engine")
+    print("---")
+    # STEP 01: Read the source data to Polars.
+    src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
+    src_df: pl.LazyFrame = src_dataset.scan()
+
+    if conversion:
+        ...
+
+    sink = PolarsParq.from_loader(name=sink, externals={})
+    pq.write_to_dataset(
+        table=src_df.collect().to_arrow(),
+        root_path=f"{sink.conn.endpoint}/{sink.object}",
+        compression="snappy",
+        basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
+    )
+    return {"records": src_df.select(pl.len()).collect().item()}
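These @tag functions are what TaskStage.extract_task resolves: judging from the code in pipeline.py, make_registry indexes tagged callables by function name and then by tag, and the registry entry is called once to produce the actual task caller. A hedged sketch of that lookup path (the dataset config names are invented, and the registry shape is inferred from extract_task rather than shown in utils.py):

from ddeutil.workflow.utils import make_registry

rgt = make_registry("ddeutil.workflow.tasks")   # prefix used by extract_task
maker = rgt["el-csv-to-parquet"]["polars-dir"]  # func name, then tag
task_caller = maker()                           # TaskStage calls the entry once
rs = task_caller(source="ds_csv_local", sink="ds_parquet_dir")
print(rs)  # {'records': <row count>}, per csv_to_parquet_dir's return value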