ddeutil-workflow 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/__init__.py +9 -0
- ddeutil/workflow/__types.py +43 -1
- ddeutil/workflow/exceptions.py +13 -1
- ddeutil/workflow/loader.py +16 -110
- ddeutil/workflow/on.py +195 -0
- ddeutil/workflow/pipeline.py +351 -371
- ddeutil/workflow/{vendors/__schedule.py → scheduler.py} +222 -176
- ddeutil/workflow/stage.py +402 -0
- ddeutil/workflow/utils.py +219 -28
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/METADATA +118 -90
- ddeutil_workflow-0.0.6.dist-info/RECORD +15 -0
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/WHEEL +1 -1
- ddeutil/workflow/__regex.py +0 -44
- ddeutil/workflow/conn.py +0 -240
- ddeutil/workflow/schedule.py +0 -82
- ddeutil/workflow/tasks/__init__.py +0 -6
- ddeutil/workflow/tasks/_pandas.py +0 -54
- ddeutil/workflow/tasks/_polars.py +0 -92
- ddeutil/workflow/vendors/__dataset.py +0 -127
- ddeutil/workflow/vendors/__dict.py +0 -333
- ddeutil/workflow/vendors/__init__.py +0 -0
- ddeutil/workflow/vendors/aws.py +0 -185
- ddeutil/workflow/vendors/az.py +0 -0
- ddeutil/workflow/vendors/minio.py +0 -11
- ddeutil/workflow/vendors/pd.py +0 -13
- ddeutil/workflow/vendors/pg.py +0 -11
- ddeutil/workflow/vendors/pl.py +0 -172
- ddeutil/workflow/vendors/sftp.py +0 -209
- ddeutil_workflow-0.0.4.dist-info/RECORD +0 -29
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/LICENSE +0 -0
- {ddeutil_workflow-0.0.4.dist-info → ddeutil_workflow-0.0.6.dist-info}/top_level.txt +0 -0
ddeutil_workflow-0.0.6.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+ddeutil/workflow/__about__.py,sha256=VEYa91VchyTUnF57lvvquHvitTViBxxLXuhcEnr4TAY,27
+ddeutil/workflow/__init__.py,sha256=Y5wLiJ0zS1CfoSOZ0oo7OL3LNMKvPmpUO4fVHuAOv8E,429
+ddeutil/workflow/__types.py,sha256=PfwDZBnTwe2JImD7UFS0J6Nq-1TcjBGebOVzJZoSuTQ,1354
+ddeutil/workflow/exceptions.py,sha256=9O12c4aNLi0dyjVBgCLveV3HN9PXcZfwFfLXdgm3Ffs,626
+ddeutil/workflow/loader.py,sha256=_ZD-XP5P7VbUeqItrUVPaKIZu6dMUZ2aywbCbReW1hQ,2778
+ddeutil/workflow/on.py,sha256=YoEqDbzJUwqOA3JRltbvlYr0rNTtxdmb7cWMxl8U19k,6717
+ddeutil/workflow/pipeline.py,sha256=8mIvY34_fsiqscBa9JB94MgN3Km5fkuD2iaNZSAQVuM,17843
+ddeutil/workflow/scheduler.py,sha256=FqmkvWCqwJ4eRf8aDn5Ce4FcNWqmcvu2aTTfL34lfgs,22184
+ddeutil/workflow/stage.py,sha256=bDJiGS21gYlYbFDnLTKH9aIbXfej9fT-V1ADoPX7w4s,13829
+ddeutil/workflow/utils.py,sha256=xapKxxnqIzlbKA45GaRcWn-VL30AhE7M8f46ynr-vbI,11173
+ddeutil_workflow-0.0.6.dist-info/LICENSE,sha256=nGFZ1QEhhhWeMHf9n99_fdt4vQaXS29xWKxt-OcLywk,1085
+ddeutil_workflow-0.0.6.dist-info/METADATA,sha256=5X6ewXGn96MR9rDhVmmaoTuIwuRmfi72t7ezM8wxbvw,8612
+ddeutil_workflow-0.0.6.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ddeutil_workflow-0.0.6.dist-info/top_level.txt,sha256=m9M6XeSWDwt_yMsmH6gcOjHZVK5O0-vgtNBuncHjzW4,8
+ddeutil_workflow-0.0.6.dist-info/RECORD,,
ddeutil/workflow/__regex.py
DELETED
@@ -1,44 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# --------------------------------------------------------------------------
-import re
-from re import (
-    IGNORECASE,
-    MULTILINE,
-    UNICODE,
-    VERBOSE,
-    Pattern,
-)
-
-
-class RegexConf:
-    """Regular expression config."""
-
-    # NOTE: Search caller
-    __re_caller: str = r"""
-        \$
-        {{
-            \s*(?P<caller>
-                [a-zA-Z0-9_.\s'\"\[\]\(\)\-\{}]+?
-            )\s*
-        }}
-    """
-    RE_CALLER: Pattern = re.compile(
-        __re_caller, MULTILINE | IGNORECASE | UNICODE | VERBOSE
-    )
-
-    # NOTE: Search task
-    __re_task_fmt: str = r"""
-        ^
-        (?P<path>[^/@]+)
-        /
-        (?P<func>[^@]+)
-        @
-        (?P<tag>.+)
-        $
-    """
-    RE_TASK_FMT: Pattern = re.compile(
-        __re_task_fmt, MULTILINE | IGNORECASE | UNICODE | VERBOSE
-    )
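
These two patterns drove template resolution in earlier releases: RE_CALLER finds ${{ ... }} expressions inside config values, and RE_TASK_FMT splits a task reference into path, function, and tag parts. A standalone sketch of that matching behavior (the patterns are recompiled on one line here, since the module itself is gone; the caller strings are illustrative):

    import re

    RE_CALLER = re.compile(
        r"\$\{\{\s*(?P<caller>[a-zA-Z0-9_.\s'\"\[\]\(\)\-\{}]+?)\s*\}\}"
    )
    RE_TASK_FMT = re.compile(r"^(?P<path>[^/@]+)/(?P<func>[^@]+)@(?P<tag>.+)$")

    print(RE_CALLER.search("run=${{ params.run-date }}")["caller"])
    # -> params.run-date
    print(RE_TASK_FMT.match("tasks/el-csv-to-parquet@polars-dir").groupdict())
    # -> {'path': 'tasks', 'func': 'el-csv-to-parquet', 'tag': 'polars-dir'}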
ddeutil/workflow/conn.py
DELETED
@@ -1,240 +0,0 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# ------------------------------------------------------------------------------
-from __future__ import annotations
-
-import logging
-from collections.abc import Iterator
-from pathlib import Path
-from typing import Annotated, Any, Literal, Optional, TypeVar
-
-from ddeutil.io.models.conn import Conn as ConnModel
-from pydantic import BaseModel, ConfigDict, Field
-from pydantic.functional_validators import field_validator
-from pydantic.types import SecretStr
-from typing_extensions import Self
-
-from .__types import DictData, TupleStr
-from .loader import Loader
-
-EXCLUDED_EXTRAS: TupleStr = (
-    "type",
-    "url",
-)
-
-
-class BaseConn(BaseModel):
-    """Base Conn (Connection) Model"""
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    # NOTE: This is fields
-    dialect: str
-    host: Optional[str] = None
-    port: Optional[int] = None
-    user: Optional[str] = None
-    pwd: Optional[SecretStr] = None
-    endpoint: str
-    extras: Annotated[
-        DictData,
-        Field(default_factory=dict, description="Extras mapping of parameters"),
-    ]
-
-    @classmethod
-    def from_dict(cls, values: DictData) -> Self:
-        """Construct Connection Model from dict data. This construct is
-        different with ``.model_validate()`` because it will prepare the values
-        before using it if the data dose not have 'url'.
-
-        :param values: A dict data that use to construct this model.
-        """
-        # NOTE: filter out the fields of this model.
-        filter_data: DictData = {
-            k: values.pop(k)
-            for k in values.copy()
-            if k not in cls.model_fields and k not in EXCLUDED_EXTRAS
-        }
-        if "url" in values:
-            url: ConnModel = ConnModel.from_url(values.pop("url"))
-            return cls(
-                dialect=url.dialect,
-                host=url.host,
-                port=url.port,
-                user=url.user,
-                pwd=url.pwd,
-                # NOTE:
-                #   I will replace None endpoint with memory value for SQLite
-                #   connection string.
-                endpoint=(url.endpoint or "memory"),
-                # NOTE: This order will show that externals this the top level.
-                extras=(url.options | filter_data),
-            )
-        return cls.model_validate(
-            obj={
-                "extras": (values.pop("extras", {}) | filter_data),
-                **values,
-            }
-        )
-
-    @classmethod
-    def from_loader(cls, name: str, externals: DictData) -> Self:
-        """Construct Connection with Loader object with specific config name.
-
-        :param name: A config name.
-        :param externals: A external data that want to adding to extras.
-        """
-        loader: Loader = Loader(name, externals=externals)
-        # NOTE: Validate the config type match with current connection model
-        if loader.type != cls:
-            raise ValueError(f"Type {loader.type} does not match with {cls}")
-        return cls.from_dict(
-            {
-                "extras": (loader.data.pop("extras", {}) | externals),
-                **loader.data,
-            }
-        )
-
-    @field_validator("endpoint")
-    def __prepare_slash(cls, value: str) -> str:
-        """Prepare slash character that map double form URL model loading."""
-        if value.startswith("//"):
-            return value[1:]
-        return value
-
-
-class Conn(BaseConn):
-    """Conn (Connection) Model that implement any necessary methods. This object
-    should be the base for abstraction to any connection model object.
-    """
-
-    def get_spec(self) -> str:
-        """Return full connection url that construct from all fields."""
-        return (
-            f"{self.dialect}://{self.user or ''}"
-            f"{f':{self.pwd}' if self.pwd else ''}"
-            f"{self.host or ''}{f':{self.port}' if self.port else ''}"
-            f"/{self.endpoint}"
-        )
-
-    def ping(self) -> bool:
-        """Ping the connection that able to use with this field value."""
-        raise NotImplementedError("Ping does not implement")
-
-    def glob(self, pattern: str) -> Iterator[Any]:
-        """Return a list of object from the endpoint of this connection."""
-        raise NotImplementedError("Glob does not implement")
-
-    def find_object(self, _object: str):
-        raise NotImplementedError("Glob does not implement")
-
-
-class FlSys(Conn):
-    """File System Connection."""
-
-    dialect: Literal["local"] = "local"
-
-    def ping(self) -> bool:
-        return Path(self.endpoint).exists()
-
-    def glob(self, pattern: str) -> Iterator[Path]:
-        yield from Path(self.endpoint).rglob(pattern=pattern)
-
-    def find_object(self, _object: str) -> bool:
-        return (Path(self.endpoint) / _object).exists()
-
-
-class SFTP(Conn):
-    """SFTP Server Connection."""
-
-    dialect: Literal["sftp"] = "sftp"
-
-    def __client(self):
-        from .vendors.sftp import WrapSFTP
-
-        return WrapSFTP(
-            host=self.host,
-            port=self.port,
-            user=self.user,
-            pwd=self.pwd.get_secret_value(),
-        )
-
-    def ping(self) -> bool:
-        with self.__client().simple_client():
-            return True
-
-    def glob(self, pattern: str) -> Iterator[str]:
-        yield from self.__client().walk(pattern=pattern)
-
-
-class Db(Conn):
-    """RDBMS System Connection"""
-
-    def ping(self) -> bool:
-        from sqlalchemy import create_engine
-        from sqlalchemy.engine import URL, Engine
-        from sqlalchemy.exc import OperationalError
-
-        engine: Engine = create_engine(
-            url=URL.create(
-                self.dialect,
-                username=self.user,
-                password=self.pwd.get_secret_value() if self.pwd else None,
-                host=self.host,
-                port=self.port,
-                database=self.endpoint,
-                query={},
-            ),
-            execution_options={},
-        )
-        try:
-            return engine.connect()
-        except OperationalError as err:
-            logging.warning(str(err))
-            return False
-
-
-class SQLite(Db):
-    dialect: Literal["sqlite"]
-
-
-class ODBC(Conn): ...
-
-
-class Doc(Conn):
-    """No SQL System Connection"""
-
-
-class Mongo(Doc): ...
-
-
-class SSHCred(BaseModel):
-    ssh_host: str
-    ssh_user: str
-    ssh_password: Optional[SecretStr] = Field(default=None)
-    ssh_private_key: Optional[str] = Field(default=None)
-    ssh_private_key_pwd: Optional[SecretStr] = Field(default=None)
-    ssh_port: int = Field(default=22)
-
-
-class S3Cred(BaseModel):
-    aws_access_key: str
-    aws_secret_access_key: SecretStr
-    region: str = Field(default="ap-southeast-1")
-    role_arn: Optional[str] = Field(default=None)
-    role_name: Optional[str] = Field(default=None)
-    mfa_serial: Optional[str] = Field(default=None)
-
-
-class AZServPrinCred(BaseModel):
-    tenant: str
-    client_id: str
-    secret_id: SecretStr
-
-
-class GoogleCred(BaseModel):
-    google_json_path: str
-
-
-SubclassConn = TypeVar("SubclassConn", bound=Conn)
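
Even with the module deleted, the intended use of these models is visible from the code above. A minimal sketch (the local directory and file names are illustrative):

    # File-system connection: ping() checks that the endpoint exists,
    # glob() walks it recursively, find_object() tests a single path.
    fs = FlSys(endpoint="data")
    print(fs.get_spec())                # -> local:///data
    print(fs.ping())                    # True when ./data exists
    print(fs.find_object("sales.csv"))  # True when ./data/sales.csv exists
    for path in fs.glob("*.csv"):
        print(path)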
ddeutil/workflow/schedule.py
DELETED
@@ -1,82 +0,0 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# ------------------------------------------------------------------------------
-from __future__ import annotations
-
-from datetime import datetime
-from typing import Annotated
-from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
-
-from ddeutil.workflow.vendors.__schedule import CronJob, CronRunner
-from pydantic import BaseModel, ConfigDict, Field
-from pydantic.functional_validators import field_validator
-from typing_extensions import Self
-
-from .__types import DictData
-from .loader import Loader
-
-
-class BaseSchedule(BaseModel):
-    """Base Schedule (Schedule) Model"""
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    # NOTE: This is fields
-    cronjob: Annotated[CronJob, Field(description="Cron job of this schedule")]
-    tz: Annotated[str, Field(description="Timezone")] = "utc"
-    extras: Annotated[
-        DictData,
-        Field(default_factory=dict, description="Extras mapping of parameters"),
-    ]
-
-    @classmethod
-    def from_loader(
-        cls,
-        name: str,
-        externals: DictData,
-    ) -> Self:
-        loader: Loader = Loader(name, externals=externals)
-        if "cronjob" not in loader.data:
-            raise ValueError("Config does not set ``cronjob`` value")
-        return cls(cronjob=loader.data["cronjob"], extras=externals)
-
-    @field_validator("tz")
-    def __validate_tz(cls, value: str):
-        try:
-            _ = ZoneInfo(value)
-            return value
-        except ZoneInfoNotFoundError as err:
-            raise ValueError(f"Invalid timezone: {value}") from err
-
-    @field_validator("cronjob", mode="before")
-    def __prepare_cronjob(cls, value: str | CronJob) -> CronJob:
-        return CronJob(value) if isinstance(value, str) else value
-
-    def generate(self, start: str | datetime) -> CronRunner:
-        """Return Cron runner object."""
-        if not isinstance(start, datetime):
-            start: datetime = datetime.fromisoformat(start)
-        return self.cronjob.schedule(date=(start.astimezone(ZoneInfo(self.tz))))
-
-
-class Schedule(BaseSchedule):
-    """Schedule (Schedule) Model.
-
-    See Also:
-        * ``generate()`` is the main usecase of this schedule object.
-    """
-
-
-class ScheduleBkk(Schedule):
-    """Asia Bangkok Schedule (Schedule) timezone Model.
-
-    This model use for change timezone from utc to Asia/Bangkok
-    """
-
-    tz: Annotated[str, Field(description="Timezone")] = "Asia/Bangkok"
-
-
-class AwsSchedule(BaseSchedule):
-    """Implement Schedule for AWS Service."""
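
This module's role is taken over by the new on.py and scheduler.py files listed at the top of the diff. For reference, a sketch of the deleted model's main use case; the ``.next`` attribute on the returned runner is assumed from the CronRunner API, which this diff does not show:

    from datetime import datetime

    schedule = ScheduleBkk(cronjob="*/30 * * * *")
    runner = schedule.generate(start=datetime(2024, 1, 1))
    print(runner.next)  # assumed: first trigger on or after the start time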
ddeutil/workflow/tasks/__init__.py
DELETED
@@ -1,6 +0,0 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# ------------------------------------------------------------------------------
-from ._polars import *
ddeutil/workflow/tasks/_pandas.py
DELETED
@@ -1,54 +0,0 @@
-import logging
-import math
-
-try:
-    import pandas as pd
-
-    logging.debug(f"Pandas version: {pd.__version__}")
-except ImportError as err:
-    raise ImportError(
-        "``split_iterable`` function want to use pandas package that does"
-        "not install on your interpreter."
-    ) from err
-
-
-def split_iterable(iterable, chunk_size=None, generator_flag: bool = True):
-    """
-    Split an iterable into mini batch with batch length of batch_number
-    supports batch of a pandas dataframe
-    usage:
-        >> for i in split_iterable([1,2,3,4,5], chunk_size=2):
-        >> print(i)
-        [1, 2]
-        [3, 4]
-        [5]
-
-        for idx, mini_data in split_iterable(batch(df, chunk_size=10)):
-            print(idx)
-            print(mini_data)
-    """
-
-    chunk_size: int = chunk_size or 25000
-    num_chunks = math.ceil(len(iterable) / chunk_size)
-    if generator_flag:
-        for _ in range(num_chunks):
-            if isinstance(iterable, pd.DataFrame):
-                yield iterable.iloc[_ * chunk_size : (_ + 1) * chunk_size]
-            else:
-                yield iterable[_ * chunk_size : (_ + 1) * chunk_size]
-    else:
-        _chunks: list = []
-        for _ in range(num_chunks):
-            if isinstance(iterable, pd.DataFrame):
-                _chunks.append(
-                    iterable.iloc[_ * chunk_size : (_ + 1) * chunk_size]
-                )
-            else:
-                _chunks.append(iterable[_ * chunk_size : (_ + 1) * chunk_size])
-        return _chunks
-
-
-def chunks(dataframe: pd.DataFrame, n: int):
-    """Yield successive n-sized chunks from dataframe."""
-    for i in range(0, len(dataframe), n):
-        yield dataframe.iloc[i : i + n]
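
Both helpers slice by integer position; a quick sketch of the difference on toy data:

    import pandas as pd

    df = pd.DataFrame({"x": range(5)})
    # chunks() yields fixed-size row windows from a DataFrame only ...
    print([len(part) for part in chunks(df, n=2)])  # -> [2, 2, 1]
    # ... while split_iterable() also accepts plain sequences.
    print(list(split_iterable([1, 2, 3, 4, 5], chunk_size=2)))
    # -> [[1, 2], [3, 4], [5]]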
ddeutil/workflow/tasks/_polars.py
DELETED
@@ -1,92 +0,0 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# ------------------------------------------------------------------------------
-from __future__ import annotations
-
-import logging
-from typing import Any
-from uuid import uuid4
-
-try:
-    import polars as pl
-
-    logging.debug(f"Polars version: {pl.__version__}")
-except ImportError:
-    raise ImportError(
-        "Please install polars if you want to use any relate task"
-    ) from None
-import pyarrow.parquet as pq
-from ddeutil.workflow.utils import tag
-from ddeutil.workflow.vendors.pl import PolarsCsv, PolarsParq
-
-
-def polars_dtype():
-    return {
-        "str": pl.Utf8,
-        "int": pl.Int32,
-    }
-
-
-@tag("polars-dir", name="el-csv-to-parquet")
-def csv_to_parquet_dir(
-    source: str,
-    sink: str,
-    conversion: dict[str, Any] | None = None,
-) -> dict[str, int]:
-    """Extract Load data from CSV to Parquet file.
-
-    :param source:
-    :param sink:
-    :param conversion:
-    """
-    print("Start EL for CSV to Parquet with Polars Engine")
-    print("---")
-    # STEP 01: Read the source data to Polars.
-    src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-    src_df: pl.DataFrame = src_dataset.load()
-    print(src_df)
-
-    # STEP 02: Schema conversion on Polars DataFrame.
-    conversion: dict[str, Any] = conversion or {}
-    if conversion:
-        src_df = src_df.with_columns(
-            *[pl.col(c).cast(col.type).alias(col.name) for c, col in conversion]
-        )
-        print("Start Schema Conversion ...")
-
-    # STEP 03: Write data to parquet file format.
-    sink = PolarsParq.from_loader(name=sink, externals={})
-    pq.write_to_dataset(
-        table=src_df.to_arrow(),
-        root_path=f"{sink.conn.endpoint}/{sink.object}",
-        compression="snappy",
-        basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
-    )
-    return {"records": src_df.select(pl.len()).item()}
-
-
-@tag("polars-dir-scan", name="el-csv-to-parquet")
-def csv_to_parquet_dir_scan(
-    source: str,
-    sink: str,
-    conversion: dict[str, Any] | None = None,
-) -> dict[str, int]:
-    print("Start EL for CSV to Parquet with Polars Engine")
-    print("---")
-    # STEP 01: Read the source data to Polars.
-    src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-    src_df: pl.LazyFrame = src_dataset.scan()
-
-    if conversion:
-        ...
-
-    sink = PolarsParq.from_loader(name=sink, externals={})
-    pq.write_to_dataset(
-        table=src_df.collect().to_arrow(),
-        root_path=f"{sink.conn.endpoint}/{sink.object}",
-        compression="snappy",
-        basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
-    )
-    return {"records": src_df.select(pl.len()).collect().item()}
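
Stripped of the Loader plumbing, the core of both tasks is a Polars-to-Arrow handoff into a partitioned Parquet write. A distilled, runnable sketch of STEP 03 (output path and data are illustrative; assumes a recent polars and pyarrow):

    import polars as pl
    import pyarrow.parquet as pq

    df = pl.DataFrame({"id": [1, 2, 3]})
    pq.write_to_dataset(
        table=df.to_arrow(),
        root_path="out/demo",
        compression="snappy",
        basename_template="demo-{i}.snappy.parquet",
    )
    print({"records": df.select(pl.len()).item()})  # -> {'records': 3}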
ddeutil/workflow/vendors/__dataset.py
DELETED
@@ -1,127 +0,0 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
-# Licensed under the MIT License. See LICENSE in the project root for
-# license information.
-# ------------------------------------------------------------------------------
-from __future__ import annotations
-
-from datetime import datetime
-from typing import Annotated, Any, Optional
-
-from fmtutil import Datetime, FormatterGroupType, make_group
-from fmtutil.utils import escape_fmt_group
-from pydantic import BaseModel, Field
-from typing_extensions import Self
-
-from ..__types import DictData, TupleStr
-from ..conn import SubclassConn
-from ..loader import Loader
-
-EXCLUDED_EXTRAS: TupleStr = ("type",)
-OBJ_FMTS: FormatterGroupType = make_group({"datetime": Datetime})
-
-
-class BaseDataset(BaseModel):
-    """Base Dataset Model. This model implement only loading construction."""
-
-    conn: Annotated[SubclassConn, Field(description="Connection Model")]
-    endpoint: Annotated[
-        Optional[str],
-        Field(description="Endpoint of connection"),
-    ] = None
-    object: str = Field(description="Dataset object that want to contract")
-    features: list = Field(default_factory=list)
-    extras: dict[str, Any] = Field(default_factory=dict)
-
-    @classmethod
-    def from_loader(
-        cls,
-        name: str,
-        externals: DictData,
-    ) -> Self:
-        """Construct Connection with Loader object with specific config name.
-
-        :param name: A name of dataset that want to load from config file.
-        :param externals: An external parameters.
-        """
-        loader: Loader = Loader(name, externals=externals)
-
-        # NOTE: Validate the config type match with current dataset model
-        if loader.type != cls:
-            raise ValueError(f"Type {loader.type} does not match with {cls}")
-
-        filter_data: DictData = {
-            k: loader.data.pop(k)
-            for k in loader.data.copy()
-            if k not in cls.model_fields and k not in EXCLUDED_EXTRAS
-        }
-
-        if "conn" not in loader.data:
-            raise ValueError("Dataset config does not set ``conn`` value")
-
-        # NOTE: Start loading connection config
-        conn_name: str = loader.data.pop("conn")
-        conn_loader: Loader = Loader(conn_name, externals=externals)
-        conn_model: SubclassConn = conn_loader.type.from_loader(
-            name=conn_name, externals=externals
-        )
-
-        # NOTE: Override ``endpoint`` value to getter connection data.
-        if "endpoint" in loader.data:
-            # NOTE: Update endpoint path without Pydantic validator.
-            conn_model.__dict__["endpoint"] = loader.data["endpoint"]
-        else:
-            loader.data.update({"endpoint": conn_model.endpoint})
-        return cls.model_validate(
-            obj={
-                "extras": (
-                    loader.data.pop("extras", {}) | filter_data | externals
-                ),
-                "conn": conn_model,
-                **loader.data,
-            }
-        )
-
-
-class Dataset(BaseDataset):
-
-    def exists(self) -> bool:
-        raise NotImplementedError("Object exists does not implement")
-
-    def format_object(
-        self,
-        _object: str | None = None,
-        dt: str | datetime | None = None,
-    ) -> str:
-        """Format the object value that implement datetime"""
-        if dt is None:
-            dt = datetime.now()
-        dt: datetime = (
-            dt if isinstance(dt, datetime) else datetime.fromisoformat(dt)
-        )
-        return (
-            OBJ_FMTS({"datetime": dt})
-            .format(escape_fmt_group(_object or self.object))
-            .replace("\\", "")
-        )
-
-
-class FlDataset(Dataset):
-
-    def exists(self) -> bool:
-        return self.conn.find_object(self.object)
-
-
-class TblDataset(Dataset):
-
-    def exists(self) -> bool:
-        return self.conn.find_object(self.object)
-
-
-class FlDataFrame(Dataset):
-
-    def exists(self) -> bool:
-        return self.conn.find_object(self.object)
-
-
-class TblDataFrame(Dataset): ...
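
A final sketch tying the dataset layer back to the connection layer: exists() on the file-backed subclasses simply delegates to the connection's find_object() (names below are illustrative, reusing FlSys from the deleted conn.py above):

    ds = FlDataset(conn=FlSys(endpoint="data"), object="sales.csv")
    print(ds.exists())  # True when ./data/sales.csv exists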