ddeutil-workflow 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,92 +0,0 @@
- # ------------------------------------------------------------------------------
- # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
- # Licensed under the MIT License. See LICENSE in the project root for
- # license information.
- # ------------------------------------------------------------------------------
- from __future__ import annotations
-
- import logging
- from typing import Any
- from uuid import uuid4
-
- try:
-     import polars as pl
-
-     logging.debug(f"Polars version: {pl.__version__}")
- except ImportError:
-     raise ImportError(
-         "Please install polars if you want to use any related task"
-     ) from None
- import pyarrow.parquet as pq
- from ddeutil.workflow.utils import tag
- from ddeutil.workflow.vendors.pl import PolarsCsv, PolarsParq
-
-
- def polars_dtype():
-     return {
-         "str": pl.Utf8,
-         "int": pl.Int32,
-     }
-
-
- @tag("polars-dir", name="el-csv-to-parquet")
- def csv_to_parquet_dir(
-     source: str,
-     sink: str,
-     conversion: dict[str, Any] | None = None,
- ) -> dict[str, int]:
-     """Extract and load data from a CSV file to a Parquet file.
-
-     :param source: A config name of the source CSV dataset.
-     :param sink: A config name of the sink Parquet dataset.
-     :param conversion: A mapping of column name to its cast specification.
-     """
-     print("Start EL for CSV to Parquet with Polars Engine")
-     print("---")
-     # STEP 01: Read the source data to Polars.
-     src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-     src_df: pl.DataFrame = src_dataset.load()
-     print(src_df)
-
-     # STEP 02: Schema conversion on Polars DataFrame.
-     conversion: dict[str, Any] = conversion or {}
-     if conversion:
-         src_df = src_df.with_columns(
-             *[pl.col(c).cast(col.type).alias(col.name) for c, col in conversion.items()]
-         )
-         print("Start Schema Conversion ...")
-
-     # STEP 03: Write data to parquet file format.
-     sink = PolarsParq.from_loader(name=sink, externals={})
-     pq.write_to_dataset(
-         table=src_df.to_arrow(),
-         root_path=f"{sink.conn.endpoint}/{sink.object}",
-         compression="snappy",
-         basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
-     )
-     return {"records": src_df.select(pl.len()).item()}
-
-
- @tag("polars-dir-scan", name="el-csv-to-parquet")
- def csv_to_parquet_dir_scan(
-     source: str,
-     sink: str,
-     conversion: dict[str, Any] | None = None,
- ) -> dict[str, int]:
-     print("Start EL for CSV to Parquet with Polars Engine")
-     print("---")
-     # STEP 01: Read the source data to Polars.
-     src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-     src_df: pl.LazyFrame = src_dataset.scan()
-
-     if conversion:
-         ...
-
-     sink = PolarsParq.from_loader(name=sink, externals={})
-     pq.write_to_dataset(
-         table=src_df.collect().to_arrow(),
-         root_path=f"{sink.conn.endpoint}/{sink.object}",
-         compression="snappy",
-         basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
-     )
-     return {"records": src_df.select(pl.len()).collect().item()}
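
For context on the removed tasks above: once `source` and `sink` dataset configs exist, each task can be resolved through the `tag` registry or called as a plain function. A minimal usage sketch, where the config names and the module path of the removed file are assumptions:

    # Hypothetical invocation of the removed task (names are assumptions):
    from ddeutil.workflow.tasks._polars import csv_to_parquet_dir

    result = csv_to_parquet_dir(
        source="ds_csv_customer",      # a PolarsCsv dataset config name
        sink="ds_parquet_customer",    # a PolarsParq dataset config name
        conversion=None,               # skip the schema-conversion step
    )
    print(result)                      # e.g. {"records": 1000}
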
@@ -1,127 +0,0 @@
- # ------------------------------------------------------------------------------
- # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
- # Licensed under the MIT License. See LICENSE in the project root for
- # license information.
- # ------------------------------------------------------------------------------
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Annotated, Any, Optional
-
- from fmtutil import Datetime, FormatterGroupType, make_group
- from fmtutil.utils import escape_fmt_group
- from pydantic import BaseModel, Field
- from typing_extensions import Self
-
- from ..__types import DictData, TupleStr
- from ..conn import SubclassConn
- from ..loader import Loader
-
- EXCLUDED_EXTRAS: TupleStr = ("type",)
- OBJ_FMTS: FormatterGroupType = make_group({"datetime": Datetime})
-
-
- class BaseDataset(BaseModel):
-     """Base Dataset Model. This model implements only loading construction."""
-
-     conn: Annotated[SubclassConn, Field(description="Connection Model")]
-     endpoint: Annotated[
-         Optional[str],
-         Field(description="Endpoint of connection"),
-     ] = None
-     object: str = Field(description="Dataset object that this model contracts with")
-     features: list = Field(default_factory=list)
-     extras: dict[str, Any] = Field(default_factory=dict)
-
-     @classmethod
-     def from_loader(
-         cls,
-         name: str,
-         externals: DictData,
-     ) -> Self:
-         """Construct this dataset model from a Loader object with a specific config name.
-
-         :param name: A dataset config name that will be loaded from the config file.
-         :param externals: External parameters.
-         """
-         loader: Loader = Loader(name, externals=externals)
-
-         # NOTE: Validate that the config type matches the current dataset model.
-         if loader.type != cls:
-             raise ValueError(f"Type {loader.type} does not match with {cls}")
-
-         filter_data: DictData = {
-             k: loader.data.pop(k)
-             for k in loader.data.copy()
-             if k not in cls.model_fields and k not in EXCLUDED_EXTRAS
-         }
-
-         if "conn" not in loader.data:
-             raise ValueError("Dataset config does not set the ``conn`` value")
-
-         # NOTE: Start loading the connection config.
-         conn_name: str = loader.data.pop("conn")
-         conn_loader: Loader = Loader(conn_name, externals=externals)
-         conn_model: SubclassConn = conn_loader.type.from_loader(
-             name=conn_name, externals=externals
-         )
-
-         # NOTE: Override the ``endpoint`` value with the loaded connection data.
-         if "endpoint" in loader.data:
-             # NOTE: Update the endpoint path without Pydantic validation.
-             conn_model.__dict__["endpoint"] = loader.data["endpoint"]
-         else:
-             loader.data.update({"endpoint": conn_model.endpoint})
-         return cls.model_validate(
-             obj={
-                 "extras": (
-                     loader.data.pop("extras", {}) | filter_data | externals
-                 ),
-                 "conn": conn_model,
-                 **loader.data,
-             }
-         )
-
-
- class Dataset(BaseDataset):
-
-     def exists(self) -> bool:
-         raise NotImplementedError("The exists method is not implemented yet.")
-
-     def format_object(
-         self,
-         _object: str | None = None,
-         dt: str | datetime | None = None,
-     ) -> str:
-         """Format the object value that implements datetime formatting."""
-         if dt is None:
-             dt = datetime.now()
-         dt: datetime = (
-             dt if isinstance(dt, datetime) else datetime.fromisoformat(dt)
-         )
-         return (
-             OBJ_FMTS({"datetime": dt})
-             .format(escape_fmt_group(_object or self.object))
-             .replace("\\", "")
-         )
-
-
- class FlDataset(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class TblDataset(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class FlDataFrame(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class TblDataFrame(Dataset): ...
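
The `from_loader` flow above resolves a dataset config, validates its type, loads the referenced ``conn`` config, and reconciles the ``endpoint`` between the two. A minimal sketch of that call path, assuming a hypothetical config entry named "ds_local_csv" whose `type` resolves to `FlDataset`:

    # Hypothetical usage (config name is an assumption; requires a config
    # file that defines "ds_local_csv" with a ``conn`` entry):
    ds = FlDataset.from_loader(name="ds_local_csv", externals={})
    print(ds.conn.endpoint, ds.object)

    # format_object resolves fmtutil datetime format groups in the object
    # name, e.g. for an object template like "sales_%Y%m%d":
    print(ds.format_object(dt="2024-01-01"))
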
@@ -1,333 +0,0 @@
- """
- Reference:
-     * https://github.com/LarsHill/metadict
- """
-
- from __future__ import annotations
-
- import contextlib
- import copy
- import keyword
- import re
- import warnings
- from collections.abc import (
-     Iterable,
-     Iterator,
-     KeysView,
-     Mapping,
-     MutableMapping,
- )
- from re import Pattern
- from typing import (
-     Any,
-     Optional,
-     TypeVar,
- )
-
- from typing_extensions import Self
-
-
- def _warning(
-     message,
-     category=UserWarning,
-     filename="",
-     lineno=-1,
-     file=None,
-     line="",
- ):
-     """Monkey patch `warnings` to show a UserWarning without the line
-     information of the warning call.
-     """
-     msg = warnings.WarningMessage(
-         message, category, filename, lineno, file, line
-     )
-     print(f"{msg.category.__name__}: {msg.message}")
-
-
- warnings.showwarning = _warning
-
- KT = TypeVar("KT")
- VT = TypeVar("VT")
-
- # NOTE: regex to enforce python variable/attribute syntax
- ALLOWED_VAR_SYNTAX: Pattern = re.compile(r"[a-zA-Z_]\w*")
-
-
- def complies_variable_syntax(name: Any) -> bool:
-     """Checks whether a given object is a string which complies with the
-     python variable syntax.
-     """
-     if not isinstance(name, str) or keyword.iskeyword(name):
-         return False
-     name_cleaned = "".join(re.findall(ALLOWED_VAR_SYNTAX, name))
-     return name_cleaned == name
-
-
- class MetaDict(MutableMapping[KT, VT], dict):
-     """Class that extends `dict` to access and assign keys via attribute dot
-     notation.
-
-     Examples:
-         >>> d = MetaDict({'foo': {'bar': [{'a': 1}, {'a': 2}]}})
-         >>> d.foo.bar[1].a
-         2
-         >>> d["foo"]["bar"][1]["a"]
-         2
-         >>> d.bar = 'demo'
-         >>> d.bar
-         'demo'
-
-     `MetaDict` inherits from MutableMapping to avoid overwriting all `dict`
-     methods. In addition, it inherits from `dict` to pass the quite common
-     `isinstance(obj, dict)` check.
-
-     Also, inheriting from `dict` enables json encoding/decoding without a
-     custom encoder.
-     """
-
-     def __init__(self, *args, nested_assign: bool = False, **kwargs) -> None:
-         # NOTE: check that 'nested_assign' is of type bool
-         if not isinstance(nested_assign, bool):
-             raise TypeError(
-                 "Keyword argument 'nested_assign' must be an instance of "
-                 "type 'bool'"
-             )
-
-         # NOTE: init internal attributes and data store
-         self.__dict__["_data"]: dict[KT, VT] = {}
-         self.__dict__["_nested_assign"] = nested_assign
-         self.__dict__["_parent"] = kwargs.pop("_parent", None)
-         self.__dict__["_key"] = kwargs.pop("_key", None)
-
-         # update state of data store
-         self.update(*args, **kwargs)
-
-         # call `dict` constructor with stored data to enable object encoding
-         # (e.g. `json.dumps()`) that relies on `dict`
-         dict.__init__(self, self._data)
-
-     def __len__(self) -> int:
-         return len(self._data)
-
-     def __iter__(self) -> Iterator[KT]:
-         return iter(self._data)
-
-     def __setitem__(self, key: KT, value: VT) -> None:
-         # show a warning if the assigned key or attribute is used internally
-         # (e.g `items`, `keys`, etc.)
-         try:
-             self.__getattribute__(key)
-             key_is_protected = True
-         except (AttributeError, TypeError):
-             key_is_protected = False
-         if key_is_protected:
-             warnings.warn(
-                 f"'{self.__class__.__name__}' object uses '{key}' internally. "
-                 f"'{key}' can only be accessed via `obj['{key}']`.",
-                 stacklevel=2,
-             )
-
-         # set key recursively
-         self._data[key] = self._from_object(value)
-
-         # update parent when nested keys or attributes are assigned
-         parent = self.__dict__.pop("_parent", None)
-         key = self.__dict__.get("_key", None)
-         if parent is not None:
-             parent[key] = self._data
-
-     def __getitem__(self, key: KT) -> VT:
-         try:
-             value = self._data[key]
-         except KeyError:
-             if self.nested_assign:
-                 return self.__missing__(key)
-             raise
-
-         return value
-
-     def __missing__(self, key: KT) -> Self:
-         return self.__class__(
-             _parent=self, _key=key, nested_assign=self._nested_assign
-         )
-
-     def __delitem__(self, key: KT) -> None:
-         del self._data[key]
-
-     def __setattr__(self, attr: str, val: VT) -> None:
-         self[attr] = val
-
-     def __getattr__(self, key: KT) -> VT:
-         try:
-             return self[key]
-         except KeyError:
-             raise AttributeError(
-                 f"'{self.__class__.__name__}' object has no attribute '{key}'"
-             ) from None
-
-     def __delattr__(self, key: KT) -> None:
-         try:
-             del self[key]
-         except KeyError:
-             raise AttributeError(
-                 f"'{self.__class__.__name__}' object has no attribute '{key}'"
-             ) from None
-
-     def __str__(self) -> str:
-         return str(self._data)
-
-     def __repr__(self) -> str:
-         return repr(self._data)
-
-     @staticmethod
-     def repack_args(cls: type, state: dict) -> MetaDict:
-         """Repack and rename keyword arguments stored in state before feeding
-         them to the class constructor.
-         """
-         _data = state.pop("_data")
-         _nested_assign = state.pop("_nested_assign")
-         return cls(_data, nested_assign=_nested_assign, **state)
-
-     def __reduce__(self) -> tuple:
-         """Return state information for pickling."""
-         return MetaDict.repack_args, (self.__class__, self.__dict__)
-
-     def __dir__(self) -> Iterable[str]:
-         """Extend dir list with accessible dict keys (enables autocompletion
-         when using dot notation)
-         """
-         dict_keys = [
-             key for key in self._data.keys() if complies_variable_syntax(key)
-         ]
-         return dir(type(self)) + dict_keys
-
-     def copy(self) -> Self:
-         return self.__copy__()
-
-     def __copy__(self) -> Self:
-         cls = self.__class__
-         result = cls.__new__(cls)
-         result.__dict__.update(
-             {k: copy.copy(v) for k, v in self.__dict__.items()}
-         )
-         return result
-
-     @classmethod
-     def fromkeys(
-         cls,
-         iterable: Iterable[KT],
-         value: Optional[VT] = None,
-     ) -> Self:
-         """Construct a MetaDict from a keys iterable.
-
-         Examples:
-             >>> def iter_keys() -> Iterable[str]:
-             ...     for i in range(3):
-             ...         yield f"k{i}"
-             >>> MetaDict.fromkeys(iterable=iter_keys())
-             {'k0': None, 'k1': None, 'k2': None}
-         """
-         return cls({key: value for key in iterable})
-
-     def to_dict(self) -> dict:
-         return MetaDict._to_object(self._data)
-
-     @staticmethod
-     def _to_object(obj: Any) -> Any:
-         """Recursively converts all nested MetaDicts to dicts."""
-
-         if isinstance(obj, (list, tuple, set)):
-             if MetaDict._contains_mapping(obj):
-                 value = type(obj)(MetaDict._to_object(x) for x in obj)
-             else:
-                 value = obj
-         elif isinstance(obj, Mapping):
-             value = {k: MetaDict._to_object(v) for k, v in obj.items()}
-         else:
-             value = obj
-
-         return value
-
-     def _from_object(self, obj: Any) -> Any:
-         """Recursively converts all nested dicts to MetaDicts."""
-
-         if isinstance(obj, (list, tuple, set)):
-             if MetaDict._contains_mapping(obj):
-                 value = type(obj)(self._from_object(x) for x in obj)
-             else:
-                 value = obj
-         elif isinstance(obj, MetaDict):
-             value = obj
-         elif isinstance(obj, Mapping):
-             value = self.__class__(
-                 {k: self._from_object(v) for k, v in obj.items()},
-                 nested_assign=self._nested_assign,
-             )
-         else:
-             value = obj
-
-         return value
-
-     def _set_nested_assignment(self, val: bool):
-         self.__dict__["_nested_assign"] = val
-         for value in self.values():
-             if isinstance(value, (list, tuple, set)):
-                 for elem in value:
-                     if isinstance(elem, MetaDict):
-                         elem._set_nested_assignment(val)
-             elif isinstance(value, MetaDict):
-                 value._set_nested_assignment(val)
-
-     def enable_nested_assignment(self):
-         self._set_nested_assignment(True)
-
-     def disable_nested_assignment(self):
-         self._set_nested_assignment(False)
-
-     @contextlib.contextmanager
-     def enabling_nested_assignment(self):
-         """Context manager which temporarily enables nested key/attribute
-         assignment.
-         """
-         nested_assign = self.nested_assign
-         if not nested_assign:
-             self.enable_nested_assignment()
-         try:
-             yield self
-         finally:
-             if not nested_assign:
-                 self.disable_nested_assignment()
-
-     @property
-     def nested_assign(self):
-         return self._nested_assign
-
-     @staticmethod
-     def _contains_mapping(
-         iterable: Iterable, ignore: Optional[type] = None
-     ) -> bool:
-         """Recursively checks whether an Iterable contains an instance of
-         Mapping.
-         """
-         for x in iterable:
-             if isinstance(x, Mapping):
-                 if ignore is None or not isinstance(x, ignore):
-                     return True
-             elif isinstance(x, (list, set, tuple)):
-                 return MetaDict._contains_mapping(x, ignore)
-         return False
-
-     # NOTE: Add the following inherited methods from collections.abc.Mapping
-     # directly to keep PyCharm's type checking happy (removes an annoying
-     # warning for dict unpacking).
-     def __contains__(self, key):
-         try:
-             self[key]
-         except KeyError:
-             return False
-         else:
-             return True
-
-     def keys(self):
-         """D.keys() -> a set-like object providing a view on D's keys"""
-         return KeysView(self)
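
The nested-assignment machinery above (`__missing__`, the `_parent`/`_key` back-references, and the context manager) is easiest to see in action. A short sketch of the behavior the removed code implements:

    d = MetaDict(nested_assign=True)
    d.a.b.c = 1            # intermediate MetaDicts are created on the fly
    print(d)               # {'a': {'b': {'c': 1}}}

    m = MetaDict()
    with m.enabling_nested_assignment():
        m.x.y = "temp"     # allowed only while the context manager is active
    print(m)               # {'x': {'y': 'temp'}}
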