ddeutil-workflow 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
+ import logging
+ import math
+
+ try:
+     import pandas as pd
+
+     logging.debug(f"Pandas version: {pd.__version__}")
+ except ImportError as err:
+     raise ImportError(
+         "``split_iterable`` function requires the pandas package, which is "
+         "not installed in your interpreter."
+     ) from err
+
+
+ def split_iterable(iterable, chunk_size=None, generator_flag: bool = True):
+     """Split an iterable into mini-batches with a batch length of
+     ``chunk_size``. It also supports batching a pandas DataFrame.
+
+     Usage:
+         >>> for i in split_iterable([1, 2, 3, 4, 5], chunk_size=2):
+         ...     print(i)
+         [1, 2]
+         [3, 4]
+         [5]
+
+         for idx, mini_data in enumerate(split_iterable(df, chunk_size=10)):
+             print(idx)
+             print(mini_data)
+     """
+     chunk_size: int = chunk_size or 25000
+     num_chunks = math.ceil(len(iterable) / chunk_size)
+
+     # NOTE: Use an inner generator so the non-generator branch can really
+     #   return a list; a ``return`` inside a generator body only stops
+     #   iteration, which would make ``generator_flag=False`` yield nothing.
+     def _gen():
+         for i in range(num_chunks):
+             if isinstance(iterable, pd.DataFrame):
+                 yield iterable.iloc[i * chunk_size : (i + 1) * chunk_size]
+             else:
+                 yield iterable[i * chunk_size : (i + 1) * chunk_size]
+
+     if generator_flag:
+         return _gen()
+     return list(_gen())
+
+
+ def chunks(dataframe: pd.DataFrame, n: int):
+     """Yield successive n-sized chunks from dataframe."""
+     for i in range(0, len(dataframe), n):
+         yield dataframe.iloc[i : i + n]
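
For orientation, a minimal sketch of how these helpers behave once the hunk above is applied (the `df` below is a hypothetical example frame, not part of the package):

    import pandas as pd

    df = pd.DataFrame({"x": range(5)})

    # A plain sequence is sliced into lists of at most ``chunk_size`` items.
    assert list(split_iterable([1, 2, 3, 4, 5], chunk_size=2)) == [[1, 2], [3, 4], [5]]

    # A DataFrame is sliced positionally with ``.iloc``; two rows per chunk here.
    for mini in chunks(df, n=2):
        print(len(mini))  # prints 2, 2, 1
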
@@ -5,29 +5,55 @@
  # ------------------------------------------------------------------------------
  from __future__ import annotations

+ import logging
  from typing import Any
  from uuid import uuid4

- import polars as pl
+ try:
+     import polars as pl
+
+     logging.debug(f"Polars version: {pl.__version__}")
+ except ImportError:
+     raise ImportError(
+         "Please install polars if you want to use any related task"
+     ) from None
  import pyarrow.parquet as pq
- from ddeutil.workflow.dataset import PolarsCsv, PolarsParq
+ from ddeutil.workflow.utils import tag
+ from ddeutil.workflow.vendors.pl import PolarsCsv, PolarsParq
+
+
+ def polars_dtype():
+     return {
+         "str": pl.Utf8,
+         "int": pl.Int32,
+     }


+ @tag("polars-dir", name="el-csv-to-parquet")
  def csv_to_parquet_dir(
      source: str,
      sink: str,
      conversion: dict[str, Any] | None = None,
- ):
+ ) -> dict[str, int]:
+     """Extract and load data from a CSV file to a Parquet file.
+
+     :param source: A source CSV dataset config name.
+     :param sink: A sink Parquet dataset config name.
+     :param conversion: A mapping of source column name to a casting spec
+         that carries ``name`` and ``type`` attributes.
+     """
      print("Start EL for CSV to Parquet with Polars Engine")
      print("---")
      # STEP 01: Read the source data to Polars.
      src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
-     src_df = src_dataset.load()
+     src_df: pl.DataFrame = src_dataset.load()
      print(src_df)

      # STEP 02: Schema conversion on Polars DataFrame.
      conversion: dict[str, Any] = conversion or {}
      if conversion:
+         src_df = src_df.with_columns(
+             *[
+                 pl.col(c).cast(col.type).alias(col.name)
+                 for c, col in conversion.items()
+             ]
+         )
          print("Start Schema Conversion ...")

      # STEP 03: Write data to parquet file format.
@@ -39,3 +65,28 @@ def csv_to_parquet_dir(
          basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
      )
      return {"records": src_df.select(pl.len()).item()}
+
+
+ @tag("polars-dir-scan", name="el-csv-to-parquet")
+ def csv_to_parquet_dir_scan(
+     source: str,
+     sink: str,
+     conversion: dict[str, Any] | None = None,
+ ) -> dict[str, int]:
+     print("Start EL for CSV to Parquet with Polars Engine")
+     print("---")
+     # STEP 01: Scan the source data to Polars lazily.
+     src_dataset: PolarsCsv = PolarsCsv.from_loader(name=source, externals={})
+     src_df: pl.LazyFrame = src_dataset.scan()
+
+     if conversion:
+         ...
+
+     sink = PolarsParq.from_loader(name=sink, externals={})
+     pq.write_to_dataset(
+         table=src_df.collect().to_arrow(),
+         root_path=f"{sink.conn.endpoint}/{sink.object}",
+         compression="snappy",
+         basename_template=f"{sink.object}-{uuid4().hex}-{{i}}.snappy.parquet",
+     )
+     return {"records": src_df.select(pl.len()).collect().item()}
@@ -0,0 +1,180 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+ # Licensed under the MIT License. See LICENSE in the project root for
+ # license information.
+ # ------------------------------------------------------------------------------
+ from __future__ import annotations
+
+ import inspect
+ from abc import ABC, abstractmethod
+ from datetime import date, datetime
+ from functools import wraps
+ from importlib import import_module
+ from typing import Any, Callable, Literal, Optional, Protocol, Union
+
+ from ddeutil.core import lazy
+ from ddeutil.io.models.lineage import dt_now
+ from pydantic import BaseModel, Field
+ from pydantic.functional_validators import model_validator
+ from typing_extensions import Self
+
+
+ class TagFunc(Protocol):
+     """Tag Function Protocol"""
+
+     name: str
+     tag: str
+
+     def __call__(self, *args, **kwargs): ...
+
+
+ def tag(tag_value: str, name: str | None = None):
+     """Tag decorator function that sets the ``tag`` and ``name`` attributes
+     on a function for making the registries variable.
+
+     :param tag_value: A tag value for making different use-cases of a function.
+     :param name: A name that will be kept in the registries.
+     """
+
+     def func_internal(func: TagFunc):
+         func.tag = tag_value
+         func.name = name or func.__name__.replace("_", "-")
+
+         @wraps(func)
+         def wrapped(*args, **kwargs):
+             return func(*args, **kwargs)
+
+         return wrapped
+
+     return func_internal
+
+
+ def make_registry(module: str) -> dict[str, dict[str, Callable[[], TagFunc]]]:
+     """Return registries of all functions that are able to be called as a
+     task from the given module."""
+     rs: dict[str, dict[str, Callable[[], Callable]]] = {}
+     for fstr, func in inspect.getmembers(
+         import_module(module), inspect.isfunction
+     ):
+         if not hasattr(func, "tag"):
+             continue
+
+         if func.name in rs:
+             if func.tag in rs[func.name]:
+                 raise ValueError(
+                     f"The tag {func.tag!r} already exists on module {module}"
+                 )
+             rs[func.name][func.tag] = lazy(f"{module}.{fstr}")
+             continue
+
+         # NOTE: Create a new register name if it does not exist.
+         rs[func.name] = {func.tag: lazy(f"{module}.{fstr}")}
+     return rs
+
+
+ class BaseParams(BaseModel, ABC):
+     """Base Parameter model that is used to make a Params Model."""
+
+     desc: Optional[str] = None
+     required: bool = True
+     type: str
+
+     @abstractmethod
+     def receive(self, value: Optional[Any] = None) -> Any:
+         raise ValueError(
+             "Receive a value and validate typing before returning a valid "
+             "value."
+         )
+
+
+ class DefaultParams(BaseParams):
+     """Default Parameter that will check the default value when the
+     parameter is not required."""
+
+     default: Optional[str] = None
+
+     @abstractmethod
+     def receive(self, value: Optional[Any] = None) -> Any:
+         raise ValueError(
+             "Receive a value and validate typing before returning a valid "
+             "value."
+         )
+
+     @model_validator(mode="after")
+     def check_default(self) -> Self:
+         if not self.required and self.default is None:
+             raise ValueError(
+                 "A default value should be set when this parameter is not "
+                 "required."
+             )
+         return self
+
+
+ class DatetimeParams(DefaultParams):
+     """Datetime parameter."""
+
+     type: Literal["datetime"] = "datetime"
+     required: bool = False
+     default: datetime = Field(default_factory=dt_now)
+
+     def receive(self, value: str | datetime | date | None = None) -> datetime:
+         if value is None:
+             return self.default
+
+         if isinstance(value, datetime):
+             return value
+         elif isinstance(value, date):
+             return datetime(value.year, value.month, value.day)
+         elif not isinstance(value, str):
+             raise ValueError(
+                 f"A value that wants to convert to datetime does not support "
+                 f"type: {type(value)}"
+             )
+         return datetime.fromisoformat(value)
+
+
+ class StrParams(DefaultParams):
+     """String parameter."""
+
+     type: Literal["str"] = "str"
+
+     def receive(self, value: Optional[str] = None) -> str | None:
+         if value is None:
+             return self.default
+         return str(value)
+
+
+ class IntParams(DefaultParams):
+     """Integer parameter."""
+
+     type: Literal["int"] = "int"
+
+     def receive(self, value: Optional[int] = None) -> int | None:
+         if value is None:
+             return self.default
+         if not isinstance(value, int):
+             try:
+                 return int(str(value))
+             except (TypeError, ValueError) as err:
+                 raise ValueError(
+                     f"A value that wants to convert to integer does not "
+                     f"support type: {type(value)}"
+                 ) from err
+         return value
+
+
+ class ChoiceParams(BaseParams):
+     """Choice parameter."""
+
+     type: Literal["choice"] = "choice"
+     options: list[str]
+
+     def receive(self, value: Optional[str] = None) -> str:
+         """Receive a value that matches with the options."""
+         # NOTE:
+         #   Return the first value in options if an input value was not passed.
+         if value is None:
+             return self.options[0]
+         if value not in self.options:
+             raise ValueError(f"{value!r} does not match any value in options")
+         return value
+
+
+ Params = Union[
+     ChoiceParams,
+     DatetimeParams,
+     IntParams,
+     StrParams,
+ ]
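
A short sketch of how the pieces in this file compose (the module path `tests.tasks.dummy` is hypothetical; `make_registry` only needs an importable module whose functions carry the decorator):

    # tests/tasks/dummy.py (hypothetical module)
    @tag("demo", name="say-hello")
    def say_hello(who: str) -> str:
        return f"hello {who}"

    # The registry maps name -> tag -> lazy importer; calling the importer
    # resolves the function itself.
    registry = make_registry("tests.tasks.dummy")
    func = registry["say-hello"]["demo"]()
    print(func("workflow"))                            # hello workflow

    # Param models validate caller input according to their declared type.
    print(DatetimeParams().receive("2024-01-31"))      # datetime(2024, 1, 31, 0, 0)
    print(ChoiceParams(options=["a", "b"]).receive())  # "a" (first option)
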
@@ -0,0 +1,127 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+ # Licensed under the MIT License. See LICENSE in the project root for
+ # license information.
+ # ------------------------------------------------------------------------------
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Annotated, Any, Optional
+
+ from fmtutil import Datetime, FormatterGroupType, make_group
+ from fmtutil.utils import escape_fmt_group
+ from pydantic import BaseModel, Field
+ from typing_extensions import Self
+
+ from ..__types import DictData, TupleStr
+ from ..conn import SubclassConn
+ from ..loader import Loader
+
+ EXCLUDED_EXTRAS: TupleStr = ("type",)
+ OBJ_FMTS: FormatterGroupType = make_group({"datetime": Datetime})
+
+
+ class BaseDataset(BaseModel):
+     """Base Dataset Model. This model implements only the loading
+     constructor."""
+
+     conn: Annotated[SubclassConn, Field(description="Connection Model")]
+     endpoint: Annotated[
+         Optional[str],
+         Field(description="Endpoint of connection"),
+     ] = None
+     object: str = Field(description="Dataset object that wants to contract")
+     features: list = Field(default_factory=list)
+     extras: dict[str, Any] = Field(default_factory=dict)
+
+     @classmethod
+     def from_loader(
+         cls,
+         name: str,
+         externals: DictData,
+     ) -> Self:
+         """Construct a Dataset model from a Loader object with a specific
+         config name.
+
+         :param name: A name of the dataset that wants to load from the
+             config file.
+         :param externals: An external parameters.
+         """
+         loader: Loader = Loader(name, externals=externals)
+
+         # NOTE: Validate the config type matches with the current dataset model.
+         if loader.type != cls:
+             raise ValueError(f"Type {loader.type} does not match with {cls}")
+
+         filter_data: DictData = {
+             k: loader.data.pop(k)
+             for k in loader.data.copy()
+             if k not in cls.model_fields and k not in EXCLUDED_EXTRAS
+         }
+
+         if "conn" not in loader.data:
+             raise ValueError("Dataset config does not set the ``conn`` value")
+
+         # NOTE: Start loading the connection config.
+         conn_name: str = loader.data.pop("conn")
+         conn_loader: Loader = Loader(conn_name, externals=externals)
+         conn_model: SubclassConn = conn_loader.type.from_loader(
+             name=conn_name, externals=externals
+         )
+
+         # NOTE: Override the ``endpoint`` value on the connection model.
+         if "endpoint" in loader.data:
+             # NOTE: Update the endpoint path without the Pydantic validator.
+             conn_model.__dict__["endpoint"] = loader.data["endpoint"]
+         else:
+             loader.data.update({"endpoint": conn_model.endpoint})
+         return cls.model_validate(
+             obj={
+                 "extras": (
+                     loader.data.pop("extras", {}) | filter_data | externals
+                 ),
+                 "conn": conn_model,
+                 **loader.data,
+             }
+         )
+
+
+ class Dataset(BaseDataset):
+
+     def exists(self) -> bool:
+         raise NotImplementedError("The exists method is not implemented")
+
+     def format_object(
+         self,
+         _object: str | None = None,
+         dt: str | datetime | None = None,
+     ) -> str:
+         """Format the object value with the datetime formatter."""
+         if dt is None:
+             dt = datetime.now()
+         dt: datetime = (
+             dt if isinstance(dt, datetime) else datetime.fromisoformat(dt)
+         )
+         return (
+             OBJ_FMTS({"datetime": dt})
+             .format(escape_fmt_group(_object or self.object))
+             .replace("\\", "")
+         )
+
+
+ class FlDataset(Dataset):
+
+     def exists(self) -> bool:
+         return self.conn.find_object(self.object)
+
+
+ class TblDataset(Dataset):
+
+     def exists(self) -> bool:
+         return self.conn.find_object(self.object)
+
+
+ class FlDataFrame(Dataset):
+
+     def exists(self) -> bool:
+         return self.conn.find_object(self.object)
+
+
+ class TblDataFrame(Dataset): ...
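
For reference, a minimal sketch of the ``format_object`` templating, assuming fmtutil resolves a ``{datetime:...}`` group with strftime-style codes and that a dataset config named `ds_sales` exists with an object template like `sales_{datetime:%Y%m%d}` (both are assumptions, not shown in this diff):

    ds = FlDataFrame.from_loader(name="ds_sales", externals={})
    print(ds.format_object(dt="2024-01-31"))
    # -> "sales_20240131" under the assumed template above
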
@@ -0,0 +1,13 @@
+ class PandasCSV: ...
+
+
+ class PandasJson: ...
+
+
+ class PandasParq: ...
+
+
+ class PandasDb: ...
+
+
+ class PandasExcel: ...
@@ -0,0 +1,11 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+ # Licensed under the MIT License. See LICENSE in the project root for
+ # license information.
+ # ------------------------------------------------------------------------------
+ from __future__ import annotations
+
+ from .__dataset import TblDataset
+
+
+ class PostgresTbl(TblDataset): ...
@@ -6,12 +6,10 @@
  from __future__ import annotations

  from datetime import datetime
- from typing import Annotated, Any, Optional
+ from typing import Any, Optional

  from fmtutil import Datetime, FormatterGroupType, make_group
- from fmtutil.utils import escape_fmt_group
  from pydantic import BaseModel, Field
- from typing_extensions import Self

  try:
      import polars as pl
@@ -20,137 +18,11 @@ except ImportError:
          "Please install polars package\n\t\t$ pip install polars"
      ) from None

- from .__types import DictData, TupleStr
- from .conn import SubclassConn
- from .loader import Loader
+ from ..__types import TupleStr
+ from .__dataset import FlDataFrame, TblDataFrame

  EXCLUDED_EXTRAS: TupleStr = ("type",)
- OBJ_FMTS: FormatterGroupType = make_group(
-     {
-         "datetime": Datetime,
-     }
- )
-
-
- class BaseDataset(BaseModel):
-     """Base Dataset Model. This model implement only loading constructor."""
-
-     conn: Annotated[SubclassConn, Field(description="Connection Model")]
-     endpoint: Annotated[
-         Optional[str],
-         Field(description="Endpoint of connection"),
-     ] = None
-     object: str
-     features: list = Field(default_factory=list)
-     extras: dict[str, Any] = Field(default_factory=dict)
-
-     @classmethod
-     def from_loader(
-         cls,
-         name: str,
-         externals: DictData,
-     ) -> Self:
-         """Construct Connection with Loader object with specific config name.
-
-         :param name: A name of dataset that want to load from config file.
-         :param externals: An external parameters.
-         """
-         loader: Loader = Loader(name, externals=externals)
-
-         # NOTE: Validate the config type match with current dataset model
-         if loader.type != cls:
-             raise ValueError(f"Type {loader.type} does not match with {cls}")
-
-         filter_data: DictData = {
-             k: loader.data.pop(k)
-             for k in loader.data.copy()
-             if k not in cls.model_fields and k not in EXCLUDED_EXTRAS
-         }
-
-         if "conn" not in loader.data:
-             raise ValueError("Dataset config does not set ``conn`` value")
-
-         # NOTE: Start loading connection config
-         conn_name: str = loader.data.pop("conn")
-         conn_loader: Loader = Loader(conn_name, externals=externals)
-         conn_model: SubclassConn = conn_loader.type.from_loader(
-             name=conn_name, externals=externals
-         )
-
-         # NOTE: Override ``endpoint`` value to getter connection data.
-         if "endpoint" in loader.data:
-             # NOTE: Update endpoint path without Pydantic validator.
-             conn_model.__dict__["endpoint"] = loader.data["endpoint"]
-         else:
-             loader.data.update({"endpoint": conn_model.endpoint})
-         return cls.model_validate(
-             obj={
-                 "extras": (
-                     loader.data.pop("extras", {}) | filter_data | externals
-                 ),
-                 "conn": conn_model,
-                 **loader.data,
-             }
-         )
-
-
- class Dataset(BaseDataset):
-
-     def exists(self) -> bool:
-         raise NotImplementedError("Object exists does not implement")
-
-     def format_object(
-         self,
-         _object: str | None = None,
-         dt: str | datetime | None = None,
-     ) -> str:
-         """Format the object value that implement datetime"""
-         if dt is None:
-             dt = datetime.now()
-         dt: datetime = (
-             dt if isinstance(dt, datetime) else datetime.fromisoformat(dt)
-         )
-         return (
-             OBJ_FMTS({"datetime": dt})
-             .format(escape_fmt_group(_object or self.object))
-             .replace("\\", "")
-         )
-
-
- class FlDataset(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class TblDataset(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class FlDataFrame(Dataset):
-
-     def exists(self) -> bool:
-         return self.conn.find_object(self.object)
-
-
- class TblDataFrame(Dataset): ...
-
-
- class PandasCSV: ...
-
-
- class PandasJson: ...
-
-
- class PandasParq: ...
-
-
- class PandasDb: ...
-
-
- class PandasExcel: ...
+ OBJ_FMTS: FormatterGroupType = make_group({"datetime": Datetime})


  class PolarsCsvArgs(BaseModel):
@@ -297,10 +169,4 @@ class PolarsParq(FlDataFrame):
      )


- class PostgresTbl(TblDataset): ...
-
-
- class SqliteTbl(TblDataset): ...
-
-
  class PolarsPostgres(TblDataFrame): ...
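
Putting the refactor together: the vendor models now inherit the loading constructor from `.__dataset`, so a subclass like `PolarsPostgres` is constructed the same way as before the move. A hedged sketch (the config names are hypothetical; the shape follows the `conn` and `object` fields that `from_loader` reads above):

    # Hypothetical config entries resolved by Loader:
    #   conn_pg:  a connection entry whose type subclasses the Conn model
    #   ds_sales: {type: PolarsPostgres, conn: "conn_pg", object: "sales"}
    tbl = PolarsPostgres.from_loader(name="ds_sales", externals={})
    print(tbl.conn.endpoint)  # endpoint resolved from the linked connection
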