sibi-dst 0.3.20__py3-none-any.whl → 0.3.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +1 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_df_helper.py +34 -33
- sibi_dst/df_helper/_parquet_artifact.py +4 -1
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/django/__init__.py +1 -2
- sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
- sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
- sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
- sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
- sibi_dst/df_helper/backends/http/__init__.py +2 -2
- sibi_dst/df_helper/backends/http/_http_config.py +6 -3
- sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
- sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
- sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
- sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
- sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
- sibi_dst/df_helper/core/_defaults.py +9 -6
- sibi_dst/df_helper/core/_filter_handler.py +7 -4
- sibi_dst/df_helper/core/_params_config.py +3 -2
- sibi_dst/df_helper/core/_query_config.py +0 -2
- sibi_dst/utils/__init__.py +10 -9
- sibi_dst/utils/_airflow_manager.py +4 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -13
- sibi_dst/utils/_credentials.py +1 -1
- sibi_dst/utils/_data_wrapper.py +7 -4
- sibi_dst/utils/_date_utils.py +11 -5
- sibi_dst/utils/_df_utils.py +9 -5
- sibi_dst/utils/_file_utils.py +3 -1
- sibi_dst/utils/_filepath_generator.py +4 -2
- sibi_dst/utils/_log_utils.py +1 -1
- sibi_dst/utils/_parquet_saver.py +0 -2
- sibi_dst/utils/_storage_manager.py +1 -1
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.21.dist-info}/METADATA +1 -1
- sibi_dst-0.3.21.dist-info/RECORD +47 -0
- sibi_dst-0.3.20.dist-info/RECORD +0 -47
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.21.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py
CHANGED
@@ -1,19 +1,21 @@
-import dask.dataframe as dd
-from sqlmodel import Session, select, text
-from typing import Any, Dict, Optional
 import logging
+from typing import Any, Dict, Optional
+
+import dask.dataframe as dd
 import pandas as pd
+from sqlmodel import Session, select, text
+
 
 class SQLModelLoadFromDb:
     df: dd.DataFrame
 
     def __init__(
-            self,
-            db_connection,
-            db_query: Optional[Dict[str, Any]] = None,
-            db_params: Optional[Dict[str, Any]] = None,
-            logger=None,
-            **kwargs,
+        self,
+        db_connection,
+        db_query: Optional[Dict[str, Any]] = None,
+        db_params: Optional[Dict[str, Any]] = None,
+        logger=None,
+        **kwargs,
     ):
         """
         Initialize the loader with database connection, query, and parameters.
@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
         results = session.exec(query).fetchall()
 
         # Convert query results to a Dask DataFrame
-        print("results:",results)
+        print("results:", results)
         if results:
             df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
         else:
@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
         if field_map:
             rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
             if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
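As the first hunks show, the loader fetches all rows eagerly and wraps them in a single-partition Dask DataFrame. A minimal sketch of that conversion; the sample rows stand in for the `r.dict()` output of `session.exec(query).fetchall()`:

```python
import dask.dataframe as dd
import pandas as pd

# Stand-ins for r.dict() of each SQLModel result row.
rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]

# One partition, since everything was already fetched into memory.
df = dd.from_pandas(pd.DataFrame(rows), npartitions=1)
print(df.compute())
```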
sibi_dst/df_helper/core/_defaults.py
CHANGED
@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
     "BooleanField": lambda x: x.astype(bool),
     "NullBooleanField": lambda x: x.astype(bool),
     "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-                                                                            meta=("date", "object")),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                             meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                             meta=("time", "object")),
     "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
     "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
     Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
     Boolean.__name__: lambda x: x.astype(bool),
     DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
-    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-                                                                              meta=("date", "object")),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                               meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                               meta=("time", "object")),
     JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     UUID.__name__: lambda x: x.astype(str),
 }
 
+
 # Conversion map with normalized SQLAlchemy field types
 # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
 #     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):
 
     # Fallback to raw class name
     return field_type.__class__.__name__
-
-
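Both conversion maps gain a time-of-day entry next to the existing date entry, using the same pattern: coerce the column to datetimes, then project the `.dt` accessor per partition, passing `meta` so Dask knows the output dtype without computing. A self-contained sketch of the pattern on sample data (using `dd.to_datetime`; the maps above receive the column as `x`):

```python
import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(pd.Series(["2024-01-15 10:30:00", "not a date"]), npartitions=1)

# Invalid values coerce to NaT; meta declares the output as an object series.
dates = dd.to_datetime(s, errors="coerce").map_partitions(lambda p: p.dt.date, meta=("date", "object"))
times = dd.to_datetime(s, errors="coerce").map_partitions(lambda p: p.dt.time, meta=("time", "object"))
print(dates.compute(), times.compute(), sep="\n")
```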
sibi_dst/df_helper/core/_filter_handler.py
CHANGED
@@ -1,10 +1,13 @@
 import datetime
+
 import dask.dataframe as dd
 import pandas as pd
 from sqlalchemy import func, cast
 from sqlalchemy.sql.sqltypes import Date, Time
+
 from sibi_dst.utils import Logger
 
+
 class FilterHandler:
     def __init__(self, backend, logger=None):
         """
@@ -15,7 +18,8 @@ class FilterHandler:
         logger: Optional logger for debugging purposes.
         """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.logger = logger or Logger.default_logger(
+            logger_name=self.__class__.__name__)  # No-op logger if none provided
         self.backend_methods = self._get_backend_methods(backend)
 
     def apply_filters(self, query_or_df, model=None, filters=None):
@@ -34,7 +38,7 @@ class FilterHandler:
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            #print(field_name, casting, operation, parsed_value)
+            # print(field_name, casting, operation, parsed_value)
             # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)
@@ -67,7 +71,6 @@
 
         return field_name, casting, operation
 
-
     def _parse_filter_value(self, casting, value):
         """
         Convert filter value to appropriate type based on the casting (e.g., date).
@@ -213,4 +216,4 @@
         return [
             "gte", "lte", "gt", "lt", "exact", "in", "range",
             "contains", "startswith", "endswith", "isnull",
-        ]
+        ]
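FilterHandler parses Django-style lookup keys into a field name, an optional cast such as `date`, and one of the operations listed in the final hunk. A simplified, illustrative parser for that convention (not the package's `_parse_filter_key` implementation):

```python
# Casts and operations as visible in this file's hunks.
CASTS = {"date", "time"}
OPS = {"gte", "lte", "gt", "lt", "exact", "in", "range",
       "contains", "startswith", "endswith", "isnull"}

def parse_filter_key(key: str):
    parts = key.split("__")
    field, casting, operation = parts[0], None, "exact"
    for part in parts[1:]:
        if part in CASTS:
            casting = part
        elif part in OPS:
            operation = part
        else:
            field += "__" + part  # nested relation path
    return field, casting, operation

print(parse_filter_key("created_at__date__gte"))  # ('created_at', 'date', 'gte')
```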
sibi_dst/df_helper/core/_params_config.py
CHANGED
@@ -1,7 +1,7 @@
+from typing import Optional, Dict, Union, List
 
 from pydantic import BaseModel, model_validator, Field
 
-from typing import Optional, Dict, Union, List
 dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
     "fieldnames": None,
     "index_col": None,
@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {
 
 LOOKUP_SEP = "__"
 
+
 class ParamsConfig(BaseModel):
     field_map: Optional[Dict] = Field(default_factory=dict)
     legacy_filters: bool = False
@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
             new_filter_field = LOOKUP_SEP.join(new_parts)
             new_filters[new_filter_field] = value
 
-        self.filters = new_filters
+        self.filters = new_filters
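The last hunk shows the tail of the legacy-filter translation: each `__`-separated segment of a filter key is mapped through `field_map` and rejoined with `LOOKUP_SEP`. A hedged sketch of that translation with a hypothetical `field_map`:

```python
LOOKUP_SEP = "__"
field_map = {"client": "customer", "name": "full_name"}  # hypothetical mapping

def translate(filters: dict) -> dict:
    new_filters = {}
    for key, value in filters.items():
        # Map each segment through field_map, leaving unknown segments as-is.
        new_parts = [field_map.get(part, part) for part in key.split(LOOKUP_SEP)]
        new_filters[LOOKUP_SEP.join(new_parts)] = value
    return new_filters

print(translate({"client__name__contains": "acme"}))
# {'customer__full_name__contains': 'acme'}
```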
sibi_dst/utils/__init__.py
CHANGED
@@ -1,18 +1,19 @@
 from __future__ import annotations
+
+from ._airflow_manager import AirflowDAGManager
+from ._clickhouse_writer import ClickHouseWriter
 from ._credentials import *
-from ._log_utils import Logger
-from ._date_utils import *
 from ._data_utils import DataUtils
+from ._data_wrapper import DataWrapper
+from ._date_utils import *
+from ._df_utils import DfUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
-from .
-from ._storage_manager import StorageManager
+from ._log_utils import Logger
 from ._parquet_saver import ParquetSaver
-from .
-from ._data_wrapper import DataWrapper
-from ._airflow_manager import AirflowDAGManager
+from ._storage_manager import StorageManager
 
-__all__=[
+__all__ = [
     "ConfigManager",
     "ConfigLoader",
     "Logger",
@@ -27,4 +28,4 @@ __all__=[
     "DfUtils",
     "ClickHouseWriter",
     "AirflowDAGManager",
-]
+]
sibi_dst/utils/_airflow_manager.py
CHANGED
@@ -1,8 +1,9 @@
 import os
-from jinja2 import Template
 from datetime import datetime
+
 import fsspec
 import httpx
+from jinja2 import Template
 
 """
 A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
@@ -54,8 +55,8 @@ with DAG(
 {% endfor %}
 """
 
-class AirflowDAGManager:
 
+class AirflowDAGManager:
 
     def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
         """
@@ -208,4 +209,4 @@ class AirflowDAGManager:
             return response.json()
         except httpx.RequestError as e:
             print(f"Failed to trigger DAG {dag_id}: {e}")
-            raise
+            raise
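The constructor signature visible in this hunk takes the connection and deployment settings directly. A usage sketch based only on that signature; every value below is a placeholder:

```python
from sibi_dst.utils import AirflowDAGManager

manager = AirflowDAGManager(
    output_dir="./dags_out",                       # where rendered DAG files are written locally
    remote_dags_path="/opt/airflow/dags",          # DAG folder on the Airflow host
    ssh_host="airflow.example.com",
    ssh_user="deploy",
    ssh_password="***",
    url="http://airflow.example.com:8080/api/v1",  # Airflow REST endpoint
    auth=("user", "pass"),
    wrapper_module_path="my_project.wrappers",     # module the generated DAGs import
)
```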
sibi_dst/utils/_clickhouse_writer.py
CHANGED
@@ -1,9 +1,12 @@
+from concurrent.futures import ThreadPoolExecutor
+
 import clickhouse_connect
+import pandas as pd
 from clickhouse_driver import Client
 from dask.dataframe import dd
-
+
 from sibi_dst.utils import Logger
-
+
 
 class ClickHouseWriter:
     dtype_to_clickhouse = {
@@ -19,20 +22,20 @@ class ClickHouseWriter:
     df: dd.DataFrame
 
     def __init__(self, logger=None, **kwargs):
-        self.clickhouse_host = kwargs.setdefault('host',"localhost")
-        self.clickhouse_port = kwargs.setdefault('port',8123)
-        self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
-        self.clickhouse_user = kwargs.setdefault('user','default')
-        self.clickhouse_password = kwargs.setdefault('password','')
-        self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+        self.clickhouse_host = kwargs.setdefault('host', "localhost")
+        self.clickhouse_port = kwargs.setdefault('port', 8123)
+        self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+        self.clickhouse_user = kwargs.setdefault('user', 'default')
+        self.clickhouse_password = kwargs.setdefault('password', '')
+        self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
-        self.order_by=kwargs.setdefault('order_by','id')
+        self.order_by = kwargs.setdefault('order_by', 'id')
 
     def save_to_clickhouse(self, df, **kwargs):
         self.df = df.copy()
-        self.order_by = kwargs.setdefault('order_by',self.order_by)
+        self.order_by = kwargs.setdefault('order_by', self.order_by)
         if len(self.df.head().index) == 0:
             self.logger.debug("Dataframe is empty")
             return
@@ -86,8 +89,8 @@ class ClickHouseWriter:
         if engine is None:
             engine = f"ENGINE = MergeTree() order by {self.order_by}"
         dtypes = self.df.dtypes
-        clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
-        create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+        clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+        create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
         self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
@@ -200,4 +203,4 @@ class ClickHouseWriter:
             with ThreadPoolExecutor() as executor:
                 executor.map(write_partition, partitions, range(len(partitions)))
         except Exception as e:
-            self.logger.error(f"Error during multi-partition write: {e}")
+            self.logger.error(f"Error during multi-partition write: {e}")
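All connection settings arrive through `**kwargs` with the defaults visible above, and the table engine defaults to `MergeTree()` ordered by `order_by`. A usage sketch based on those defaults (connection details are placeholders and the DataFrame is sample data):

```python
import dask.dataframe as dd
import pandas as pd
from sibi_dst.utils import ClickHouseWriter

df = dd.from_pandas(pd.DataFrame({"id": [1, 2], "value": [10.5, 20.1]}), npartitions=1)

# Kwargs mirror the setdefault() keys shown in the hunk above.
writer = ClickHouseWriter(host="localhost", port=8123, database="sibi_data",
                          user="default", password="", table="test_sibi_table",
                          order_by="id")
writer.save_to_clickhouse(df)
```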
sibi_dst/utils/_credentials.py
CHANGED
sibi_dst/utils/_data_wrapper.py
CHANGED
@@ -1,12 +1,15 @@
 import datetime
 from typing import Type, Any, Dict, Optional
+
 import fsspec
 import pandas as pd
 from IPython.display import display
-from sibi_dst.utils import Logger
 from tqdm import tqdm
+
+from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver
 
+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -89,7 +92,7 @@ class DataWrapper:
         # Filter dates in the category where `update_required` is True
         dates_to_process = update_plan_table[
             (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
-            ]["date"].tolist()
+        ]["date"].tolist()
 
         date_iterator = dates_to_process
         if self.show_progress:
@@ -130,7 +133,7 @@ class DataWrapper:
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-        if len(df.index)==0:
+        if len(df.index) == 0:
             self.logger.error("No data found for the specified date.")
             return
 
@@ -194,7 +197,7 @@ class DataWrapper:
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class":self.dataclass.__name__
+                "datawrapper class": self.dataclass.__name__
             })
 
         update_plan_table = pd.DataFrame(rows)
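The hunks above show the update plan's columns (`date`, `update_category`, `update_required`, ...) and how the dates to process are selected from it. A minimal pandas sketch of that selection; the sample rows and category value are illustrative:

```python
import pandas as pd

update_plan_table = pd.DataFrame([
    {"date": "2024-01-01", "update_category": "missing_files", "update_required": True},
    {"date": "2024-01-02", "update_category": "missing_files", "update_required": False},
])
category = "missing_files"  # placeholder category label

# Same boolean-mask selection as in the hunk: category matches AND an update is required.
dates_to_process = update_plan_table[
    (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
]["date"].tolist()
print(dates_to_process)  # ['2024-01-01']
```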
sibi_dst/utils/_date_utils.py
CHANGED
@@ -1,8 +1,9 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict
 
 import numpy as np
 import pandas as pd
+
 from sibi_dst.utils import Logger
 
 
@@ -32,7 +33,8 @@ class DateUtils:
         raise ValueError(f"Unsupported date format: {value}")
 
     @classmethod
-    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[datetime.date, datetime.date]:
+    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+        datetime.date, datetime.date]:
         """
         Calculate the start and end of the week for a given reference date.
         """
@@ -49,7 +51,8 @@
         return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
 
     @classmethod
-    def get_first_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+    def get_first_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the first day of the quarter for a given date.
         """
@@ -58,7 +61,8 @@
         return datetime.date(reference_date.year, 3 * quarter - 2, 1)
 
     @classmethod
-    def get_last_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+    def get_last_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the last day of the quarter for a given date.
         """
@@ -116,10 +120,12 @@ class DateUtils:
            'current_month': lambda: cls.get_month_range(n=0),
            'last_month': lambda: cls.get_month_range(n=-1),
            'current_year': lambda: cls.get_year_timerange(today().year),
-            'current_quarter': lambda: (cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
+            'current_quarter': lambda: (
+                cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
            'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
        }
 
+
 class BusinessDays:
     def __init__(self, holiday_list, logger):
         """
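The quarter helpers rely on the identity visible in the context line `return datetime.date(reference_date.year, 3 * quarter - 2, 1)`: the first month of quarter q is 3q - 2. A worked sketch (the quarter computation itself is an assumption; only the `3 * quarter - 2` step appears in the hunk):

```python
import datetime

reference_date = datetime.date(2024, 8, 17)

# August is month 8, so quarter = ceil(8 / 3) = 3.
quarter = (reference_date.month - 1) // 3 + 1

# First month of Q3 is 3 * 3 - 2 = 7 (July).
first_day = datetime.date(reference_date.year, 3 * quarter - 2, 1)
print(first_day)  # 2024-07-01
```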
sibi_dst/utils/_df_utils.py
CHANGED
@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from ._log_utils import Logger
 
+
 class DfUtils:
     def __init__(self, logger=None):
         """
@@ -210,7 +212,7 @@
         df['Total'] = df.sum(axis=1, numeric_only=True)
         return df
 
-    def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
         """
         Summarizes data by creating a pivot table and resampling.
 
@@ -233,10 +235,12 @@
         df = df.set_index(dd.to_datetime(df.index))
 
         # Group by index and summary columns
-        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(agg_func).reset_index()
+        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+            agg_func).reset_index()
 
         # Pivot the table
-        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+                                          aggfunc='sum').fillna(0)
 
         # Resample
         df_pivot.index = dd.to_datetime(df_pivot.index)
@@ -269,4 +273,4 @@
     Returns:
         DataFrame: Resampled pivot table.
     """
-    return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+    return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
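`summarise_data` groups by a datetime index plus a summary column, pivots, then resamples by `rule`. The same pipeline in plain pandas (the class runs it on Dask; sample data and column names below are illustrative):

```python
import pandas as pd

df = pd.DataFrame(
    {"status": ["open", "closed", "open"], "amount": [1, 2, 3]},
    index=pd.to_datetime(["2024-01-01", "2024-01-01", "2024-01-02"]),
)

# Group by (datetime index, summary column), aggregate the values column.
grouped = df.groupby([df.index, "status"])["amount"].agg("count").reset_index()

# Pivot so each status becomes its own column ("level_0" is the unnamed index key).
pivot = grouped.pivot_table(index="level_0", columns="status", values="amount",
                            aggfunc="sum").fillna(0)

# Resample to the requested rule (daily here).
pivot.index = pd.to_datetime(pivot.index)
print(pivot.resample("D").sum())
```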
sibi_dst/utils/_file_utils.py
CHANGED
@@ -1,10 +1,12 @@
 import shutil
 from pathlib import Path
 from typing import Optional
+
 import fsspec
 
 from sibi_dst.utils import Logger
 
+
 class FileUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,7 +29,7 @@ class FileUtils:
         fs.mkdirs(path)
 
     @staticmethod
-    def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+    def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
         """Construct and return the full path for the parquet file."""
         fs, base_path = fsspec.core.url_to_fs(storage_path)
         parquet_filename = parquet_filename or "default.parquet"

sibi_dst/utils/_filepath_generator.py
CHANGED
@@ -1,7 +1,8 @@
 import datetime
-import fsspec
 import re
 
+import fsspec
+
 from sibi_dst.utils import Logger
 
 
@@ -150,6 +151,7 @@ class FilePathGenerator:
             return datetime.datetime.strptime(date, '%Y-%m-%d')
         return date
 
+
 """
Usage:
# Initialize the generator
@@ -182,4 +184,4 @@ for fp in file_paths:
 
 df_pandas = pd.concat(dataframes, ignore_index=True)
 print(df_pandas.head())
-"""
+"""
sibi_dst/utils/_log_utils.py
CHANGED
sibi_dst/utils/_parquet_saver.py
CHANGED
@@ -1,7 +1,6 @@
 from pathlib import Path
 from typing import Optional
 
-import dask_expr
 import fsspec
 import pyarrow as pa
 
@@ -103,4 +102,3 @@ class ParquetSaver:
         self.df_result.to_parquet(
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
-
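The saver ends by handing Dask's `to_parquet` an explicit pyarrow schema, which pins column types at write time instead of letting pyarrow infer them. A self-contained sketch of that write pattern with sample data (the output path is a placeholder):

```python
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

df = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)

# Explicit schema: column names and arrow types fixed up front.
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
df.to_parquet("example_parquet", engine="pyarrow", schema=schema, write_index=False)
```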
sibi_dst-0.3.21.dist-info/RECORD
ADDED
@@ -0,0 +1,47 @@
+sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
+sibi_dst/df_helper/__init__.py,sha256=aiAu7j1SWDiw3RVI4UJmvLcADP34OfrJTCYpdupPGII,234
+sibi_dst/df_helper/_df_helper.py,sha256=3fibDnRAX4R0v-xgfG87BKLR-ZCg8AZWrKDIO7qbBPM,13953
+sibi_dst/df_helper/_parquet_artifact.py,sha256=nx1wTEyrjARpCCPNwBxYiBROee3CSb6c-u7Cpme_tdk,4978
+sibi_dst/df_helper/_parquet_reader.py,sha256=sbe8DsScNT2h6huNsz8mUxVnUGpJeRzbaONZ3u2sQeQ,1685
+sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sibi_dst/df_helper/backends/django/__init__.py,sha256=Py4XGV8OnWv_6qkdlJ0hhc1_xT_NLMS712N00CJOg0w,270
+sibi_dst/df_helper/backends/django/_django_db_connection.py,sha256=GypF84Ej7ViTID4r3UEReRGmLiyfMtEknPI_NINSm3g,1641
+sibi_dst/df_helper/backends/django/_django_load_from_db.py,sha256=GLsAsuEQD1cXfEm7BuxofZfR32VwZNEfwR9c-AZn-x0,5555
+sibi_dst/df_helper/backends/django/_django_sql_model_builder.py,sha256=xyOq0JY0k9380aBeV66RXbeXL-mF22CczbQQoXLDhuo,14884
+sibi_dst/df_helper/backends/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
+sibi_dst/df_helper/backends/django/_io_dask_alt.py,sha256=HUiThJ2hymh95KBN_I5aWiz8z6STd4C48e1y9_vZd6Y,6829
+sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
+sibi_dst/df_helper/backends/http/_http_config.py,sha256=l6GdzTsknfzyf8LAo_TuIWeiswLRRrLcmqAmirxpH8Q,2132
+sibi_dst/df_helper/backends/parquet/__init__.py,sha256=ed2H3DrhV6wWNc5e1YiP5ScZErt3Hp1dfTDCcDObrgo,190
+sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py,sha256=Q8Ic9PLDGT4L97yqr20mr_NsdEeMMOlFkT7Z12yYCxI,3663
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=5fAv7KzSRvCpW-6ZiXcvrWAyf1KThs1qCgtrzGo3x8A,4503
+sibi_dst/df_helper/backends/sql_alchemy/__init__.py,sha256=072YxHXqV1wn6xo6clrgvh8kpvlOmboIQW9tyOPHXAY,369
+sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py,sha256=UuAHzZWBADsTwGhwZTJzR66Xdh189OR81C1IITwzls0,5620
+sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py,sha256=6Jxj3bhGJadHcJOCXtNo0YZUMO6RNgbtXhlghkfOtW8,1648
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=9z7gUy441T6ww1hpbt3xgxv0jFf8W_zzq0PfkkB126E,2237
+sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py,sha256=Bmhh6VvmBfNfBA2JpuEdsYD_193yJ768Si2TvkY9HmU,4405
+sibi_dst/df_helper/backends/sql_model/__init__.py,sha256=sbulOpXqT0O5k1FsSbhr3pPgi5sV_uJ_hPWpoJa_754,226
+sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py,sha256=ama9oTLBQ2ATUVrX2OGvMT23ia5RblakIgZ7f2O9ZgA,4267
+sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py,sha256=5PTnXN_ZFvtdLnov1CjEFQE77YvuoGwG1dj0KQzrY_k,3714
+sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
+sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK0HNuiN4,7011
+sibi_dst/df_helper/core/_filter_handler.py,sha256=1-IdviSYi5Hc28KckO4dkYHDfQ8X9SUb6kwfobm16_E,8580
+sibi_dst/df_helper/core/_params_config.py,sha256=mM1CnF29zls5LXx7rpKY8uix_GyOG5smO4ry_OX31IU,3477
+sibi_dst/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
+sibi_dst/utils/__init__.py,sha256=5WeBMxhNGB8ZpHUrp1NOQf8Kn0bLOtjrerFjQdFTa7U,787
+sibi_dst/utils/_airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
+sibi_dst/utils/_clickhouse_writer.py,sha256=dL5pixjn4cj0Rwpc3POfCcY2D-aQCMbPSECX0dKATyE,8587
+sibi_dst/utils/_credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
+sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
+sibi_dst/utils/_data_wrapper.py,sha256=BmKFO70xVX3AjpGRzqrc6HS1Uw4xerZDx3IpFrcrIIo,9020
+sibi_dst/utils/_date_utils.py,sha256=CMAZBNwVj7cvERcNiTA8Pf7_5EjV9By9yxkYJpkqz1g,10656
+sibi_dst/utils/_df_utils.py,sha256=NHnEJG9KDeRuqfE60kwBOO21B6Hvjh7PzE5B8cQrIXc,10986
+sibi_dst/utils/_file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
+sibi_dst/utils/_filepath_generator.py,sha256=hjI7gQwfwRToPeuzoUQDayHKQrr4Ivhi4Chl1J4Phlk,6689
+sibi_dst/utils/_log_utils.py,sha256=AvKu5Qmi9LXezA7gdkG7yV-MvzZeav8c3PK8s-DwTGE,2314
+sibi_dst/utils/_parquet_saver.py,sha256=hLrWr1G132y94eLopDPPGQGDsAiR1lQ8id4QQtGYPE4,4349
+sibi_dst/utils/_storage_manager.py,sha256=7nkfeBW_2xlF59pGj7V2aY5TLwpJnPQuPVclqjavJOA,3856
+sibi_dst-0.3.21.dist-info/METADATA,sha256=P0GRxm9kh8V1mSjJgfvBSQplvfTuDnPbZZ71uOzwPJQ,2134
+sibi_dst-0.3.21.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.21.dist-info/RECORD,,
sibi_dst-0.3.20.dist-info/RECORD
DELETED
@@ -1,47 +0,0 @@
-sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
-sibi_dst/df_helper/__init__.py,sha256=rbTr9CqwbJhu8pbZabwfcOqhm-5hm2iXk0vVBtK01bA,231
-sibi_dst/df_helper/_df_helper.py,sha256=e6e32CRTCKjFVvYMytWTuBVpwB1VcnVQ1T4Rg8KXWvY,13917
-sibi_dst/df_helper/_parquet_artifact.py,sha256=ctISmwxP9icFCXsELBjbPiz-FK3CEojN7yNIlStdOWw,4974
-sibi_dst/df_helper/_parquet_reader.py,sha256=A8qWuWQiaiS7pk4sD5EDAvGs-qz7VfziINXpSA7o00U,1683
-sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sibi_dst/df_helper/backends/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
-sibi_dst/df_helper/backends/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
-sibi_dst/df_helper/backends/django/_django_load_from_db.py,sha256=E_6ptiouluyLziXkNy_MztRi36qqW7-3AvlafL78Sug,5592
-sibi_dst/df_helper/backends/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
-sibi_dst/df_helper/backends/django/_io_dask.py,sha256=P3WmkuFzmWRzFchjsVD2OElIR3stuevwDH9G6Mu8IWE,9080
-sibi_dst/df_helper/backends/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
-sibi_dst/df_helper/backends/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
-sibi_dst/df_helper/backends/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
-sibi_dst/df_helper/backends/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
-sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=09b9yLPREvx6ebs62B9qEqJt1cCKJz97plGW82i4630,4414
-sibi_dst/df_helper/backends/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py,sha256=YYhjt5rL1yomcrby4i4bD5wPVDzRJpZZbxHp5CM40tQ,5414
-sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
-sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=QkR-_S4zqJpwH9dJ5cqXW8iy9XoAFUXmcsgUSm3PbLo,2251
-sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py,sha256=RjtKEk-i8EmX98rwqkq1Bg7IgPwYDduL967gsl9T73c,4401
-sibi_dst/df_helper/backends/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
-sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
-sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
-sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
-sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
-sibi_dst/df_helper/core/_filter_handler.py,sha256=SYZqpX4Vt6GAGR0L0LohlDOdjLLWQXJDiWWqFG-lSu0,8563
-sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
-sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
-sibi_dst/utils/__init__.py,sha256=TV229dPIIEzU5qCLI1G6fnCZW-VirUwSuffp7z7OTFg,783
-sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
-sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
-sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
-sibi_dst/utils/_data_wrapper.py,sha256=cvUkGRiPfCyLD4XcoX7FWLYzM8gnHBGR1pJ08PMneCk,9010
-sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
-sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
-sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
-sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
-sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
-sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
-sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.20.dist-info/METADATA,sha256=Bum7DGRTuXnOPHglo8OKh1jCt0pITgvoyjSfetGXpO4,2134
-sibi_dst-0.3.20.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.20.dist-info/RECORD,,
{sibi_dst-0.3.20.dist-info → sibi_dst-0.3.21.dist-info}/WHEEL
File without changes