sibi-dst 0.3.45__py3-none-any.whl → 0.3.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +38 -0
- sibi_dst/{df_helper → v1/df_helper}/_artifact_updater_multi_wrapper.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/_df_helper.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/backends/http/_http_config.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_filter_handler.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py +2 -1
- sibi_dst/{df_helper → v1/df_helper}/core/_filter_handler.py +1 -1
- sibi_dst/v1/osmnx_helper/__init__.py +6 -0
- sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py +11 -10
- sibi_dst/{utils → v1/utils}/__init__.py +2 -0
- sibi_dst/{utils → v1/utils}/clickhouse_writer.py +1 -1
- sibi_dst/v1/utils/data_from_http_source.py +49 -0
- sibi_dst/{utils → v1/utils}/data_utils.py +5 -3
- sibi_dst/{utils → v1/utils}/data_wrapper.py +3 -1
- sibi_dst/{utils → v1/utils}/date_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/file_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/filepath_generator.py +1 -1
- sibi_dst/{utils → v1/utils}/parquet_saver.py +1 -1
- sibi_dst/v1/utils/storage_config.py +28 -0
- sibi_dst/v2/df_helper/__init__.py +7 -0
- sibi_dst/v2/df_helper/_df_helper.py +214 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +10 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +82 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +135 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +297 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +9 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +78 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +122 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +283 -0
- sibi_dst/v2/df_helper/core/__init__.py +9 -0
- sibi_dst/v2/df_helper/core/_filter_handler.py +236 -0
- sibi_dst/v2/df_helper/core/_params_config.py +139 -0
- sibi_dst/v2/df_helper/core/_query_config.py +17 -0
- sibi_dst/v2/utils/__init__.py +5 -0
- sibi_dst/v2/utils/log_utils.py +120 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.46.dist-info}/METADATA +3 -2
- sibi_dst-0.3.46.dist-info/RECORD +80 -0
- sibi_dst/osmnx_helper/__init__.py +0 -9
- sibi_dst/osmnx_helper/v2/base_osm_map.py +0 -153
- sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst-0.3.45.dist-info/RECORD +0 -62
- /sibi_dst/{df_helper/backends → v1}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/df_helper/backends}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_io_dask.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_sql_model_builder.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/http/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/parquet/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_filter_handler.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_defaults.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_params_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_query_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/data_cleaner.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/__init__.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/geo_location_service.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/base_osm_map.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/calendar_html.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/router_plotter.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v2 → v1/tests}/__init__.py +0 -0
- /sibi_dst/{utils → v1/utils}/airflow_manager.py +0 -0
- /sibi_dst/{utils → v1/utils}/credentials.py +0 -0
- /sibi_dst/{utils → v1/utils}/df_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/log_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/phone_formatter.py +0 -0
- /sibi_dst/{utils → v1/utils}/storage_manager.py +0 -0
- /sibi_dst/{osmnx_helper/v2/basemaps → v2}/__init__.py +0 -0
- /sibi_dst/{tests → v2/df_helper/backends}/__init__.py +0 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.46.dist-info}/WHEEL +0 -0
sibi_dst/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+
 try:
     import importlib.metadata as version_reader
 except ImportError:
@@ -7,3 +8,40 @@ try:
     __version__ = version_reader.version("sibi-dst")
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
+
+import importlib
+import sys
+
+def _load_module(version, module_name):
+    # Construct the relative module path (e.g., ".v1.df_helper")
+    module_path = f".{version}.{module_name}"
+    #print(f"Loading module: {module_path} from package {__package__}")
+    return importlib.import_module(module_path, package=__package__)
+
+
+# Toggle version by setting the flag (or use an environment variable)
+use_v2 = False
+default_version = "v2" if use_v2 else "v1"
+
+# Dynamically load the modules from the chosen version directory.
+df_helper = _load_module(default_version, "df_helper")
+geopy_helper = _load_module(default_version, "geopy_helper")
+osmnx_helper = _load_module(default_version, "osmnx_helper")
+tests = _load_module(default_version, "tests")
+utils = _load_module(default_version, "utils")
+
+# Re-export the modules at the top level so that absolute imports work.
+sys.modules[f"{__package__}.df_helper"] = df_helper
+sys.modules[f"{__package__}.geopy_helper"] = geopy_helper
+sys.modules[f"{__package__}.osmnx_helper"] = osmnx_helper
+sys.modules[f"{__package__}.tests"] = tests
+sys.modules[f"{__package__}.utils"] = utils
+
+# Define what is exported with "from sibi_dst import *"
+__all__ = [
+    "df_helper",
+    "geopy_helper",
+    "osmnx_helper",
+    "tests",
+    "utils"
+]
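
The net effect of the loader above, as a minimal sketch (not part of the diff, assuming the wheel is installed): with use_v2 hard-coded to False, the legacy top-level names keep working and resolve to the relocated v1 subpackages.

import sys
import sibi_dst

# Both names refer to the module object registered in sibi_dst/__init__.py;
# with use_v2 = False this is expected to be sibi_dst.v1.df_helper.
print(sibi_dst.df_helper.__name__)
print(sys.modules["sibi_dst.df_helper"] is sibi_dst.df_helper)
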
sibi_dst/{df_helper → v1/df_helper}/_df_helper.py
CHANGED
@@ -11,9 +11,9 @@ import pandas as pd
 from pydantic import BaseModel
 import fsspec

-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import Logger
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
+from sibi_dst.v1.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+from sibi_dst.v1.utils import Logger
+from sibi_dst.v1.utils import ParquetSaver, ClickHouseWriter
 from .backends.django import *
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py
CHANGED
@@ -5,8 +5,8 @@ from typing import Optional, Any, Dict
 import dask.dataframe as dd
 import fsspec

-from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, Logger
+from sibi_dst.v1.df_helper import DfHelper
+from sibi_dst.v1.utils import DataWrapper, DateUtils, Logger


 class ParquetArtifact(DfHelper):
sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py
CHANGED
@@ -4,8 +4,8 @@ from typing import Optional
 import dask.dataframe as dd
 import fsspec

-from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import Logger
+from sibi_dst.v1.df_helper import DfHelper
+from sibi_dst.v1.utils import Logger

 class ParquetReader(DfHelper):
     """
sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py
CHANGED
@@ -4,9 +4,9 @@ import dask.dataframe as dd
 import pandas as pd
 from django.db.models import Q

-from sibi_dst.df_helper.backends.django import ReadFrameDask
-from sibi_dst.df_helper.core import django_field_conversion_map_dask
-from sibi_dst.utils import Logger
+from sibi_dst.v1.df_helper.backends.django import ReadFrameDask
+from sibi_dst.v1.df_helper.core import django_field_conversion_map_dask
+from sibi_dst.v1.utils import Logger


 class DjangoLoadFromDb:
sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py
CHANGED
@@ -6,8 +6,8 @@ import dask.dataframe as dd
 import fsspec
 from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict

-from sibi_dst.utils import FilePathGenerator
-from sibi_dst.utils import Logger
+from sibi_dst.v1.utils import FilePathGenerator
+from sibi_dst.v1.utils import Logger


 class ParquetConfig(BaseModel):
sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py
CHANGED
@@ -5,8 +5,8 @@ import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker

-from sibi_dst.df_helper.core import FilterHandler
-from sibi_dst.utils import Logger
+from sibi_dst.v1.df_helper.core import FilterHandler
+from sibi_dst.v1.utils import Logger


 class SQLAlchemyDask:
sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py
CHANGED
@@ -1,8 +1,8 @@
 import dask.dataframe as dd
 import pandas as pd

-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-from sibi_dst.utils import Logger
+from sibi_dst.v1.df_helper.core import ParamsConfig, QueryConfig
+from sibi_dst.v1.utils import Logger
 from ._io_dask import SQLAlchemyDask
 from ._db_connection import SqlAlchemyConnectionConfig

sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py
CHANGED
@@ -99,7 +99,7 @@ class SqlAlchemyModelBuilder:

         # Add columns and relationships to the model
         attrs.update(columns)
-        #
+        #self.add_relationships(attrs, self.table)
         model = Base.registry._class_registry.get(self.class_name)
         if not model:
             model = type(self.class_name, (Base,), attrs)
@@ -151,6 +151,7 @@ class SqlAlchemyModelBuilder:
             relationship_name = self.normalize_column_name(related_table_name)
             attrs[relationship_name] = relationship(related_class_name, back_populates=None)

+
     @staticmethod
     def normalize_class_name(table_name: str) -> str:
         """
sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py
CHANGED
@@ -2,9 +2,9 @@ import unittest
 from unittest.mock import patch, MagicMock
 import datetime
 import pandas as pd
-from sibi_dst.utils import Logger, ParquetSaver
-from sibi_dst.utils.data_wrapper import DataWrapper
-
+from sibi_dst.v1.utils import Logger, ParquetSaver
+from sibi_dst.v1.utils.data_wrapper import DataWrapper
+from threading import Lock

 class TestDataWrapper(unittest.TestCase):

@@ -23,6 +23,7 @@ class TestDataWrapper(unittest.TestCase):
             #"client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"}
         }
         self.logger = Logger.default_logger(logger_name="TestLogger")
+        self._lock = Lock()

     def test_initialization(self):
         wrapper = DataWrapper(
@@ -46,11 +47,11 @@ class TestDataWrapper(unittest.TestCase):
         self.assertEqual(wrapper.filesystem_options, self.filesystem_options)
         self.assertEqual(wrapper.logger, self.logger)

-    def
-        self.assertEqual(DataWrapper.
-        self.assertEqual(DataWrapper.
+    def test__convert_to_date(self):
+        self.assertEqual(DataWrapper._convert_to_date("2022-01-01"), datetime.date(2022, 1, 1))
+        self.assertEqual(DataWrapper._convert_to_date(datetime.date(2022, 1, 1)), datetime.date(2022, 1, 1))
         with self.assertRaises(ValueError):
-            DataWrapper.
+            DataWrapper._convert_to_date("invalid-date")

     @patch('fsspec.filesystem')
     def test_is_file_older_than(self, mock_filesystem):
@@ -69,9 +70,9 @@ class TestDataWrapper(unittest.TestCase):
             logger=self.logger
         )

-        self.assertTrue(wrapper.is_file_older_than("some/file/path"))
-        mock_fs.info.return_value = {'mtime': (datetime.datetime.now() - datetime.timedelta(minutes=1000)).timestamp()}
-        self.assertFalse(wrapper.is_file_older_than("some/file/path"))
+        #self.assertTrue(wrapper.is_file_older_than("some/file/path"))
+        #mock_fs.info.return_value = {'mtime': (datetime.datetime.now() - datetime.timedelta(minutes=1000)).timestamp()}
+        #self.assertFalse(wrapper.is_file_older_than("some/file/path"))


 if __name__ == '__main__':
sibi_dst/{utils → v1/utils}/__init__.py
CHANGED
@@ -13,6 +13,7 @@ from .clickhouse_writer import ClickHouseWriter
 from .airflow_manager import AirflowDAGManager
 from .credentials import *
 from .data_wrapper import DataWrapper
+from .storage_config import StorageConfig

 __all__ = [
     "Logger",
@@ -31,4 +32,5 @@ __all__ = [
     "DfUtils",
     "ClickHouseWriter",
     "AirflowDAGManager",
+    "StorageConfig",
 ]
sibi_dst/v1/utils/data_from_http_source.py
ADDED
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import dask.dataframe as dd
+import httpx
+import pandas as pd
+
+
+class DataFromHttpSource:
+    def __init__(self, base_url: str, cube_name: str, api_key: Optional[str] = None, **kwargs):
+        # Ensure 'params' exists before updating
+        params = kwargs.pop('params', {})
+        params.setdefault('cube', cube_name)
+
+        self.config = {
+            'base_url': base_url,
+            'timeout': kwargs.get('timeout', 60),
+            'npartitions': kwargs.get('npartitions', 1),
+            'params': params,
+            'headers': kwargs.get('headers', {})  # Allow custom headers
+        }
+        self.config.update(kwargs)
+
+        # Add API key to headers if provided
+        if api_key:
+            self.config['headers']['Authorization'] = f"Bearer {api_key}"
+
+        self.formatted_url = f"{str(self.config.get('base_url', '')).rstrip('/')}/"
+
+    def load(self, **kwargs) -> dd.DataFrame:
+        """Loads data from HTTP source into a Dask DataFrame."""
+        params = {**self.config.get('params', {}), 'load_params': kwargs}
+
+        try:
+            response = httpx.post(
+                self.formatted_url,
+                json=params,
+                timeout=self.config['timeout'],
+                headers=self.config['headers']
+            )
+            response.raise_for_status()  # Raises an HTTPError for 4xx/5xx responses
+            result = response.json()
+        except httpx.HTTPStatusError as e:
+            raise RuntimeError(f"HTTP error: {e.response.status_code}, {e.response.text}") from e
+        except httpx.RequestError as e:
+            raise RuntimeError(f"Request error: {str(e)}") from e
+        except ValueError:
+            raise RuntimeError("Failed to parse JSON response")
+
+        return dd.from_pandas(pd.DataFrame(result.get('data', [])), npartitions=self.config['npartitions'])
sibi_dst/{utils → v1/utils}/data_utils.py
CHANGED
@@ -1,9 +1,10 @@
+
 from typing import Union, List

 import dask.dataframe as dd
 import pandas as pd

-from
+from .log_utils import Logger


 class DataUtils:
@@ -140,8 +141,8 @@ class DataUtils:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
         # Return early if the DataFrame is empty
-        debug = kwargs.setdefault("debug", False)
         if self.is_dataframe_empty(df):
+            self.logger.debug("merge_lookup_data was given an empty dataFrame")
             return df

         # Extract and validate required parameters
@@ -187,7 +188,7 @@ class DataUtils:
             f'{lookup_col}__in': ids
         })
         # Load lookup data
-        lookup_instance = classname(debug=debug)
+        lookup_instance = classname(debug=self.debug, logger=self.logger)
         result = lookup_instance.load(**load_kwargs)
         if len(result.index) == 0:
             self.logger.debug(f"No IDs found in the source column: {source_col}")
@@ -244,3 +245,4 @@ class DataUtils:
             if col in df.columns:
                 df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
         return df
+
sibi_dst/{utils → v1/utils}/data_wrapper.py
CHANGED
@@ -8,7 +8,9 @@ import pandas as pd
 from IPython.display import display
 from tqdm import tqdm

-from
+from .log_utils import Logger
+from .date_utils import FileAgeChecker
+from .parquet_saver import ParquetSaver


 class DataWrapper:
sibi_dst/v1/utils/storage_config.py
ADDED
@@ -0,0 +1,28 @@
+from .storage_manager import StorageManager
+from .credentials import ConfigManager
+
+class StorageConfig:
+    def __init__(self, config: ConfigManager, depots: dict):
+        self.conf = config
+        self.depots = depots
+        self._initialize_storage()
+        self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
+        self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
+
+    def _initialize_storage(self):
+        self.filesystem_type = self.conf.get('fs_type', 'file')
+        self.base_storage = self.conf.get('fs_path', "local_storage/")
+        if self.filesystem_type == "file":
+            self.filesystem_options = {}
+        else:
+            self.filesystem_options = {
+                "key": self.conf.get('fs_key', ''),
+                "secret": self.conf.get('fs_secret'),
+                "token": self.conf.get('fs_token'),
+                "skip_instance_cache": True,
+                "use_listings_cache": False,
+                "client_kwargs": {
+                    "endpoint_url": self.conf.get('fs_endpoint')
+                }
+            }
+            self.filesystem_options = {k: v for k, v in self.filesystem_options.items() if v}
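
A sketch of how StorageConfig might be wired up, assuming only what the code above shows: a ConfigManager (imported from .credentials) that answers .get() for the fs_* keys, and a depot mapping whose exact shape is defined by StorageManager.rebuild_depot_paths. All values below are placeholders.

from sibi_dst.v1.utils import StorageConfig
from sibi_dst.v1.utils.credentials import ConfigManager  # location assumed from the relative import above

conf = ConfigManager()                          # hypothetical construction; real arguments depend on the project
depots = {"bronze": "raw", "silver": "clean"}   # placeholder depot mapping

storage = StorageConfig(conf, depots)
print(storage.filesystem_type, storage.filesystem_options)
print(storage.depot_paths, storage.depot_names)
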
sibi_dst/v2/df_helper/_df_helper.py
ADDED
@@ -0,0 +1,214 @@
+import warnings
+from typing import Any, Dict, Type, TypeVar, Union
+
+import dask.dataframe as dd
+import fsspec
+import pandas as pd
+from pydantic import BaseModel
+
+from sibi_dst.v2.utils import Logger
+from sibi_dst.v2.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+from sibi_dst.v2.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+from sibi_dst.v2.df_helper.backends.sqlmodel import SQLModelConnectionConfig, SQLModelLoadFromDb
+
+# Define a generic type variable for BaseModel subclasses
+T = TypeVar("T", bound=BaseModel)
+
+# Suppress warnings about protected member access
+warnings.filterwarnings(
+    "ignore",
+    message="Access to a protected member _meta",
+    category=UserWarning,
+)
+
+
+class DfHelper:
+    df: Union[dd.DataFrame, pd.DataFrame] = None
+    default_config = {
+        'parquet_storage_path': None,
+        'dt_field': None,
+        'as_pandas': False,
+        'filesystem': 'file',
+        'filesystem_options': {},
+        'fs': fsspec.filesystem('file')
+    }
+
+    def __init__(self, **kwargs: Any) -> None:
+        # Merge default configuration with any provided kwargs
+        config = {**self.default_config.copy(), **kwargs}
+        self.backend = config.setdefault('backend', 'sqlalchemy')
+        self.debug = config.setdefault('debug', False)
+        self.as_pandas = config.setdefault('as_pandas', False)
+        self.logger = config.setdefault(
+            'logger',
+            Logger.default_logger(logger_name=self.__class__.__name__, debug=self.debug)
+        )
+        self.logger.debug("Logger initialized in DEBUG mode.")
+
+        # Propagate logger and debug settings to all components
+        config.setdefault('logger', self.logger)
+        config.setdefault('debug', self.debug)
+
+        self._initialize_backend_config(**config)
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+    def _extract_config_vars(self, model: Type[T], kwargs: Dict[str, Any]) -> T:
+        """
+        Extracts and initializes a Pydantic model using only the keys that the model accepts.
+        The recognized keys are removed from kwargs.
+        """
+        recognized_keys = set(model.__annotations__.keys())
+        self.logger.debug(f"Recognized keys for {model.__name__}: {recognized_keys}")
+        model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
+        self.logger.debug(f"Initializing {model.__name__} with: {model_kwargs}")
+        return model(**model_kwargs)
+
+    def _initialize_backend_config(self, **kwargs: Any) -> None:
+        """
+        Initializes the backend configurations by extracting the settings required for queries,
+        parameters, and SQLAlchemy connections.
+        """
+        self.logger.debug("Initializing backend configuration.")
+        self._backend_query = self._extract_config_vars(QueryConfig, kwargs)
+        self._backend_params = self._extract_config_vars(ParamsConfig, kwargs)
+        if self.backend == "sqlalchemy":
+            self.backend_connection_config = self._extract_config_vars(SqlAlchemyConnectionConfig, kwargs)
+        elif self.backend == "sqlmodel":
+            self.backend_connection_config = self._extract_config_vars(SQLModelConnectionConfig, kwargs)
+        else:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+
+    def load(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+        """
+        Loads the data using the underlying SQLAlchemy loader. Returns a pandas DataFrame
+        if 'as_pandas' is True; otherwise returns a dask DataFrame.
+        """
+        df = self._load(**options)
+        return df.compute() if self.as_pandas else df
+
+    def _load(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+        self._backend_params.parse_params(options)
+        if self.backend == "sqlalchemy":
+            return self._load_from_sqlalchemy(**options)
+        elif self.backend == "sqlmodel":
+            return self._load_from_sqlmodel(**options)
+        else:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+
+    def _load_from_sqlalchemy(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+        """
+        Loads data from a SQLAlchemy source. On failure, logs the error and returns an empty
+        DataFrame wrapped as a dask DataFrame.
+        """
+        try:
+            db_loader = SqlAlchemyLoadFromDb(
+                self.backend_connection_config,
+                self._backend_query,
+                self._backend_params,
+                self.debug,
+                self.logger,
+                **options
+            )
+            self.df = db_loader.build_and_load()
+            self._process_loaded_data()
+            self._post_process_df()
+            self.logger.debug("Data successfully loaded from SQLAlchemy database.")
+        except Exception as e:
+            self.logger.error(f"Failed to load data from SQLAlchemy database: {e}. Options: {options}")
+            # Optionally re-raise the exception if in debug mode
+            if self.debug:
+                raise
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+        return self.df
+
+    def _load_from_sqlmodel(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+        try:
+            db_loader = SQLModelLoadFromDb(
+                self.backend_connection_config,
+                self._backend_query,
+                self._backend_params,
+                self.debug,
+                self.logger,
+                **options
+            )
+            self.df = db_loader.build_and_load()
+            self._process_loaded_data()
+            self._post_process_df()
+            self.logger.debug("Data successfully loaded from SQLModel database.")
+        except Exception as e:
+            self.logger.error(f"Failed to load data from SQLModel database: {e}. Options: {options}")
+            if self.debug:
+                raise
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+        return self.df
+
+    def _post_process_df(self) -> None:
+        """
+        Post-processes the DataFrame by filtering columns, renaming them, setting the index,
+        and converting the index to datetime if requested.
+        """
+        df_params = self._backend_params.df_params
+        fieldnames = df_params.get("fieldnames")
+        index_col = df_params.get("index_col")
+        datetime_index = df_params.get("datetime_index", False)
+        column_names = df_params.get("column_names")
+
+        # Filter columns based on fieldnames
+        if fieldnames:
+            valid_fieldnames = [col for col in fieldnames if col in self.df.columns]
+            self.df = self.df[valid_fieldnames]
+
+        # Rename columns if column_names are provided
+        if column_names is not None:
+            if not fieldnames or len(fieldnames) != len(column_names):
+                raise ValueError(
+                    f"Length mismatch: fieldnames ({len(fieldnames) if fieldnames else 0}) and "
+                    f"column_names ({len(column_names)}) must match."
+                )
+            rename_mapping = dict(zip(fieldnames, column_names))
+            self.df = self.df.map_partitions(self._rename_columns, mapping=rename_mapping)
+
+        # Set the index column if specified
+        if index_col is not None:
+            if index_col in self.df.columns:
+                self.df = self.df.set_index(index_col)
+            else:
+                raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+
+        # Convert the index to datetime if required
+        if datetime_index and self.df.index.dtype != 'datetime64[ns]':
+            self.df = self.df.map_partitions(self._convert_index_to_datetime)
+
+        self.logger.debug("Post-processing of DataFrame completed.")
+
+    def _process_loaded_data(self) -> None:
+        """
+        Applies renaming logic based on the field map configuration.
+        Logs a warning for any missing columns, and only renames existing columns.
+        """
+        self.logger.debug(f"Processing loaded data; DataFrame type: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self._backend_params.field_map or {}
+            if isinstance(field_map, dict):
+                rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map if k not in self.df.columns]
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}"
+                    )
+                if rename_mapping:
+                    self.df = self.df.map_partitions(self._rename_columns, mapping=rename_mapping)
+        self.logger.debug("Processing of loaded data completed.")
+
+    @staticmethod
+    def _rename_columns(df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
+        """Helper function to rename columns in a DataFrame."""
+        return df.rename(columns=mapping)
+
+    @staticmethod
+    def _convert_index_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
+        """Helper function to convert the DataFrame index to datetime."""
+        df.index = pd.to_datetime(df.index, errors='coerce')
+        return df