sibi-dst 2025.1.5.tar.gz → 2025.1.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/PKG-INFO +3 -1
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/pyproject.toml +3 -1
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +12 -12
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_parquet_artifact.py +10 -32
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_parquet_reader.py +2 -3
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -1
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/base.py +23 -3
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/clickhouse_writer.py +2 -1
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/data_wrapper.py +9 -19
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/update_planner.py +5 -5
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/README.md +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.1.5
+Version: 2025.1.6
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,8 @@ Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2025.5.1,<2026.0.0)
 Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
+Requires-Dist: opentelemetry-exporter-otlp (>=1.35.0,<2.0.0)
+Requires-Dist: opentelemetry-sdk (>=1.35.0,<2.0.0)
 Requires-Dist: pandas (>=2.3.1,<3.0.0)
 Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.1.5"
+version = "2025.1.6"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -22,6 +22,8 @@ sqlalchemy = "^2.0.41"
 pymysql = "^1.1.1"
 pyarrow = "^20.0.0"
 rich = "^14.0.0"
+opentelemetry-exporter-otlp = "^1.35.0"
+opentelemetry-sdk = "^1.35.0"
 
 [tool.poetry.group.dev]
 optional = true
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
RENAMED
@@ -2,7 +2,7 @@ import datetime
 import time
 import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Optional, Type
+from typing import Any, Callable, Dict, List, Optional, Type, ClassVar
 
 from sibi_dst.utils import ManagedResource
 
@@ -14,7 +14,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
     This version is refactored for a pure multi-threaded environment, aligning
     the orchestration model with the underlying threaded workers (DataWrapper).
     """
-
+    wrapped_classes: Dict[str, List[Type]]
     def __init__(
         self,
         wrapped_classes: Dict[str, List[Type]],
@@ -26,7 +26,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         backoff_jitter: float = 0.1,
         priority_fn: Optional[Callable[[Type], int]] = None,
         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs: Any
+        **kwargs: Dict[str, Any]
     ) -> None:
         super().__init__(**kwargs)
         self.wrapped_classes = wrapped_classes
@@ -75,14 +75,14 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         except Exception as e:
             self.logger.warning(f"priority_fn error for {name}: {e}")
 
-        # Fallback to size estimate if available
-        if hasattr(artifact_cls, 'get_size_estimate'):
-            try:
-                # This performs blocking I/O
-                return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
-
-            except Exception as e:
-                self.logger.warning(f"get_size_estimate failed for {name}: {e}")
+        # # Fallback to size estimate if available
+        # if hasattr(artifact_cls, 'get_size_estimate'):
+        #     try:
+        #         # This performs blocking I/O
+        #         return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
+        #
+        #     except Exception as e:
+        #         self.logger.warning(f"get_size_estimate failed for {name}: {e}")
 
         # Default priority
         return 999
@@ -115,7 +115,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         # If all retries fail, raise an exception to be caught by the main loop
         raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")
 
-    def update_data(self, data_type: str, **kwargs: Any) -> None:
+    async def update_data(self, data_type: str, **kwargs: Any) -> None:
         """
         Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
         """
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_parquet_artifact.py
RENAMED
@@ -1,8 +1,7 @@
 from __future__ import annotations
 import datetime
-import logging
 import threading
-from typing import Optional, Any, Dict
+from typing import Optional, Any, Dict, ClassVar
 
 import dask.dataframe as dd
 import fsspec
@@ -55,7 +54,7 @@ class ParquetArtifact(DfHelper):
     :ivar fs: Filesystem object used for storage operations.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
         'backend': 'parquet'
     }
 
@@ -91,8 +90,6 @@ class ParquetArtifact(DfHelper):
         }
         self.df: Optional[dd.DataFrame] = None
         super().__init__(**self.config)
-        #self._own_logger = False
-        #self._setup_logging()
         self.data_wrapper_class = data_wrapper_class
 
         self.date_field = self._validate_required('date_field')
@@ -101,16 +98,6 @@ class ParquetArtifact(DfHelper):
         self.parquet_start_date = self._validate_required('parquet_start_date')
         self.parquet_end_date = self._validate_required('parquet_end_date')
 
-        # Filesystem setup
-        #self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
-        #self.filesystem_options = self.config.setdefault('filesystem_options', {})
-        #self.fs = self.config.setdefault('fs', None)
-        #self._own_fs = self.fs is None
-        #if self.fs is None:
-        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-        #    self._own_fs = True
-        #self.config.setdefault('fs', self.fs)
-        ## Populate to parameters to pass to data_wrapper_class
         self.class_params = self.config.pop('class_params', {
             'debug': self.debug,
             'logger': self.logger,
@@ -125,15 +112,6 @@ class ParquetArtifact(DfHelper):
         self.update_planner_params = {}
         self.datawrapper_params = {}
 
-    #def _setup_logging(self):
-    #    """Initialize logger and debug settings."""
-    #    self.debug = self.config.get('debug', False)
-    #    logger = self.config.get('logger', None)
-    #    self._own_logger = logger is None
-    #    self.logger = logger or Logger.default_logger(
-    #        logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}')
-    #    self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
     def _validate_required(self, key: str) -> Any:
         """Validate required configuration fields."""
         value = self.config.setdefault(key, None)
@@ -211,28 +189,28 @@ class ParquetArtifact(DfHelper):
         """
         Synchronously estimates artifact size for use in multi-threaded environments.
 
-        This method
-
-        event loop conflicts.
+        This method safely executes asynchronous I/O operations from a synchronous
+        context, handling variations in fsspec filesystem implementations.
         """
 
         async def _get_total_bytes_async():
             """A helper async coroutine to perform the I/O."""
             import asyncio
 
-            # Use the async versions of fsspec methods (e.g., _glob, _size)
             files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
             if not files:
                 return 0
 
-            # Concurrently gather the size of all files for performance
            size_tasks = [self.fs._size(f) for f in files]
            sizes = await asyncio.gather(*size_tasks)
            return sum(s for s in sizes if s is not None)
 
-
-
-
+        try:
+            # Attempt the standard fsspec method first
+            total_bytes = self.fs.sync(_get_total_bytes_async())
+        except AttributeError:
+            # fallback for filesystems like s3fs that lack .sync()
+            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
 
         # Convert to megabytes, ensuring a minimum of 1
         return max(1, int(total_bytes / (1024 ** 2)))
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/_parquet_reader.py
RENAMED
@@ -1,11 +1,10 @@
 import logging
-from typing import Optional
+from typing import Optional, ClassVar, Dict
 
 import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import Logger
 
 class ParquetReader(DfHelper):
     """
@@ -44,7 +43,7 @@ class ParquetReader(DfHelper):
     Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, int]] = {
         'backend': 'parquet'
     }
 
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
@@ -85,7 +85,8 @@ class ParquetConfig(BaseModel):
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
+        if self.fs is None:
+            raise ValueError('Parquet Options: File system (fs) must be specified')
 
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/base.py
RENAMED
@@ -1,3 +1,4 @@
+import asyncio
 from .log_utils import Logger
 
 class ManagedResource:
@@ -61,7 +62,7 @@ class ManagedResource:
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Exit the runtime context and trigger cleanup for 'async with' statements."""
-        self.cleanup()
+        await self.acleanup()
         return False  # Propagate exceptions
 
     def __repr__(self) -> str:
@@ -80,7 +81,7 @@ class ManagedResource:
 
     def cleanup(self):
         """
-
+        Cleanup resources managed by this instance.
         """
         if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
             if self.logger:
@@ -88,10 +89,29 @@ class ManagedResource:
             self.fs.clear_instance_cache()
 
         if self._own_logger and hasattr(self.logger, "shutdown"):
-            # Ensure logger exists before trying to use or shut it down
+            # Ensure the logger exists before trying to use or shut it down
            if self.logger:
                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
                self.logger.shutdown()
                self.logger = None  # Set to None after shutdown
 
         self._entered = False
+
+    async def acleanup(self):
+        """
+        Async Cleanup resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure the logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+                self.logger.shutdown()
+                self.logger = None  # Set to None after shutdown
+
+        self._entered = False
+
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/clickhouse_writer.py
RENAMED
@@ -1,4 +1,5 @@
 from concurrent.futures import ThreadPoolExecutor
+from typing import ClassVar, Dict
 
 import clickhouse_connect
 import pandas as pd
@@ -36,7 +37,7 @@ class ClickHouseWriter(ManagedResource):
     :ivar order_by: Field or column name to use for table ordering.
     :type order_by: str
     """
-    dtype_to_clickhouse = {
+    dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
         'int64': 'Int64',
         'int32': 'Int32',
         'float64': 'Float64',
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/data_wrapper.py
RENAMED
@@ -3,7 +3,7 @@ import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Type, Any, Dict, Optional, Union, List
+from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
 import fsspec
 import pandas as pd
@@ -15,15 +15,15 @@ from .parquet_saver import ParquetSaver
 
 
 class DataWrapper(ManagedResource):
-    DEFAULT_PRIORITY_MAP = {
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "overwrite": 1,
         "missing_in_history": 2,
         "existing_but_stale": 3,
         "missing_outside_history": 4,
         "file_is_recent": 0
     }
-    DEFAULT_MAX_AGE_MINUTES = 1440
-    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+    DEFAULT_MAX_AGE_MINUTES: int = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
         self,
@@ -31,12 +31,8 @@ class DataWrapper(ManagedResource):
         date_field: str,
         data_path: str,
         parquet_filename: str,
-        #fs: Optional[fsspec.AbstractFileSystem] = None,
-        #debug: bool = False,
-        #verbose: bool = False,
         class_params: Optional[Dict] = None,
         load_params: Optional[Dict] = None,
-        #logger: Logger = None,
         show_progress: bool = False,
         timeout: float = 30,
         max_threads: int = 3,
@@ -47,14 +43,8 @@ class DataWrapper(ManagedResource):
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
-        #self.fs = fs or None
         if self.fs is None:
             raise ValueError("Datawrapper requires a File system (fs) to be provided .")
-        #self.debug = debug
-        #self.verbose = verbose
-        #self._own_logger = logger is None
-        #self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
-        #self.logger.set_level(logging.DEBUG if debug else logging.INFO)
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
@@ -71,7 +61,6 @@ class DataWrapper(ManagedResource):
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner=kwargs.get("update_planner", None)
-        # self.datacls = self.dataclass(**self.class_params)
 
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -174,7 +163,7 @@ class DataWrapper(ManagedResource):
         load_time = time.perf_counter() - load_start
 
         if hasattr(local_class_instance, "total_records"):
-            self.logger.debug(f"Total records loaded by {local_class_instance}: {local_class_instance.total_records}")
+            self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
            if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
                if self.mmanifest:
                    self.mmanifest.record(
@@ -183,12 +172,13 @@ class DataWrapper(ManagedResource):
                self.logger.info(f"No data found for {date}. Logged to missing manifest.")
            return
         save_start = time.perf_counter()
-        ParquetSaver(
+        with ParquetSaver(
             df_result=df,
             parquet_storage_path=path,
             fs=self.fs,
             logger=self.logger
-        )
+        ) as ps:
+            ps.save_to_parquet(self.parquet_filename, overwrite=True)
         save_time = time.perf_counter() - save_start
 
         total_time = time.perf_counter() - overall_start
@@ -218,4 +208,4 @@ class DataWrapper(ManagedResource):
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
{sibi_dst-2025.1.5 → sibi_dst-2025.1.6}/sibi_dst/utils/update_planner.py
RENAMED
@@ -1,6 +1,6 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Optional, Dict, Union, Tuple, Set, Iterator
+from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
 import pandas as pd
 from .date_utils import FileAgeChecker
 from pydantic import BaseModel, Field
@@ -55,7 +55,7 @@ class UpdatePlanner(ManagedResource):
     generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
     """
 
-    DEFAULT_PRIORITY_MAP = {
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
@@ -64,8 +64,8 @@ class UpdatePlanner(ManagedResource):
         "stale_in_history": 4,
     }
 
-    DEFAULT_MAX_AGE_MINUTES = 1440
-    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+    DEFAULT_MAX_AGE_MINUTES: int = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
         self,
@@ -217,7 +217,7 @@ class UpdatePlanner(ManagedResource):
         for priority in sorted(required_updates["update_priority"].unique()):
             dates_df = required_updates[required_updates["update_priority"] == priority]
             # Sort dates within the priority group
-            sorted_dates = dates_df.sort_values(by="date", ascending=not self.reverse_order)
+            sorted_dates = dates_df.sort_values(by=["date"], ascending=not self.reverse_order)
             dates = sorted_dates["date"].tolist()
             if dates:
                 yield priority, dates