sibi-dst 2025.9.4__py3-none-any.whl → 2025.9.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/tests/test_baseclass.py ADDED
@@ -0,0 +1,403 @@
1
+ import asyncio
2
+ import json
3
+ import threading
4
+ from typing import Any, Dict
5
+ from unittest.mock import MagicMock
6
+
7
+ import fsspec
8
+
9
+ from sibi_dst.utils import Logger
10
+ from sibi_dst.utils import ManagedResource
11
+ from sibi_dst.utils.base import _QueueSSE
12
+
13
+
14
+ # ------------------------------ Test Fixtures ------------------------------
15
+
16
+ class TestResource(ManagedResource):
17
+ def __init__(self, *args, **kwargs):
18
+ super().__init__(*args, **kwargs)
19
+ self.cleanup_called = False
20
+ self.acleanup_called = False
21
+
22
+ def _cleanup(self) -> None:
23
+ self.cleanup_called = True
24
+ super()._cleanup()
25
+
26
+ async def _acleanup(self) -> None:
27
+ self.acleanup_called = True
28
+ await super()._acleanup()
29
+
30
+
31
+ class MockSSESink:
32
+ def __init__(self):
33
+ self.events = []
34
+ self.closed = False
35
+
36
+ async def send(self, event: str, data: Dict[str, Any]) -> None:
37
+ self.events.append({"event": event, "data": data})
38
+
39
+ async def aclose(self) -> None:
40
+ self.closed = True
41
+
42
+
43
+ class MockSyncSSESink:
44
+ def __init__(self):
45
+ self.events = []
46
+ self.closed = False
47
+
48
+ def send(self, event: str, data: Dict[str, Any]) -> None:
49
+ self.events.append({"event": event, "data": data})
50
+
51
+ def close(self) -> None:
52
+ self.closed = True
53
+
54
+
55
+ # ------------------------------ Mock fsspec filesystem ------------------------------
56
+
57
+ class MockFileSystem(fsspec.AbstractFileSystem):
58
+ def __init__(self, **kwargs):
59
+ super().__init__(**kwargs)
60
+ self.closed = False
61
+
62
+ def close(self):
63
+ self.closed = True
64
+
65
+
66
+ # ------------------------------ Utility for Event Loop ------------------------------
67
+
68
+ def run_async_test(coro):
69
+ """Run async test safely in different environments."""
70
+ try:
71
+ # Try to get existing event loop (for Jupyter/IPython)
72
+ loop = asyncio.get_event_loop()
73
+ if loop.is_running():
74
+ # In Jupyter, create a new task
75
+ task = loop.create_task(coro)
76
+ return task
77
+ else:
78
+ return loop.run_until_complete(coro)
79
+ except RuntimeError:
80
+ # No event loop running, use asyncio.run()
81
+ return asyncio.run(coro)
82
+
83
+
84
+ # ------------------------------ Lifecycle Tests ------------------------------
85
+
86
+ def test_double_close_no_error():
87
+ """Test that calling close() multiple times doesn't raise errors."""
88
+ resource = TestResource()
89
+ resource.close()
90
+ resource.close() # Should not raise
91
+ assert resource.closed
92
+
93
+
94
+ def test_double_aclose_no_error():
95
+ """Test that calling aclose() multiple times doesn't raise errors."""
96
+ async def test():
97
+ resource = TestResource()
98
+ await resource.aclose()
99
+ await resource.aclose() # Should not raise
100
+ assert resource.closed
101
+
102
+ run_async_test(test())
103
+
104
+
105
+ def test_context_manager_sync():
106
+ """Test sync context manager behavior."""
107
+ with TestResource() as resource:
108
+ assert not resource.closed
109
+ assert resource.closed
110
+ assert resource.cleanup_called
111
+
112
+
113
+ def test_context_manager_async():
114
+ """Test async context manager behavior."""
115
+ async def test():
116
+ async with TestResource() as resource:
117
+ assert not resource.closed
118
+ assert resource.closed
119
+ assert resource.acleanup_called
120
+
121
+ run_async_test(test())
122
+
123
+
124
+ # ------------------------------ SSE Emission Tests ------------------------------
125
+
126
+ def test_auto_sse_creation():
127
+ """Test automatic SSE creation when auto_sse=True."""
128
+ resource = TestResource(auto_sse=True)
129
+ sse = resource.get_sse()
130
+ assert sse is not None
131
+ assert isinstance(sse, _QueueSSE)
132
+ assert resource._owns_sse
133
+
134
+
135
+ def test_sse_emission_with_async_sink():
136
+ """Test SSE emission with async send method."""
137
+ async def test():
138
+ sink = MockSSESink()
139
+ resource = TestResource(sse=sink)
140
+
141
+ await resource.emit("test_event", key="value")
142
+
143
+ assert len(sink.events) == 1
144
+ assert sink.events[0]["event"] == "test_event"
145
+ assert sink.events[0]["data"] == {"key": "value"}
146
+
147
+ run_async_test(test())
148
+
149
+
150
+ def test_sse_emission_with_sync_sink():
151
+ """Test SSE emission with sync send method wrapped in async."""
152
+ sink = MockSyncSSESink()
153
+ resource = TestResource(sse=sink)
154
+
155
+ async def test():
156
+ await resource.emit("test_event", key="value")
157
+
158
+ assert len(sink.events) == 1
159
+ assert sink.events[0]["event"] == "test_event"
160
+ assert sink.events[0]["data"] == {"key": "value"}
161
+
162
+ run_async_test(test())
163
+
164
+
165
+ def test_sse_put_method_support():
166
+ """Test SSE emission with put method."""
167
+ class PutSink:
168
+ def __init__(self):
169
+ self.items = []
170
+
171
+ async def put(self, item: Dict[str, Any]) -> None:
172
+ self.items.append(item)
173
+
174
+ async def test():
175
+ sink = PutSink()
176
+ resource = TestResource(sse=sink)
177
+
178
+ await resource.emit("test_event", key="value")
179
+
180
+ assert len(sink.items) == 1
181
+ item = sink.items[0]
182
+ assert item["event"] == "test_event"
183
+ assert json.loads(item["data"]) == {"key": "value"}
184
+
185
+ run_async_test(test())
186
+
187
+
188
+ def test_sse_no_emitter_no_error():
189
+ """Test that emit on resource without emitter doesn't raise."""
190
+ resource = TestResource()
191
+ # Should not raise error
192
+ async def test():
193
+ await resource.emit("test_event", key="value")
194
+
195
+ run_async_test(test())
196
+
197
+
198
+ def test_sse_emission_after_close():
199
+ """Test that emit after close is no-op."""
200
+ async def test():
201
+ sink = MockSSESink()
202
+ resource = TestResource(sse=sink)
203
+
204
+ await resource.aclose()
205
+ await resource.emit("test_event", key="value") # Should not raise
206
+
207
+ assert len(sink.events) == 0
208
+
209
+ run_async_test(test())
210
+
211
+
212
+ # ------------------------------ Cleanup Interplay Tests ------------------------------
213
+
214
+ def test_sync_cleanup_called_on_sync_close():
215
+ """Test that sync cleanup is called during sync close."""
216
+ resource = TestResource()
217
+ resource.close()
218
+ assert resource.cleanup_called
219
+ assert not resource.acleanup_called
220
+
221
+
222
+ def test_async_cleanup_called_on_async_close():
223
+ """Test that async cleanup is called during async close."""
224
+ async def test():
225
+ resource = TestResource()
226
+ await resource.aclose()
227
+ assert resource.acleanup_called
228
+ assert not resource.cleanup_called
229
+
230
+ run_async_test(test())
231
+
232
+
233
+ # ------------------------------ Logger Tests ------------------------------
234
+
235
+ def test_logger_ownership():
236
+ """Test that logger is owned when not provided externally."""
237
+ resource = TestResource()
238
+ assert resource._owns_logger
239
+ assert resource.logger is not None
240
+
241
+
242
+ def test_external_logger_not_owned():
243
+ """Test that external logger is not owned."""
244
+ external_logger = Logger.default_logger("test")
245
+ resource = TestResource(logger=external_logger)
246
+ assert not resource._owns_logger
247
+ assert resource.logger is external_logger
248
+
249
+
250
+ def test_logger_level_configuration():
251
+ """Test logger level configuration based on verbose/debug flags."""
252
+ # Default (warning level)
253
+ resource = TestResource()
254
+ assert hasattr(resource.logger, 'level')
255
+
256
+ # Verbose (info level)
257
+ resource = TestResource(verbose=True)
258
+ assert hasattr(resource.logger, 'level')
259
+
260
+ # Debug (debug level)
261
+ resource = TestResource(debug=True)
262
+ assert hasattr(resource.logger, 'level')
263
+
264
+
265
+ # ------------------------------ Lazy Instantiation Tests ------------------------------
266
+
267
+ def test_lazy_fs_instantiation():
268
+ """Test lazy filesystem instantiation via factory."""
269
+ fs_instance = MockFileSystem()
270
+ factory_called = [False]
271
+
272
+ def fs_factory():
273
+ factory_called[0] = True
274
+ return fs_instance
275
+
276
+ resource = TestResource(fs_factory=fs_factory)
277
+ assert not factory_called[0] # Not called yet
278
+
279
+ fs = resource._ensure_fs()
280
+ assert factory_called[0]
281
+ assert fs is fs_instance
282
+ assert resource.fs is fs_instance
283
+
284
+
285
+ def test_lazy_sse_instantiation():
286
+ """Test lazy SSE instantiation via factory."""
287
+ sink_instance = MockSSESink()
288
+ factory_called = [False]
289
+
290
+ def sse_factory():
291
+ factory_called[0] = True
292
+ return sink_instance
293
+
294
+ resource = TestResource(sse_factory=sse_factory)
295
+ assert not factory_called[0] # Not called yet
296
+
297
+ sse = resource._ensure_sse()
298
+ assert factory_called[0]
299
+ assert sse is sink_instance
300
+ assert resource._sse is sink_instance
301
+
302
+
303
+ def test_lazy_fs_not_called_if_fs_provided():
304
+ """Test that factory is not called if fs is provided directly."""
305
+ fs_instance = MockFileSystem()
306
+ factory = MagicMock()
307
+
308
+ resource = TestResource(fs=fs_instance, fs_factory=factory)
309
+ fs = resource._ensure_fs()
310
+
311
+ assert fs is fs_instance
312
+ factory.assert_not_called()
313
+
314
+
315
+ def test_lazy_sse_not_called_if_sse_provided():
316
+ """Test that factory is not called if sse is provided directly."""
317
+ sink_instance = MockSSESink()
318
+ factory = MagicMock()
319
+
320
+ resource = TestResource(sse=sink_instance, sse_factory=factory)
321
+ sse = resource._ensure_sse()
322
+
323
+ assert sse is sink_instance
324
+ factory.assert_not_called()
325
+
326
+
327
+ # ------------------------------ Thread Safety Tests ------------------------------
328
+
329
+ def test_thread_safe_close():
330
+ """Test that close operations are thread-safe."""
331
+ resource = TestResource()
332
+
333
+ results = []
334
+ errors = []
335
+
336
+ def close_resource():
337
+ try:
338
+ resource.close()
339
+ results.append("success")
340
+ except Exception as e:
341
+ errors.append(str(e))
342
+ results.append(f"error: {e}")
343
+
344
+ # Start multiple threads trying to close simultaneously
345
+ threads = [threading.Thread(target=close_resource) for _ in range(5)]
346
+ for t in threads:
347
+ t.start()
348
+ for t in threads:
349
+ t.join()
350
+
351
+ # Debug information
352
+ print(f"Results: {results}")
353
+ print(f"Errors: {errors}")
354
+ print(f"Resource closed: {resource.closed}")
355
+
356
+ # Should have at least one success (the first one) and no exceptions
357
+ success_count = results.count("success")
358
+ error_count = len([r for r in results if r.startswith("error")])
359
+
360
+ # At least one should succeed
361
+ assert success_count >= 1, f"Expected at least 1 success, got {success_count}"
362
+ # No errors should occur
363
+ assert error_count == 0, f"Expected 0 errors, got {error_count}"
364
+ # Resource should be closed
365
+ assert resource.closed, "Resource should be closed"
366
+
367
+
368
+ # ------------------------------ Individual Test Functions ------------------------------
369
+
370
+ # Allow running this module directly as a quick smoke test:
371
+ if __name__ == "__main__":
372
+ # Run individual tests
373
+ test_double_close_no_error()
374
+ print("✓ test_double_close_no_error passed")
375
+
376
+ test_sync_cleanup_called_on_sync_close()
377
+ print("✓ test_sync_cleanup_called_on_sync_close passed")
378
+
379
+ test_logger_ownership()
380
+ print("✓ test_logger_ownership passed")
381
+
382
+ test_external_logger_not_owned()
383
+ print("✓ test_external_logger_not_owned passed")
384
+
385
+ test_lazy_fs_instantiation()
386
+ print("✓ test_lazy_fs_instantiation passed")
387
+
388
+ test_lazy_sse_instantiation()
389
+ print("✓ test_lazy_sse_instantiation passed")
390
+
391
+ test_lazy_fs_not_called_if_fs_provided()
392
+ print("✓ test_lazy_fs_not_called_if_fs_provided passed")
393
+
394
+ test_lazy_sse_not_called_if_sse_provided()
395
+ print("✓ test_lazy_sse_not_called_if_sse_provided passed")
396
+
397
+ test_thread_safe_close()
398
+ print("✓ test_thread_safe_close passed")
399
+
400
+ test_auto_sse_creation()
401
+ print("✓ test_auto_sse_creation passed")
402
+
403
+ print("All tests completed!")
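The test module above exercises the public surface of ManagedResource: sync/async lifecycle, SSE emission, logger ownership, and lazy fs/sse factories. A minimal usage sketch of that surface, assuming only the keyword arguments and methods the tests themselves call; the LocalCache subclass, its event names, and the in-memory filesystem are illustrative, not part of the package:

import asyncio

import fsspec

from sibi_dst.utils import ManagedResource


class LocalCache(ManagedResource):
    """Illustrative subclass; a real component would wrap an actual resource."""

    async def warm(self) -> None:
        fs = self._ensure_fs()  # lazily builds the filesystem from fs_factory on first use
        await self.emit("cache_warmed", protocol=str(fs.protocol))

    async def _acleanup(self) -> None:
        await self.emit("cache_closed")
        await super()._acleanup()


async def main() -> None:
    async with LocalCache(
        auto_sse=True,                                   # owns an internal _QueueSSE sink
        fs_factory=lambda: fsspec.filesystem("memory"),  # not created until _ensure_fs()
        debug=True,
    ) as cache:
        await cache.warm()


asyncio.run(main())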
sibi_dst/utils/base.py CHANGED
@@ -441,257 +441,3 @@ class ManagedResource(abc.ABC):
441
441
  except Exception:
442
442
  pass
443
443
 
444
- ## Before SSE handling
445
-
446
- # import abc
447
- # import threading
448
- # import weakref
449
- # from typing import Self, Optional, Callable
450
- #
451
- # import fsspec
452
- #
453
- # from sibi_dst.utils import Logger
454
- #
455
- #
456
- # class ManagedResource(abc.ABC):
457
- # """
458
- # Boilerplate ABC for components that manage a logger and an optional fsspec filesystem,
459
- # with sync/async lifecycle helpers, lazy FS creation via an optional factory, and
460
- # configurable cleanup-error logging.
461
- # """
462
- #
463
- # def __init__(
464
- # self,
465
- # *,
466
- # verbose: bool = False,
467
- # debug: bool = False,
468
- # log_cleanup_errors: bool = True,
469
- # logger: Optional[Logger] = None,
470
- # fs: Optional[fsspec.AbstractFileSystem] = None,
471
- # fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None,
472
- # **_: object,
473
- # ) -> None:
474
- # # ---- Declared upfront for type checkers
475
- # self.logger: Logger
476
- # self.fs: Optional[fsspec.AbstractFileSystem] = None
477
- # self._fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None
478
- # self._owns_logger: bool = False
479
- # self._owns_fs: bool = False
480
- # self._is_closed: bool = False
481
- # self._closing: bool = False
482
- # self._close_lock = threading.RLock()
483
- #
484
- # self.verbose = verbose
485
- # self.debug = debug
486
- # self._log_cleanup_errors = log_cleanup_errors
487
- #
488
- # # ---- Logger ownership
489
- # if logger is None:
490
- # self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
491
- # self._owns_logger = True
492
- # level = Logger.DEBUG if self.debug else (Logger.INFO if self.verbose else Logger.WARNING)
493
- # self.logger.set_level(level)
494
- # else:
495
- # self.logger = logger
496
- # self._owns_logger = False # do not mutate external logger
497
- #
498
- # # ---- FS ownership & lazy creation
499
- # if fs is not None:
500
- # self.fs = fs
501
- # self._owns_fs = False
502
- # self._fs_factory = None
503
- # elif fs_factory is not None:
504
- # # Lazy: don't create until first use
505
- # self._fs_factory = fs_factory
506
- # self._owns_fs = True # we will own it *if* created
507
- # self.fs = None
508
- # else:
509
- # self.fs = None
510
- # self._owns_fs = False
511
- # self._fs_factory = None
512
- #
513
- # # Register a GC-time finalizer that does not capture self
514
- # self_ref = weakref.ref(self)
515
- # self._finalizer = weakref.finalize(self, self._finalize_static, self_ref)
516
- #
517
- # if self.debug:
518
- # try:
519
- # self.logger.debug("Component %s initialized. %s", self.__class__.__name__, repr(self))
520
- # except Exception:
521
- # pass
522
- #
523
- # # ---------- Introspection ----------
524
- # @property
525
- # def is_closed(self) -> bool:
526
- # return self._is_closed
527
- #
528
- # @property
529
- # def closed(self) -> bool: # alias
530
- # return self._is_closed
531
- #
532
- # def __repr__(self) -> str:
533
- # class_name = self.__class__.__name__
534
- # logger_status = "own" if self._owns_logger else "external"
535
- # if self.fs is None and self._fs_factory is not None:
536
- # fs_status = "own(lazy)"
537
- # elif self.fs is None:
538
- # fs_status = "none"
539
- # else:
540
- # fs_status = "own" if self._owns_fs else "external"
541
- # return (f"<{class_name} debug={self.debug} verbose={self.verbose} "
542
- # f"log_cleanup_errors={self._log_cleanup_errors} "
543
- # f"logger={logger_status} fs={fs_status}>")
544
- #
545
- # # ---------- Subclass hooks ----------
546
- # def _cleanup(self) -> None:
547
- # """Sync cleanup for resources created BY THE SUBCLASS."""
548
- # return
549
- #
550
- # async def _acleanup(self) -> None:
551
- # """Async cleanup for resources created BY THE SUBCLASS."""
552
- # return
553
- #
554
- # # ---------- FS helpers ----------
555
- # def _ensure_fs(self) -> Optional[fsspec.AbstractFileSystem]:
556
- # """Create the FS lazily if a factory was provided. Return fs (or None)."""
557
- # if self.fs is None and self._fs_factory is not None:
558
- # created = self._fs_factory()
559
- # if not isinstance(created, fsspec.AbstractFileSystem):
560
- # raise TypeError(f"fs_factory() must return fsspec.AbstractFileSystem, got {type(created)!r}")
561
- # self.fs = created
562
- # # _owns_fs already True when factory is present
563
- # return self.fs
564
- #
565
- # def require_fs(self) -> fsspec.AbstractFileSystem:
566
- # """Return a filesystem or raise if not configured/creatable."""
567
- # fs = self._ensure_fs()
568
- # if fs is None:
569
- # raise RuntimeError(
570
- # f"{self.__class__.__name__}: filesystem is required but not configured"
571
- # )
572
- # return fs
573
- #
574
- # # ---------- Shared shutdown helpers (no logging; safe for late shutdown) ----------
575
- # def _release_owned_fs(self) -> None:
576
- # if self._owns_fs:
577
- # # ensure creation state is respected even if never used
578
- # _ = self.fs or None # no-op; if never created, nothing to close
579
- # if self.fs is not None:
580
- # close = getattr(self.fs, "close", None)
581
- # try:
582
- # if callable(close):
583
- # close()
584
- # finally:
585
- # self.fs = None
586
- #
587
- # def _shutdown_logger(self) -> None:
588
- # if self._owns_logger:
589
- # try:
590
- # self.logger.shutdown()
591
- # except Exception:
592
- # pass
593
- #
594
- # def _shutdown_owned_resources(self) -> None:
595
- # self._release_owned_fs()
596
- # self._shutdown_logger()
597
- #
598
- # # ---------- Public lifecycle (sync) ----------
599
- # def close(self) -> None:
600
- # with self._close_lock:
601
- # if self._is_closed or self._closing:
602
- # return
603
- # self._closing = True
604
- #
605
- # try:
606
- # self._cleanup()
607
- # except Exception:
608
- # # Only include traceback when debug=True
609
- # if self._log_cleanup_errors:
610
- # try:
611
- # self.logger.error(
612
- # "Error during %s._cleanup()", self.__class__.__name__,
613
- # exc_info=self.debug
614
- # )
615
- # except Exception:
616
- # pass
617
- # raise
618
- # finally:
619
- # with self._close_lock:
620
- # self._is_closed = True
621
- # self._closing = False
622
- # self._shutdown_owned_resources()
623
- # if self.debug:
624
- # try:
625
- # self.logger.debug("Component %s closed.", self.__class__.__name__)
626
- # except Exception:
627
- # pass
628
- #
629
- # # ---------- Public lifecycle (async) ----------
630
- # async def aclose(self) -> None:
631
- # with self._close_lock:
632
- # if self._is_closed or self._closing:
633
- # return
634
- # self._closing = True
635
- #
636
- # try:
637
- # await self._acleanup()
638
- # except Exception:
639
- # # Only include traceback when debug=True
640
- # if self._log_cleanup_errors:
641
- # try:
642
- # self.logger.error(
643
- # "Error during %s._acleanup()", self.__class__.__name__,
644
- # exc_info=self.debug
645
- # )
646
- # except Exception:
647
- # pass
648
- # raise
649
- # finally:
650
- # with self._close_lock:
651
- # self._is_closed = True
652
- # self._closing = False
653
- # self._shutdown_owned_resources()
654
- # if self.debug:
655
- # try:
656
- # self.logger.debug("Async component %s closed.", self.__class__.__name__)
657
- # except Exception:
658
- # pass
659
- #
660
- # # ---------- Context managers ----------
661
- # def __enter__(self) -> Self:
662
- # return self
663
- #
664
- # def __exit__(self, exc_type, exc, tb) -> bool:
665
- # self.close()
666
- # return False # propagate exceptions
667
- #
668
- # async def __aenter__(self) -> Self:
669
- # return self
670
- #
671
- # async def __aexit__(self, exc_type, exc, tb) -> bool:
672
- # await self.aclose()
673
- # return False
674
- #
675
- # # ---------- Finalizer ( at Garbage Collection-time absolutely silent) ----------
676
- # @staticmethod
677
- # def _finalize_static(ref: "weakref.ReferenceType[ManagedResource]") -> None:
678
- # obj = ref()
679
- # if obj is None:
680
- # return
681
- # # No logging here; interpreter may be tearing down.
682
- # # Best-effort silent cleanup; avoid locks and context managers.
683
- # try:
684
- # if not obj._is_closed:
685
- # try:
686
- # obj._cleanup()
687
- # except Exception:
688
- # pass
689
- # obj._is_closed = True
690
- # try:
691
- # obj._shutdown_owned_resources()
692
- # except Exception:
693
- # pass
694
- # except Exception:
695
- # # do not show anything at garbage collection time
696
- # pass
697
- #
sibi_dst/utils/boilerplate/__init__.py CHANGED
@@ -2,10 +2,13 @@ from .base_parquet_artifact import BaseParquetArtifact
2
2
  from .base_data_cube import BaseDataCube
3
3
  from .base_attacher import make_attacher
4
4
  from .base_parquet_reader import BaseParquetReader
5
+ from .hybrid_data_loader import HybridDataLoader
6
+
5
7
  __all__ = [
6
8
  "BaseDataCube",
7
9
  "BaseParquetArtifact",
8
10
  "make_attacher",
9
- "BaseParquetReader"
11
+ "BaseParquetReader",
12
+ "HybridDataLoader",
10
13
  ]
11
14
 
sibi_dst/utils/boilerplate/hybrid_data_loader.py ADDED
@@ -0,0 +1,144 @@
1
+ import dask.dataframe as dd
2
+ import datetime
3
+ import pandas as pd
4
+ from typing import Optional
5
+ from sibi_dst.utils import Logger
6
+ from sibi_dst.utils.dask_utils import dask_is_empty
7
+
8
+ today = datetime.date.today()
9
+ yesterday = today - datetime.timedelta(days=1)
10
+ TODAY_STR = today.strftime('%Y-%m-%d')
11
+ YESTERDAY_STR = yesterday.strftime('%Y-%m-%d')
12
+
13
+
14
+ class HybridDataLoader:
15
+ """
16
+ A generic data loader that orchestrates loading from a historical
17
+ source and an optional live source.
18
+ """
19
+
20
+ def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
21
+ self.start_date = self._validate_date_format(start_date)
22
+ self.end_date = self._validate_date_format(end_date)
23
+ self.historical_reader = historical_reader
24
+ self.live_reader = live_reader
25
+ self.date_field = date_field
26
+
27
+ self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
28
+ self.debug = kwargs.get('debug', False)
29
+ self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
30
+
31
+ # Validate date range
32
+ self._validate_date_range()
33
+
34
+ # Determine loading strategy
35
+ self._should_read_live = self.end_date == TODAY_STR
36
+ self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
37
+ self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)
38
+
39
+ def _validate_date_format(self, date_str: str) -> str:
40
+ """Validate that date string is in correct format."""
41
+ try:
42
+ datetime.datetime.strptime(date_str, '%Y-%m-%d')
43
+ return date_str
44
+ except ValueError:
45
+ raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
46
+
47
+ def _validate_date_range(self):
48
+ """Validate that start date is not after end date."""
49
+ start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
50
+ end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
51
+ if end < start:
52
+ raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")
53
+
54
+ def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
55
+ """Forces the historical dataframe schema to match the live one."""
56
+ self.logger.debug("Aligning historical schema to match live schema.")
57
+ historical_cols = set(historical_df.columns)
58
+ live_cols = set(live_df.columns)
59
+
60
+ # Add missing columns to historical dataframe
61
+ for col in live_cols - historical_cols:
62
+ historical_df[col] = None
63
+
64
+ # Reorder columns to match live dataframe
65
+ return historical_df[list(live_df.columns)]
66
+
67
+ def _create_empty_dataframe(self) -> dd.DataFrame:
68
+ """Create an empty dask dataframe with proper structure."""
69
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
70
+
71
+ async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
72
+ """Load today's data from the live reader."""
73
+ self.logger.debug("Loading today's live data...")
74
+ date_filter = {f"{self.date_field}__date": TODAY_STR}
75
+ filters = {**kwargs, **date_filter}
76
+
77
+ try:
78
+ today_df = await self.live_reader(
79
+ logger=self.logger,
80
+ debug=self.debug
81
+ ).aload(**filters)
82
+ return today_df
83
+ except Exception as e:
84
+ self.logger.error(f"Failed to load today's data: {e}")
85
+ if not self.debug:
86
+ return None
87
+ raise
88
+
89
+ async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
90
+ """Load historical data from the historical reader."""
91
+ self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
92
+
93
+ try:
94
+ return await self.historical_reader(
95
+ parquet_start_date=start_date,
96
+ parquet_end_date=end_date,
97
+ logger=self.logger,
98
+ debug=self.debug
99
+ ).aload(**kwargs)
100
+ except Exception as e:
101
+ self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
102
+ if not self.debug:
103
+ return self._create_empty_dataframe()
104
+ raise
105
+
106
+ async def aload(self, **kwargs) -> dd.DataFrame:
107
+ """
108
+ Loads data from the historical source and, if required, the live source,
109
+ then concatenates them.
110
+ """
111
+ # Case 1: Only today's data requested
112
+ if self._is_single_today:
113
+ today_df = await self._load_today_data(**kwargs)
114
+ return today_df if today_df is not None else self._create_empty_dataframe()
115
+
116
+ # Case 2: Pure historical data (end date is not today)
117
+ if not self._should_read_live:
118
+ return await self._load_historical_data(self.start_date, self.end_date, **kwargs)
119
+
120
+ # Case 3: Mixed historical + live scenario (end date is today)
121
+ # Load historical data up to yesterday
122
+ historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
123
+
124
+ # Load today's data
125
+ today_df = await self._load_today_data(**kwargs)
126
+
127
+ # Combine dataframes
128
+ if today_df is not None and not dask_is_empty(today_df):
129
+ # Align schemas if needed
130
+ if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
131
+ try:
132
+ historical_df = self._align_schema_to_live(historical_df, today_df)
133
+ except Exception as e:
134
+ self.logger.warning(f"Failed to align schemas: {e}")
135
+
136
+ return dd.concat([historical_df, today_df], ignore_index=True)
137
+ else:
138
+ return historical_df
139
+
140
+ def __repr__(self):
141
+ return (f"HybridDataLoader(start_date='{self.start_date}', "
142
+ f"end_date='{self.end_date}', "
143
+ f"loading_live={self._should_read_live})")
144
+
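HybridDataLoader decides at construction time whether the live reader is needed (end_date equals today) and splits the range at yesterday when mixing sources. A minimal end-to-end sketch, assuming only the reader-side contract visible above — a class instantiated with parquet_start_date/parquet_end_date (historical) or logger/debug (live) that exposes an async aload(**filters); the Fake* readers and their data are illustrative stand-ins:

import asyncio
import datetime

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.boilerplate import HybridDataLoader


class FakeHistoricalReader:
    """Stand-in for a parquet-backed reader; real readers are wired the same way."""
    def __init__(self, parquet_start_date, parquet_end_date, logger=None, debug=False):
        self.start, self.end = parquet_start_date, parquet_end_date

    async def aload(self, **filters):
        pdf = pd.DataFrame({"order_date": [self.start], "status": ["closed"]})
        return dd.from_pandas(pdf, npartitions=1)


class FakeLiveReader:
    """Stand-in for a live reader; receives the date filter built by the loader."""
    def __init__(self, logger=None, debug=False):
        pass

    async def aload(self, **filters):
        pdf = pd.DataFrame({"order_date": [str(datetime.date.today())], "status": ["open"]})
        return dd.from_pandas(pdf, npartitions=1)


async def main():
    loader = HybridDataLoader(
        start_date="2025-09-01",
        end_date=str(datetime.date.today()),   # ends today -> historical + live are combined
        historical_reader=FakeHistoricalReader,
        live_reader=FakeLiveReader,
        date_field="order_date",
        debug=True,
    )
    ddf = await loader.aload()   # extra kwargs would be forwarded to the readers as filters
    print(ddf.compute())


asyncio.run(main())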
sibi_dst/utils/clickhouse_writer.py CHANGED
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
7
7
  import pandas as pd
8
8
  import dask.dataframe as dd
9
9
  import clickhouse_connect
10
+ import numpy as np
10
11
 
11
12
  from . import ManagedResource
12
13
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
27
28
  - Optional overwrite (drop + recreate)
28
29
  - Partitioned, batched inserts
29
30
  - Per-thread clients to avoid session conflicts
31
+ - Proper PyArrow dtype handling
30
32
  """
31
33
 
32
34
  # Default dtype mapping (pandas/dask → ClickHouse)
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
109
111
  return
110
112
 
111
113
  # lazily fill missing values per-partition (no global compute)
112
- df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
114
+ # Use the new method that ensures correct types for ClickHouse
115
+ df = df.map_partitions(
116
+ type(self)._process_partition_for_clickhouse_compatible,
117
+ meta=df._meta
118
+ )
113
119
 
114
120
  # (re)create table
115
121
  ow = self.overwrite if overwrite is None else bool(overwrite)
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
121
127
  self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
122
128
  self.logger.info(f"Dropped table {self.table} (overwrite=True)")
123
129
 
124
- create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
130
+ create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
125
131
  self._command(create_sql)
126
132
  self.logger.info(f"Ensured table {self.table} exists")
127
133
 
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
159
165
  return ", ".join(pieces)
160
166
 
161
167
  def _map_dtype(self, dtype: Any) -> str:
168
+ dtype_str = str(dtype).lower()
169
+ # Handle PyArrow dtypes
170
+ if "[pyarrow]" in dtype_str:
171
+ if "int64" in dtype_str:
172
+ return "Int64"
173
+ elif "int32" in dtype_str:
174
+ return "Int32"
175
+ elif "float64" in dtype_str or "double" in dtype_str:
176
+ return "Float64"
177
+ elif "float32" in dtype_str:
178
+ return "Float32"
179
+ elif "bool" in dtype_str:
180
+ return "UInt8"
181
+ elif "timestamp" in dtype_str: # PyArrow timestamp
182
+ return "DateTime"
183
+ elif "string" in dtype_str: # PyArrow string
184
+ return "String"
185
+ else:
186
+ return "String" # fallback
187
+
162
188
  # Handle pandas extension dtypes explicitly
163
189
  if isinstance(dtype, pd.Int64Dtype):
164
190
  return "Int64"
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
170
196
  return "Float64"
171
197
  if isinstance(dtype, pd.StringDtype):
172
198
  return "String"
173
- if "datetime64" in str(dtype):
199
+ if "datetime64" in dtype_str:
174
200
  return "DateTime"
175
201
 
176
202
  return self.DTYPE_MAP.get(str(dtype), "String")
177
203
 
178
204
  def _should_mark_nullable(self, dtype: Any) -> bool:
179
- s = str(dtype)
205
+ dtype_str = str(dtype).lower()
206
+ # PyArrow types are generally nullable, but let's be specific
207
+ if "[pyarrow]" in dtype_str:
208
+ # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
209
+ base_type = dtype_str.replace("[pyarrow]", "")
210
+ if base_type in ["string", "large_string"] or "timestamp" in base_type:
211
+ return True
212
+ # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
213
+ # Let's default to not nullable for numeric unless explicitly needed
214
+ return False # Conservative for PyArrow numerics
215
+
180
216
  if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
181
217
  return True
182
- if "datetime64" in s:
218
+ if "datetime64" in dtype_str:
183
219
  return True
184
220
  # object/category almost always nullable
185
- if s in ("object", "category", "string"):
221
+ if dtype_str in ("object", "category", "string"):
186
222
  return True
187
223
  return False
188
224
 
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
203
239
  # Ensure column ordering is stable
204
240
  cols = list(pdf.columns)
205
241
 
242
+ # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
243
+ # This is the key step to prevent the numpy.datetime64 error
244
+ pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
245
+
206
246
  # Split into batches (to avoid giant single insert)
207
247
  for start in range(0, len(pdf), self.insert_chunksize):
208
248
  batch = pdf.iloc[start:start + self.insert_chunksize]
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
215
255
  def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
216
256
  client = self._get_client()
217
257
  # clickhouse-connect supports insert_df
258
+ # The df passed here should now have compatible datetime types
218
259
  client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
219
260
 
220
- # ------------- missing values (lazy) -------------
261
+ # ------------- missing values & type conversion (lazy) -------------
221
262
 
222
263
  @staticmethod
223
- def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
224
- # (unchanged body)
264
+ def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
265
+ """
266
+ Process a partition to fill missing values and ensure initial data types are consistent.
267
+ This is the first step of data preparation.
268
+ """
269
+ pdf = pdf.copy() # Avoid modifying original
270
+
225
271
  for col in pdf.columns:
226
272
  s = pdf[col]
227
- if pd.api.types.is_integer_dtype(s.dtype):
273
+ dtype_str = str(s.dtype).lower()
274
+
275
+ # --- Handle PyArrow dtypes ---
276
+ if "[pyarrow]" in dtype_str:
277
+ try:
278
+ if "string" in dtype_str:
279
+ # Convert PyArrow string to object, fillna with empty string
280
+ pdf[col] = s.astype('object').fillna("")
281
+ elif "timestamp" in dtype_str:
282
+ # Convert PyArrow timestamp to pandas datetime, NaT for nulls
283
+ pdf[col] = pd.to_datetime(s, errors='coerce') # errors='coerce' handles conversion issues
284
+ elif "int" in dtype_str:
285
+ # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
286
+ pdf[col] = s.fillna(0)
287
+ elif "float" in dtype_str or "double" in dtype_str:
288
+ pdf[col] = s.fillna(0.0)
289
+ elif "bool" in dtype_str:
290
+ pdf[col] = s.fillna(False) # Or pd.NA if you prefer
291
+ else:
292
+ # Fallback: convert to object and then to string
293
+ pdf[col] = s.astype('object').astype(str).fillna("")
294
+ except Exception as e:
295
+ # If conversion fails, fall back to object and string
296
+ pdf[col] = s.astype('object').astype(str).fillna("")
297
+
298
+ # --- Handle standard pandas dtypes ---
299
+ elif pd.api.types.is_integer_dtype(s.dtype):
228
300
  if pd.api.types.is_extension_array_dtype(s.dtype):
229
301
  pdf[col] = s.fillna(pd.NA)
230
302
  else:
231
303
  pdf[col] = s.fillna(0)
232
304
  elif pd.api.types.is_bool_dtype(s.dtype):
233
- pdf[col] = s.fillna(pd.NA)
305
+ pdf[col] = s.fillna(pd.NA) # Or False
234
306
  elif pd.api.types.is_float_dtype(s.dtype):
235
307
  pdf[col] = s.fillna(0.0)
236
308
  elif pd.api.types.is_datetime64_any_dtype(s.dtype):
309
+ # Datetimes - leave as is for now, will be handled in final step
237
310
  pass
238
311
  else:
239
- pdf[col] = s.fillna("")
312
+ # For object/string/category columns, ensure they're strings
313
+ pdf[col] = s.astype(str).fillna("")
314
+
240
315
  return pdf
241
316
 
317
+ def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
318
+ """
319
+ Final conversion step: Ensure datetime columns are in a format compatible
320
+ with clickhouse-connect driver. Specifically, convert numpy.datetime64 to
321
+ pandas.Timestamp or Python datetime objects.
322
+ This is called just before insertion.
323
+ """
324
+ df = df.copy()
325
+ for col in df.columns:
326
+ s = df[col]
327
+ # Check if the column is datetime-like
328
+ if pd.api.types.is_datetime64_any_dtype(s.dtype):
329
+ # --- Robust conversion to ensure compatibility ---
330
+ # 1. Convert to pandas datetime explicitly
331
+ df[col] = pd.to_datetime(s, utc=True) # Ensures timezone handling
332
+
333
+ # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
334
+ # This is often sufficient, but let's be extra sure about the object type
335
+ # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp())
336
+ # The pd.to_datetime should handle this, but accessing .dt accessor reinforces it.
337
+ # If there are still issues, we can force object conversion:
338
+ # df[col] = df[col].dt.to_pydatetime() # Converts to numpy array of datetime64 or None
339
+ # But pd.Timestamp is better. Let's try accessing .dt to ensure it's proper:
340
+ try:
341
+ _ = df[col].dt # Accessing .dt confirms it's datetime-like
342
+ except:
343
+ # If .dt fails, it means conversion wasn't clean, force it
344
+ self.logger.debug(f"Forcing datetime conversion for column {col}")
345
+ df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
346
+
347
+ # --- Final check and explicit conversion if needed ---
348
+ # If the error persists, we might need to explicitly convert the array elements.
349
+ # Let's add a check for the first non-null element in a sample:
350
+ sample_series = df[col].dropna()
351
+ if len(sample_series) > 0:
352
+ first_val = sample_series.iloc[0]
353
+ if isinstance(first_val, np.datetime64):
354
+ self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
355
+ # Force conversion to object array of pandas.Timestamp or None
356
+ def convert_val(v):
357
+ if pd.isna(v):
358
+ return None
359
+ if isinstance(v, np.datetime64):
360
+ # Convert numpy.datetime64 to pandas.Timestamp
361
+ return pd.Timestamp(v)
362
+ return v
363
+ df[col] = df[col].apply(convert_val)
364
+
365
+ return df
366
+
367
+
242
368
  # ------------- low-level helpers -------------
243
369
 
244
370
  def _get_client(self):
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
284
410
  finally:
285
411
  if hasattr(self._tlocal, "client"):
286
412
  delattr(self._tlocal, "client")
287
-
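The behavioral core of this change is the datetime normalization performed before insert_df. A self-contained sketch of the fallback conversion applied when raw numpy.datetime64 values survive in an object-typed column; the toy Series and column contents are illustrative:

import numpy as np
import pandas as pd

# Object-dtype column still holding raw numpy.datetime64 values, as can happen
# after schema alignment mixes None placeholders with datetime data.
s = pd.Series([np.datetime64("2025-09-06T10:00:00"), None], dtype="object")


def convert_val(v):
    if pd.isna(v):
        return None                 # NaT/None stay null
    if isinstance(v, np.datetime64):
        return pd.Timestamp(v)      # rewrap as a type clickhouse-connect can serialize
    return v


print(s.apply(convert_val).tolist())
# [Timestamp('2025-09-06 10:00:00'), None]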
sibi_dst/utils/dask_utils.py CHANGED
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
31
31
  k = min(max(sample, 1), ddf.npartitions)
32
32
  probes = dask.compute(*[
33
33
  ddf.get_partition(i).map_partitions(len) for i in range(k)
34
- ])
34
+ ], scheduler="threads")
35
35
 
36
36
  if any(_to_int_safe(n) > 0 for n in probes):
37
37
  return False
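The only change here pins the emptiness probes to the local threaded scheduler. A minimal sketch of the same probe pattern against a toy dataframe (the dataframe itself is illustrative):

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=4)

sample = 4
k = min(max(sample, 1), ddf.npartitions)
probes = dask.compute(
    *[ddf.get_partition(i).map_partitions(len) for i in range(k)],
    scheduler="threads",  # bypasses any globally registered scheduler (e.g. a distributed client)
)
print(probes)  # one row-count probe per sampled partition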
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.9.4
3
+ Version: 2025.9.6
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -35,19 +35,21 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
35
35
  sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
36
36
  sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
37
37
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ sibi_dst/tests/test_baseclass.py,sha256=5huAwjWo_SOEZR2_0y5w9qUmw5G7pVdm8X1OTG87JK0,11562
38
39
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
39
40
  sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
40
41
  sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
41
- sibi_dst/utils/base.py,sha256=W501bJFjpgElPBo9Xp7SkgFj-oGPXXfFE25Br0dZqxc,25470
42
- sibi_dst/utils/boilerplate/__init__.py,sha256=998ptGqawJl79WZA-UEeTyBhvc-ClENzXrMaCSWsrL4,295
42
+ sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
43
+ sibi_dst/utils/boilerplate/__init__.py,sha256=zgkQ50-cKmRugOz1bHqhjVXb3Hb8rsIwN7d5-kVsRls,370
43
44
  sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
44
45
  sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
45
46
  sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
46
47
  sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
48
+ sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
47
49
  sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
48
- sibi_dst/utils/clickhouse_writer.py,sha256=JCjLfPfsDDAvoMJeh0uVqVL5Je6mPcZn-G_EL9Pk6ms,10364
50
+ sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
49
51
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
50
- sibi_dst/utils/dask_utils.py,sha256=FURwrNqij6ptxFhI4v7yaGkyOIIyW9lSPpMfE9-kxHY,1970
52
+ sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
51
53
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
52
54
  sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
53
55
  sibi_dst/utils/data_wrapper.py,sha256=axHOmCG9cBJgjf5m8jpzsCCZzXJgynGs44rGe6FUrzk,29906
@@ -91,6 +93,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
91
93
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
92
94
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
93
95
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
94
- sibi_dst-2025.9.4.dist-info/METADATA,sha256=LKtGXXgxpOR9pL7rgBuGpySdppqMGi674oH_18tVKec,2710
95
- sibi_dst-2025.9.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
96
- sibi_dst-2025.9.4.dist-info/RECORD,,
96
+ sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
97
+ sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
98
+ sibi_dst-2025.9.6.dist-info/RECORD,,