sibi-dst 2025.9.4__py3-none-any.whl → 2025.9.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/tests/test_baseclass.py ADDED
@@ -0,0 +1,403 @@
1
+ import asyncio
2
+ import json
3
+ import threading
4
+ from typing import Any, Dict
5
+ from unittest.mock import MagicMock
6
+
7
+ import fsspec
8
+
9
+ from sibi_dst.utils import Logger
10
+ from sibi_dst.utils import ManagedResource
11
+ from sibi_dst.utils.base import _QueueSSE
12
+
13
+
14
+ # ------------------------------ Test Fixtures ------------------------------
15
+
16
+ class TestResource(ManagedResource):
17
+ def __init__(self, *args, **kwargs):
18
+ super().__init__(*args, **kwargs)
19
+ self.cleanup_called = False
20
+ self.acleanup_called = False
21
+
22
+ def _cleanup(self) -> None:
23
+ self.cleanup_called = True
24
+ super()._cleanup()
25
+
26
+ async def _acleanup(self) -> None:
27
+ self.acleanup_called = True
28
+ await super()._acleanup()
29
+
30
+
31
+ class MockSSESink:
32
+ def __init__(self):
33
+ self.events = []
34
+ self.closed = False
35
+
36
+ async def send(self, event: str, data: Dict[str, Any]) -> None:
37
+ self.events.append({"event": event, "data": data})
38
+
39
+ async def aclose(self) -> None:
40
+ self.closed = True
41
+
42
+
43
+ class MockSyncSSESink:
44
+ def __init__(self):
45
+ self.events = []
46
+ self.closed = False
47
+
48
+ def send(self, event: str, data: Dict[str, Any]) -> None:
49
+ self.events.append({"event": event, "data": data})
50
+
51
+ def close(self) -> None:
52
+ self.closed = True
53
+
54
+
55
+ # ------------------------------ Mock fsspec filesystem ------------------------------
56
+
57
+ class MockFileSystem(fsspec.AbstractFileSystem):
58
+ def __init__(self, **kwargs):
59
+ super().__init__(**kwargs)
60
+ self.closed = False
61
+
62
+ def close(self):
63
+ self.closed = True
64
+
65
+
66
+ # ------------------------------ Utility for Event Loop ------------------------------
67
+
68
+ def run_async_test(coro):
69
+ """Run async test safely in different environments."""
70
+ try:
71
+ # Try to get existing event loop (for Jupyter/IPython)
72
+ loop = asyncio.get_event_loop()
73
+ if loop.is_running():
74
+ # In Jupyter, create a new task
75
+ task = loop.create_task(coro)
76
+ return task
77
+ else:
78
+ return loop.run_until_complete(coro)
79
+ except RuntimeError:
80
+ # No event loop running, use asyncio.run()
81
+ return asyncio.run(coro)
82
+
83
+
84
+ # ------------------------------ Lifecycle Tests ------------------------------
85
+
86
+ def test_double_close_no_error():
87
+ """Test that calling close() multiple times doesn't raise errors."""
88
+ resource = TestResource()
89
+ resource.close()
90
+ resource.close() # Should not raise
91
+ assert resource.closed
92
+
93
+
94
+ def test_double_aclose_no_error():
95
+ """Test that calling aclose() multiple times doesn't raise errors."""
96
+ async def test():
97
+ resource = TestResource()
98
+ await resource.aclose()
99
+ await resource.aclose() # Should not raise
100
+ assert resource.closed
101
+
102
+ run_async_test(test())
103
+
104
+
105
+ def test_context_manager_sync():
106
+ """Test sync context manager behavior."""
107
+ with TestResource() as resource:
108
+ assert not resource.closed
109
+ assert resource.closed
110
+ assert resource.cleanup_called
111
+
112
+
113
+ def test_context_manager_async():
114
+ """Test async context manager behavior."""
115
+ async def test():
116
+ async with TestResource() as resource:
117
+ assert not resource.closed
118
+ assert resource.closed
119
+ assert resource.acleanup_called
120
+
121
+ run_async_test(test())
122
+
123
+
124
+ # ------------------------------ SSE Emission Tests ------------------------------
125
+
126
+ def test_auto_sse_creation():
127
+ """Test automatic SSE creation when auto_sse=True."""
128
+ resource = TestResource(auto_sse=True)
129
+ sse = resource.get_sse()
130
+ assert sse is not None
131
+ assert isinstance(sse, _QueueSSE)
132
+ assert resource._owns_sse
133
+
134
+
135
+ def test_sse_emission_with_async_sink():
136
+ """Test SSE emission with async send method."""
137
+ async def test():
138
+ sink = MockSSESink()
139
+ resource = TestResource(sse=sink)
140
+
141
+ await resource.emit("test_event", key="value")
142
+
143
+ assert len(sink.events) == 1
144
+ assert sink.events[0]["event"] == "test_event"
145
+ assert sink.events[0]["data"] == {"key": "value"}
146
+
147
+ run_async_test(test())
148
+
149
+
150
+ def test_sse_emission_with_sync_sink():
151
+ """Test SSE emission with sync send method wrapped in async."""
152
+ sink = MockSyncSSESink()
153
+ resource = TestResource(sse=sink)
154
+
155
+ async def test():
156
+ await resource.emit("test_event", key="value")
157
+
158
+ assert len(sink.events) == 1
159
+ assert sink.events[0]["event"] == "test_event"
160
+ assert sink.events[0]["data"] == {"key": "value"}
161
+
162
+ run_async_test(test())
163
+
164
+
165
+ def test_sse_put_method_support():
166
+ """Test SSE emission with put method."""
167
+ class PutSink:
168
+ def __init__(self):
169
+ self.items = []
170
+
171
+ async def put(self, item: Dict[str, Any]) -> None:
172
+ self.items.append(item)
173
+
174
+ async def test():
175
+ sink = PutSink()
176
+ resource = TestResource(sse=sink)
177
+
178
+ await resource.emit("test_event", key="value")
179
+
180
+ assert len(sink.items) == 1
181
+ item = sink.items[0]
182
+ assert item["event"] == "test_event"
183
+ assert json.loads(item["data"]) == {"key": "value"}
184
+
185
+ run_async_test(test())
186
+
187
+
188
+ def test_sse_no_emitter_no_error():
189
+ """Test that emit on resource without emitter doesn't raise."""
190
+ resource = TestResource()
191
+ # Should not raise error
192
+ async def test():
193
+ await resource.emit("test_event", key="value")
194
+
195
+ run_async_test(test())
196
+
197
+
198
+ def test_sse_emission_after_close():
199
+ """Test that emit after close is no-op."""
200
+ async def test():
201
+ sink = MockSSESink()
202
+ resource = TestResource(sse=sink)
203
+
204
+ await resource.aclose()
205
+ await resource.emit("test_event", key="value") # Should not raise
206
+
207
+ assert len(sink.events) == 0
208
+
209
+ run_async_test(test())
210
+
211
+
212
+ # ------------------------------ Cleanup Interplay Tests ------------------------------
213
+
214
+ def test_sync_cleanup_called_on_sync_close():
215
+ """Test that sync cleanup is called during sync close."""
216
+ resource = TestResource()
217
+ resource.close()
218
+ assert resource.cleanup_called
219
+ assert not resource.acleanup_called
220
+
221
+
222
+ def test_async_cleanup_called_on_async_close():
223
+ """Test that async cleanup is called during async close."""
224
+ async def test():
225
+ resource = TestResource()
226
+ await resource.aclose()
227
+ assert resource.acleanup_called
228
+ assert not resource.cleanup_called
229
+
230
+ run_async_test(test())
231
+
232
+
233
+ # ------------------------------ Logger Tests ------------------------------
234
+
235
+ def test_logger_ownership():
236
+ """Test that logger is owned when not provided externally."""
237
+ resource = TestResource()
238
+ assert resource._owns_logger
239
+ assert resource.logger is not None
240
+
241
+
242
+ def test_external_logger_not_owned():
243
+ """Test that external logger is not owned."""
244
+ external_logger = Logger.default_logger("test")
245
+ resource = TestResource(logger=external_logger)
246
+ assert not resource._owns_logger
247
+ assert resource.logger is external_logger
248
+
249
+
250
+ def test_logger_level_configuration():
251
+ """Test logger level configuration based on verbose/debug flags."""
252
+ # Default (warning level)
253
+ resource = TestResource()
254
+ assert hasattr(resource.logger, 'level')
255
+
256
+ # Verbose (info level)
257
+ resource = TestResource(verbose=True)
258
+ assert hasattr(resource.logger, 'level')
259
+
260
+ # Debug (debug level)
261
+ resource = TestResource(debug=True)
262
+ assert hasattr(resource.logger, 'level')
263
+
264
+
265
+ # ------------------------------ Lazy Instantiation Tests ------------------------------
266
+
267
+ def test_lazy_fs_instantiation():
268
+ """Test lazy filesystem instantiation via factory."""
269
+ fs_instance = MockFileSystem()
270
+ factory_called = [False]
271
+
272
+ def fs_factory():
273
+ factory_called[0] = True
274
+ return fs_instance
275
+
276
+ resource = TestResource(fs_factory=fs_factory)
277
+ assert not factory_called[0] # Not called yet
278
+
279
+ fs = resource._ensure_fs()
280
+ assert factory_called[0]
281
+ assert fs is fs_instance
282
+ assert resource.fs is fs_instance
283
+
284
+
285
+ def test_lazy_sse_instantiation():
286
+ """Test lazy SSE instantiation via factory."""
287
+ sink_instance = MockSSESink()
288
+ factory_called = [False]
289
+
290
+ def sse_factory():
291
+ factory_called[0] = True
292
+ return sink_instance
293
+
294
+ resource = TestResource(sse_factory=sse_factory)
295
+ assert not factory_called[0] # Not called yet
296
+
297
+ sse = resource._ensure_sse()
298
+ assert factory_called[0]
299
+ assert sse is sink_instance
300
+ assert resource._sse is sink_instance
301
+
302
+
303
+ def test_lazy_fs_not_called_if_fs_provided():
304
+ """Test that factory is not called if fs is provided directly."""
305
+ fs_instance = MockFileSystem()
306
+ factory = MagicMock()
307
+
308
+ resource = TestResource(fs=fs_instance, fs_factory=factory)
309
+ fs = resource._ensure_fs()
310
+
311
+ assert fs is fs_instance
312
+ factory.assert_not_called()
313
+
314
+
315
+ def test_lazy_sse_not_called_if_sse_provided():
316
+ """Test that factory is not called if sse is provided directly."""
317
+ sink_instance = MockSSESink()
318
+ factory = MagicMock()
319
+
320
+ resource = TestResource(sse=sink_instance, sse_factory=factory)
321
+ sse = resource._ensure_sse()
322
+
323
+ assert sse is sink_instance
324
+ factory.assert_not_called()
325
+
326
+
327
+ # ------------------------------ Thread Safety Tests ------------------------------
328
+
329
+ def test_thread_safe_close():
330
+ """Test that close operations are thread-safe."""
331
+ resource = TestResource()
332
+
333
+ results = []
334
+ errors = []
335
+
336
+ def close_resource():
337
+ try:
338
+ resource.close()
339
+ results.append("success")
340
+ except Exception as e:
341
+ errors.append(str(e))
342
+ results.append(f"error: {e}")
343
+
344
+ # Start multiple threads trying to close simultaneously
345
+ threads = [threading.Thread(target=close_resource) for _ in range(5)]
346
+ for t in threads:
347
+ t.start()
348
+ for t in threads:
349
+ t.join()
350
+
351
+ # Debug information
352
+ print(f"Results: {results}")
353
+ print(f"Errors: {errors}")
354
+ print(f"Resource closed: {resource.closed}")
355
+
356
+ # Should have at least one success (the first one) and no exceptions
357
+ success_count = results.count("success")
358
+ error_count = len([r for r in results if r.startswith("error")])
359
+
360
+ # At least one should succeed
361
+ assert success_count >= 1, f"Expected at least 1 success, got {success_count}"
362
+ # No errors should occur
363
+ assert error_count == 0, f"Expected 0 errors, got {error_count}"
364
+ # Resource should be closed
365
+ assert resource.closed, "Resource should be closed"
366
+
367
+
368
+ # ------------------------------ Individual Test Functions ------------------------------
369
+
370
+ # Allow running this module directly as a quick smoke test:
371
+ if __name__ == "__main__":
372
+ # Run individual tests
373
+ test_double_close_no_error()
374
+ print("✓ test_double_close_no_error passed")
375
+
376
+ test_sync_cleanup_called_on_sync_close()
377
+ print("✓ test_sync_cleanup_called_on_sync_close passed")
378
+
379
+ test_logger_ownership()
380
+ print("✓ test_logger_ownership passed")
381
+
382
+ test_external_logger_not_owned()
383
+ print("✓ test_external_logger_not_owned passed")
384
+
385
+ test_lazy_fs_instantiation()
386
+ print("✓ test_lazy_fs_instantiation passed")
387
+
388
+ test_lazy_sse_instantiation()
389
+ print("✓ test_lazy_sse_instantiation passed")
390
+
391
+ test_lazy_fs_not_called_if_fs_provided()
392
+ print("✓ test_lazy_fs_not_called_if_fs_provided passed")
393
+
394
+ test_lazy_sse_not_called_if_sse_provided()
395
+ print("✓ test_lazy_sse_not_called_if_sse_provided passed")
396
+
397
+ test_thread_safe_close()
398
+ print("✓ test_thread_safe_close passed")
399
+
400
+ test_auto_sse_creation()
401
+ print("✓ test_auto_sse_creation passed")
402
+
403
+ print("All tests completed!")
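The test module above exercises the public surface of ManagedResource: sync/async lifecycle, SSE emission, logger ownership, and lazy fs/sse factories. A minimal usage sketch of that surface, assuming only the keyword arguments and methods the tests themselves call; the LocalCache subclass, its event names, and the in-memory filesystem are illustrative, not part of the package:

import asyncio

import fsspec

from sibi_dst.utils import ManagedResource


class LocalCache(ManagedResource):
    """Illustrative subclass; a real component would wrap an actual resource."""

    async def warm(self) -> None:
        fs = self._ensure_fs()  # lazily builds the filesystem from fs_factory on first use
        await self.emit("cache_warmed", protocol=str(fs.protocol))

    async def _acleanup(self) -> None:
        await self.emit("cache_closed")
        await super()._acleanup()


async def main() -> None:
    async with LocalCache(
        auto_sse=True,                                   # owns an internal _QueueSSE sink
        fs_factory=lambda: fsspec.filesystem("memory"),  # not created until _ensure_fs()
        debug=True,
    ) as cache:
        await cache.warm()


asyncio.run(main())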
sibi_dst/utils/base.py CHANGED
@@ -441,257 +441,3 @@ class ManagedResource(abc.ABC):
441
441
  except Exception:
442
442
  pass
443
443
 
444
- ## Before SSE handling
445
-
446
- # import abc
447
- # import threading
448
- # import weakref
449
- # from typing import Self, Optional, Callable
450
- #
451
- # import fsspec
452
- #
453
- # from sibi_dst.utils import Logger
454
- #
455
- #
456
- # class ManagedResource(abc.ABC):
457
- # """
458
- # Boilerplate ABC for components that manage a logger and an optional fsspec filesystem,
459
- # with sync/async lifecycle helpers, lazy FS creation via an optional factory, and
460
- # configurable cleanup-error logging.
461
- # """
462
- #
463
- # def __init__(
464
- # self,
465
- # *,
466
- # verbose: bool = False,
467
- # debug: bool = False,
468
- # log_cleanup_errors: bool = True,
469
- # logger: Optional[Logger] = None,
470
- # fs: Optional[fsspec.AbstractFileSystem] = None,
471
- # fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None,
472
- # **_: object,
473
- # ) -> None:
474
- # # ---- Declared upfront for type checkers
475
- # self.logger: Logger
476
- # self.fs: Optional[fsspec.AbstractFileSystem] = None
477
- # self._fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None
478
- # self._owns_logger: bool = False
479
- # self._owns_fs: bool = False
480
- # self._is_closed: bool = False
481
- # self._closing: bool = False
482
- # self._close_lock = threading.RLock()
483
- #
484
- # self.verbose = verbose
485
- # self.debug = debug
486
- # self._log_cleanup_errors = log_cleanup_errors
487
- #
488
- # # ---- Logger ownership
489
- # if logger is None:
490
- # self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
491
- # self._owns_logger = True
492
- # level = Logger.DEBUG if self.debug else (Logger.INFO if self.verbose else Logger.WARNING)
493
- # self.logger.set_level(level)
494
- # else:
495
- # self.logger = logger
496
- # self._owns_logger = False # do not mutate external logger
497
- #
498
- # # ---- FS ownership & lazy creation
499
- # if fs is not None:
500
- # self.fs = fs
501
- # self._owns_fs = False
502
- # self._fs_factory = None
503
- # elif fs_factory is not None:
504
- # # Lazy: don't create until first use
505
- # self._fs_factory = fs_factory
506
- # self._owns_fs = True # we will own it *if* created
507
- # self.fs = None
508
- # else:
509
- # self.fs = None
510
- # self._owns_fs = False
511
- # self._fs_factory = None
512
- #
513
- # # Register a GC-time finalizer that does not capture self
514
- # self_ref = weakref.ref(self)
515
- # self._finalizer = weakref.finalize(self, self._finalize_static, self_ref)
516
- #
517
- # if self.debug:
518
- # try:
519
- # self.logger.debug("Component %s initialized. %s", self.__class__.__name__, repr(self))
520
- # except Exception:
521
- # pass
522
- #
523
- # # ---------- Introspection ----------
524
- # @property
525
- # def is_closed(self) -> bool:
526
- # return self._is_closed
527
- #
528
- # @property
529
- # def closed(self) -> bool: # alias
530
- # return self._is_closed
531
- #
532
- # def __repr__(self) -> str:
533
- # class_name = self.__class__.__name__
534
- # logger_status = "own" if self._owns_logger else "external"
535
- # if self.fs is None and self._fs_factory is not None:
536
- # fs_status = "own(lazy)"
537
- # elif self.fs is None:
538
- # fs_status = "none"
539
- # else:
540
- # fs_status = "own" if self._owns_fs else "external"
541
- # return (f"<{class_name} debug={self.debug} verbose={self.verbose} "
542
- # f"log_cleanup_errors={self._log_cleanup_errors} "
543
- # f"logger={logger_status} fs={fs_status}>")
544
- #
545
- # # ---------- Subclass hooks ----------
546
- # def _cleanup(self) -> None:
547
- # """Sync cleanup for resources created BY THE SUBCLASS."""
548
- # return
549
- #
550
- # async def _acleanup(self) -> None:
551
- # """Async cleanup for resources created BY THE SUBCLASS."""
552
- # return
553
- #
554
- # # ---------- FS helpers ----------
555
- # def _ensure_fs(self) -> Optional[fsspec.AbstractFileSystem]:
556
- # """Create the FS lazily if a factory was provided. Return fs (or None)."""
557
- # if self.fs is None and self._fs_factory is not None:
558
- # created = self._fs_factory()
559
- # if not isinstance(created, fsspec.AbstractFileSystem):
560
- # raise TypeError(f"fs_factory() must return fsspec.AbstractFileSystem, got {type(created)!r}")
561
- # self.fs = created
562
- # # _owns_fs already True when factory is present
563
- # return self.fs
564
- #
565
- # def require_fs(self) -> fsspec.AbstractFileSystem:
566
- # """Return a filesystem or raise if not configured/creatable."""
567
- # fs = self._ensure_fs()
568
- # if fs is None:
569
- # raise RuntimeError(
570
- # f"{self.__class__.__name__}: filesystem is required but not configured"
571
- # )
572
- # return fs
573
- #
574
- # # ---------- Shared shutdown helpers (no logging; safe for late shutdown) ----------
575
- # def _release_owned_fs(self) -> None:
576
- # if self._owns_fs:
577
- # # ensure creation state is respected even if never used
578
- # _ = self.fs or None # no-op; if never created, nothing to close
579
- # if self.fs is not None:
580
- # close = getattr(self.fs, "close", None)
581
- # try:
582
- # if callable(close):
583
- # close()
584
- # finally:
585
- # self.fs = None
586
- #
587
- # def _shutdown_logger(self) -> None:
588
- # if self._owns_logger:
589
- # try:
590
- # self.logger.shutdown()
591
- # except Exception:
592
- # pass
593
- #
594
- # def _shutdown_owned_resources(self) -> None:
595
- # self._release_owned_fs()
596
- # self._shutdown_logger()
597
- #
598
- # # ---------- Public lifecycle (sync) ----------
599
- # def close(self) -> None:
600
- # with self._close_lock:
601
- # if self._is_closed or self._closing:
602
- # return
603
- # self._closing = True
604
- #
605
- # try:
606
- # self._cleanup()
607
- # except Exception:
608
- # # Only include traceback when debug=True
609
- # if self._log_cleanup_errors:
610
- # try:
611
- # self.logger.error(
612
- # "Error during %s._cleanup()", self.__class__.__name__,
613
- # exc_info=self.debug
614
- # )
615
- # except Exception:
616
- # pass
617
- # raise
618
- # finally:
619
- # with self._close_lock:
620
- # self._is_closed = True
621
- # self._closing = False
622
- # self._shutdown_owned_resources()
623
- # if self.debug:
624
- # try:
625
- # self.logger.debug("Component %s closed.", self.__class__.__name__)
626
- # except Exception:
627
- # pass
628
- #
629
- # # ---------- Public lifecycle (async) ----------
630
- # async def aclose(self) -> None:
631
- # with self._close_lock:
632
- # if self._is_closed or self._closing:
633
- # return
634
- # self._closing = True
635
- #
636
- # try:
637
- # await self._acleanup()
638
- # except Exception:
639
- # # Only include traceback when debug=True
640
- # if self._log_cleanup_errors:
641
- # try:
642
- # self.logger.error(
643
- # "Error during %s._acleanup()", self.__class__.__name__,
644
- # exc_info=self.debug
645
- # )
646
- # except Exception:
647
- # pass
648
- # raise
649
- # finally:
650
- # with self._close_lock:
651
- # self._is_closed = True
652
- # self._closing = False
653
- # self._shutdown_owned_resources()
654
- # if self.debug:
655
- # try:
656
- # self.logger.debug("Async component %s closed.", self.__class__.__name__)
657
- # except Exception:
658
- # pass
659
- #
660
- # # ---------- Context managers ----------
661
- # def __enter__(self) -> Self:
662
- # return self
663
- #
664
- # def __exit__(self, exc_type, exc, tb) -> bool:
665
- # self.close()
666
- # return False # propagate exceptions
667
- #
668
- # async def __aenter__(self) -> Self:
669
- # return self
670
- #
671
- # async def __aexit__(self, exc_type, exc, tb) -> bool:
672
- # await self.aclose()
673
- # return False
674
- #
675
- # # ---------- Finalizer ( at Garbage Collection-time absolutely silent) ----------
676
- # @staticmethod
677
- # def _finalize_static(ref: "weakref.ReferenceType[ManagedResource]") -> None:
678
- # obj = ref()
679
- # if obj is None:
680
- # return
681
- # # No logging here; interpreter may be tearing down.
682
- # # Best-effort silent cleanup; avoid locks and context managers.
683
- # try:
684
- # if not obj._is_closed:
685
- # try:
686
- # obj._cleanup()
687
- # except Exception:
688
- # pass
689
- # obj._is_closed = True
690
- # try:
691
- # obj._shutdown_owned_resources()
692
- # except Exception:
693
- # pass
694
- # except Exception:
695
- # # do not show anything at garbage collection time
696
- # pass
697
- #
sibi_dst/utils/boilerplate/__init__.py CHANGED
@@ -2,10 +2,13 @@ from .base_parquet_artifact import BaseParquetArtifact
2
2
  from .base_data_cube import BaseDataCube
3
3
  from .base_attacher import make_attacher
4
4
  from .base_parquet_reader import BaseParquetReader
5
+ from .hybrid_data_loader import HybridDataLoader
6
+
5
7
  __all__ = [
6
8
  "BaseDataCube",
7
9
  "BaseParquetArtifact",
8
10
  "make_attacher",
9
- "BaseParquetReader"
11
+ "BaseParquetReader",
12
+ "HybridDataLoader",
10
13
  ]
11
14
 
sibi_dst/utils/boilerplate/hybrid_data_loader.py ADDED
@@ -0,0 +1,144 @@
1
+ import dask.dataframe as dd
2
+ import datetime
3
+ import pandas as pd
4
+ from typing import Optional
5
+ from sibi_dst.utils import Logger
6
+ from sibi_dst.utils.dask_utils import dask_is_empty
7
+
8
+ today = datetime.date.today()
9
+ yesterday = today - datetime.timedelta(days=1)
10
+ TODAY_STR = today.strftime('%Y-%m-%d')
11
+ YESTERDAY_STR = yesterday.strftime('%Y-%m-%d')
12
+
13
+
14
+ class HybridDataLoader:
15
+ """
16
+ A generic data loader that orchestrates loading from a historical
17
+ source and an optional live source.
18
+ """
19
+
20
+ def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
21
+ self.start_date = self._validate_date_format(start_date)
22
+ self.end_date = self._validate_date_format(end_date)
23
+ self.historical_reader = historical_reader
24
+ self.live_reader = live_reader
25
+ self.date_field = date_field
26
+
27
+ self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
28
+ self.debug = kwargs.get('debug', False)
29
+ self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
30
+
31
+ # Validate date range
32
+ self._validate_date_range()
33
+
34
+ # Determine loading strategy
35
+ self._should_read_live = self.end_date == TODAY_STR
36
+ self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
37
+ self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)
38
+
39
+ def _validate_date_format(self, date_str: str) -> str:
40
+ """Validate that date string is in correct format."""
41
+ try:
42
+ datetime.datetime.strptime(date_str, '%Y-%m-%d')
43
+ return date_str
44
+ except ValueError:
45
+ raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
46
+
47
+ def _validate_date_range(self):
48
+ """Validate that start date is not after end date."""
49
+ start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
50
+ end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
51
+ if end < start:
52
+ raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")
53
+
54
+ def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
55
+ """Forces the historical dataframe schema to match the live one."""
56
+ self.logger.debug("Aligning historical schema to match live schema.")
57
+ historical_cols = set(historical_df.columns)
58
+ live_cols = set(live_df.columns)
59
+
60
+ # Add missing columns to historical dataframe
61
+ for col in live_cols - historical_cols:
62
+ historical_df[col] = None
63
+
64
+ # Reorder columns to match live dataframe
65
+ return historical_df[list(live_df.columns)]
66
+
67
+ def _create_empty_dataframe(self) -> dd.DataFrame:
68
+ """Create an empty dask dataframe with proper structure."""
69
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
70
+
71
+ async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
72
+ """Load today's data from the live reader."""
73
+ self.logger.debug("Loading today's live data...")
74
+ date_filter = {f"{self.date_field}__date": TODAY_STR}
75
+ filters = {**kwargs, **date_filter}
76
+
77
+ try:
78
+ today_df = await self.live_reader(
79
+ logger=self.logger,
80
+ debug=self.debug
81
+ ).aload(**filters)
82
+ return today_df
83
+ except Exception as e:
84
+ self.logger.error(f"Failed to load today's data: {e}")
85
+ if not self.debug:
86
+ return None
87
+ raise
88
+
89
+ async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
90
+ """Load historical data from the historical reader."""
91
+ self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
92
+
93
+ try:
94
+ return await self.historical_reader(
95
+ parquet_start_date=start_date,
96
+ parquet_end_date=end_date,
97
+ logger=self.logger,
98
+ debug=self.debug
99
+ ).aload(**kwargs)
100
+ except Exception as e:
101
+ self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
102
+ if not self.debug:
103
+ return self._create_empty_dataframe()
104
+ raise
105
+
106
+ async def aload(self, **kwargs) -> dd.DataFrame:
107
+ """
108
+ Loads data from the historical source and, if required, the live source,
109
+ then concatenates them.
110
+ """
111
+ # Case 1: Only today's data requested
112
+ if self._is_single_today:
113
+ today_df = await self._load_today_data(**kwargs)
114
+ return today_df if today_df is not None else self._create_empty_dataframe()
115
+
116
+ # Case 2: Pure historical data (end date is not today)
117
+ if not self._should_read_live:
118
+ return await self._load_historical_data(self.start_date, self.end_date, **kwargs)
119
+
120
+ # Case 3: Mixed historical + live scenario (end date is today)
121
+ # Load historical data up to yesterday
122
+ historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
123
+
124
+ # Load today's data
125
+ today_df = await self._load_today_data(**kwargs)
126
+
127
+ # Combine dataframes
128
+ if today_df is not None and not dask_is_empty(today_df):
129
+ # Align schemas if needed
130
+ if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
131
+ try:
132
+ historical_df = self._align_schema_to_live(historical_df, today_df)
133
+ except Exception as e:
134
+ self.logger.warning(f"Failed to align schemas: {e}")
135
+
136
+ return dd.concat([historical_df, today_df], ignore_index=True)
137
+ else:
138
+ return historical_df
139
+
140
+ def __repr__(self):
141
+ return (f"HybridDataLoader(start_date='{self.start_date}', "
142
+ f"end_date='{self.end_date}', "
143
+ f"loading_live={self._should_read_live})")
144
+
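HybridDataLoader decides at construction time whether the live reader is needed (end_date equals today) and splits the range at yesterday when mixing sources. A minimal end-to-end sketch, assuming only the reader-side contract visible above — a class instantiated with parquet_start_date/parquet_end_date (historical) or logger/debug (live) that exposes an async aload(**filters); the Fake* readers and their data are illustrative stand-ins:

import asyncio
import datetime

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.boilerplate import HybridDataLoader


class FakeHistoricalReader:
    """Stand-in for a parquet-backed reader; real readers are wired the same way."""
    def __init__(self, parquet_start_date, parquet_end_date, logger=None, debug=False):
        self.start, self.end = parquet_start_date, parquet_end_date

    async def aload(self, **filters):
        pdf = pd.DataFrame({"order_date": [self.start], "status": ["closed"]})
        return dd.from_pandas(pdf, npartitions=1)


class FakeLiveReader:
    """Stand-in for a live reader; receives the date filter built by the loader."""
    def __init__(self, logger=None, debug=False):
        pass

    async def aload(self, **filters):
        pdf = pd.DataFrame({"order_date": [str(datetime.date.today())], "status": ["open"]})
        return dd.from_pandas(pdf, npartitions=1)


async def main():
    loader = HybridDataLoader(
        start_date="2025-09-01",
        end_date=str(datetime.date.today()),   # ends today -> historical + live are combined
        historical_reader=FakeHistoricalReader,
        live_reader=FakeLiveReader,
        date_field="order_date",
        debug=True,
    )
    ddf = await loader.aload()   # extra kwargs would be forwarded to the readers as filters
    print(ddf.compute())


asyncio.run(main())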
sibi_dst/utils/clickhouse_writer.py CHANGED
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
7
7
  import pandas as pd
8
8
  import dask.dataframe as dd
9
9
  import clickhouse_connect
10
+ import numpy as np
10
11
 
11
12
  from . import ManagedResource
12
13
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
27
28
  - Optional overwrite (drop + recreate)
28
29
  - Partitioned, batched inserts
29
30
  - Per-thread clients to avoid session conflicts
31
+ - Proper PyArrow dtype handling
30
32
  """
31
33
 
32
34
  # Default dtype mapping (pandas/dask → ClickHouse)
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
109
111
  return
110
112
 
111
113
  # lazily fill missing values per-partition (no global compute)
112
- df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
114
+ # Use the new method that ensures correct types for ClickHouse
115
+ df = df.map_partitions(
116
+ type(self)._process_partition_for_clickhouse_compatible,
117
+ meta=df._meta
118
+ )
113
119
 
114
120
  # (re)create table
115
121
  ow = self.overwrite if overwrite is None else bool(overwrite)
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
121
127
  self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
122
128
  self.logger.info(f"Dropped table {self.table} (overwrite=True)")
123
129
 
124
- create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
130
+ create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
125
131
  self._command(create_sql)
126
132
  self.logger.info(f"Ensured table {self.table} exists")
127
133
 
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
159
165
  return ", ".join(pieces)
160
166
 
161
167
  def _map_dtype(self, dtype: Any) -> str:
168
+ dtype_str = str(dtype).lower()
169
+ # Handle PyArrow dtypes
170
+ if "[pyarrow]" in dtype_str:
171
+ if "int64" in dtype_str:
172
+ return "Int64"
173
+ elif "int32" in dtype_str:
174
+ return "Int32"
175
+ elif "float64" in dtype_str or "double" in dtype_str:
176
+ return "Float64"
177
+ elif "float32" in dtype_str:
178
+ return "Float32"
179
+ elif "bool" in dtype_str:
180
+ return "UInt8"
181
+ elif "timestamp" in dtype_str: # PyArrow timestamp
182
+ return "DateTime"
183
+ elif "string" in dtype_str: # PyArrow string
184
+ return "String"
185
+ else:
186
+ return "String" # fallback
187
+
162
188
  # Handle pandas extension dtypes explicitly
163
189
  if isinstance(dtype, pd.Int64Dtype):
164
190
  return "Int64"
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
170
196
  return "Float64"
171
197
  if isinstance(dtype, pd.StringDtype):
172
198
  return "String"
173
- if "datetime64" in str(dtype):
199
+ if "datetime64" in dtype_str:
174
200
  return "DateTime"
175
201
 
176
202
  return self.DTYPE_MAP.get(str(dtype), "String")
177
203
 
178
204
  def _should_mark_nullable(self, dtype: Any) -> bool:
179
- s = str(dtype)
205
+ dtype_str = str(dtype).lower()
206
+ # PyArrow types are generally nullable, but let's be specific
207
+ if "[pyarrow]" in dtype_str:
208
+ # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
209
+ base_type = dtype_str.replace("[pyarrow]", "")
210
+ if base_type in ["string", "large_string"] or "timestamp" in base_type:
211
+ return True
212
+ # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
213
+ # Let's default to not nullable for numeric unless explicitly needed
214
+ return False # Conservative for PyArrow numerics
215
+
180
216
  if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
181
217
  return True
182
- if "datetime64" in s:
218
+ if "datetime64" in dtype_str:
183
219
  return True
184
220
  # object/category almost always nullable
185
- if s in ("object", "category", "string"):
221
+ if dtype_str in ("object", "category", "string"):
186
222
  return True
187
223
  return False
188
224
 
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
203
239
  # Ensure column ordering is stable
204
240
  cols = list(pdf.columns)
205
241
 
242
+ # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
243
+ # This is the key step to prevent the numpy.datetime64 error
244
+ pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
245
+
206
246
  # Split into batches (to avoid giant single insert)
207
247
  for start in range(0, len(pdf), self.insert_chunksize):
208
248
  batch = pdf.iloc[start:start + self.insert_chunksize]
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
215
255
  def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
216
256
  client = self._get_client()
217
257
  # clickhouse-connect supports insert_df
258
+ # The df passed here should now have compatible datetime types
218
259
  client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
219
260
 
220
- # ------------- missing values (lazy) -------------
261
+ # ------------- missing values & type conversion (lazy) -------------
221
262
 
222
263
  @staticmethod
223
- def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
224
- # (unchanged body)
264
+ def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
265
+ """
266
+ Process a partition to fill missing values and ensure initial data types are consistent.
267
+ This is the first step of data preparation.
268
+ """
269
+ pdf = pdf.copy() # Avoid modifying original
270
+
225
271
  for col in pdf.columns:
226
272
  s = pdf[col]
227
- if pd.api.types.is_integer_dtype(s.dtype):
273
+ dtype_str = str(s.dtype).lower()
274
+
275
+ # --- Handle PyArrow dtypes ---
276
+ if "[pyarrow]" in dtype_str:
277
+ try:
278
+ if "string" in dtype_str:
279
+ # Convert PyArrow string to object, fillna with empty string
280
+ pdf[col] = s.astype('object').fillna("")
281
+ elif "timestamp" in dtype_str:
282
+ # Convert PyArrow timestamp to pandas datetime, NaT for nulls
283
+ pdf[col] = pd.to_datetime(s, errors='coerce') # errors='coerce' handles conversion issues
284
+ elif "int" in dtype_str:
285
+ # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
286
+ pdf[col] = s.fillna(0)
287
+ elif "float" in dtype_str or "double" in dtype_str:
288
+ pdf[col] = s.fillna(0.0)
289
+ elif "bool" in dtype_str:
290
+ pdf[col] = s.fillna(False) # Or pd.NA if you prefer
291
+ else:
292
+ # Fallback: convert to object and then to string
293
+ pdf[col] = s.astype('object').astype(str).fillna("")
294
+ except Exception as e:
295
+ # If conversion fails, fall back to object and string
296
+ pdf[col] = s.astype('object').astype(str).fillna("")
297
+
298
+ # --- Handle standard pandas dtypes ---
299
+ elif pd.api.types.is_integer_dtype(s.dtype):
228
300
  if pd.api.types.is_extension_array_dtype(s.dtype):
229
301
  pdf[col] = s.fillna(pd.NA)
230
302
  else:
231
303
  pdf[col] = s.fillna(0)
232
304
  elif pd.api.types.is_bool_dtype(s.dtype):
233
- pdf[col] = s.fillna(pd.NA)
305
+ pdf[col] = s.fillna(pd.NA) # Or False
234
306
  elif pd.api.types.is_float_dtype(s.dtype):
235
307
  pdf[col] = s.fillna(0.0)
236
308
  elif pd.api.types.is_datetime64_any_dtype(s.dtype):
309
+ # Datetimes - leave as is for now, will be handled in final step
237
310
  pass
238
311
  else:
239
- pdf[col] = s.fillna("")
312
+ # For object/string/category columns, ensure they're strings
313
+ pdf[col] = s.astype(str).fillna("")
314
+
240
315
  return pdf
241
316
 
317
+ def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
318
+ """
319
+ Final conversion step: Ensure datetime columns are in a format compatible
320
+ with clickhouse-connect driver. Specifically, convert numpy.datetime64 to
321
+ pandas.Timestamp or Python datetime objects.
322
+ This is called just before insertion.
323
+ """
324
+ df = df.copy()
325
+ for col in df.columns:
326
+ s = df[col]
327
+ # Check if the column is datetime-like
328
+ if pd.api.types.is_datetime64_any_dtype(s.dtype):
329
+ # --- Robust conversion to ensure compatibility ---
330
+ # 1. Convert to pandas datetime explicitly
331
+ df[col] = pd.to_datetime(s, utc=True) # Ensures timezone handling
332
+
333
+ # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
334
+ # This is often sufficient, but let's be extra sure about the object type
335
+ # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp())
336
+ # The pd.to_datetime should handle this, but accessing .dt accessor reinforces it.
337
+ # If there are still issues, we can force object conversion:
338
+ # df[col] = df[col].dt.to_pydatetime() # Converts to numpy array of datetime64 or None
339
+ # But pd.Timestamp is better. Let's try accessing .dt to ensure it's proper:
340
+ try:
341
+ _ = df[col].dt # Accessing .dt confirms it's datetime-like
342
+ except:
343
+ # If .dt fails, it means conversion wasn't clean, force it
344
+ self.logger.debug(f"Forcing datetime conversion for column {col}")
345
+ df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
346
+
347
+ # --- Final check and explicit conversion if needed ---
348
+ # If the error persists, we might need to explicitly convert the array elements.
349
+ # Let's add a check for the first non-null element in a sample:
350
+ sample_series = df[col].dropna()
351
+ if len(sample_series) > 0:
352
+ first_val = sample_series.iloc[0]
353
+ if isinstance(first_val, np.datetime64):
354
+ self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
355
+ # Force conversion to object array of pandas.Timestamp or None
356
+ def convert_val(v):
357
+ if pd.isna(v):
358
+ return None
359
+ if isinstance(v, np.datetime64):
360
+ # Convert numpy.datetime64 to pandas.Timestamp
361
+ return pd.Timestamp(v)
362
+ return v
363
+ df[col] = df[col].apply(convert_val)
364
+
365
+ return df
366
+
367
+
242
368
  # ------------- low-level helpers -------------
243
369
 
244
370
  def _get_client(self):
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
284
410
  finally:
285
411
  if hasattr(self._tlocal, "client"):
286
412
  delattr(self._tlocal, "client")
287
-
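The behavioral core of this change is the datetime normalization performed before insert_df. A self-contained sketch of the fallback conversion applied when raw numpy.datetime64 values survive in an object-typed column; the toy Series and column contents are illustrative:

import numpy as np
import pandas as pd

# Object-dtype column still holding raw numpy.datetime64 values, as can happen
# after schema alignment mixes None placeholders with datetime data.
s = pd.Series([np.datetime64("2025-09-06T10:00:00"), None], dtype="object")


def convert_val(v):
    if pd.isna(v):
        return None                 # NaT/None stay null
    if isinstance(v, np.datetime64):
        return pd.Timestamp(v)      # rewrap as a type clickhouse-connect can serialize
    return v


print(s.apply(convert_val).tolist())
# [Timestamp('2025-09-06 10:00:00'), None]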
sibi_dst/utils/dask_utils.py CHANGED
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
31
31
  k = min(max(sample, 1), ddf.npartitions)
32
32
  probes = dask.compute(*[
33
33
  ddf.get_partition(i).map_partitions(len) for i in range(k)
34
- ])
34
+ ], scheduler="threads")
35
35
 
36
36
  if any(_to_int_safe(n) > 0 for n in probes):
37
37
  return False
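The only change here pins the emptiness probes to the local threaded scheduler. A minimal sketch of the same probe pattern against a toy dataframe (the dataframe itself is illustrative):

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=4)

sample = 4
k = min(max(sample, 1), ddf.npartitions)
probes = dask.compute(
    *[ddf.get_partition(i).map_partitions(len) for i in range(k)],
    scheduler="threads",  # bypasses any globally registered scheduler (e.g. a distributed client)
)
print(probes)  # one row-count probe per sampled partition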
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.9.4
3
+ Version: 2025.9.6
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -35,19 +35,21 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
35
35
  sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
36
36
  sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
37
37
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ sibi_dst/tests/test_baseclass.py,sha256=5huAwjWo_SOEZR2_0y5w9qUmw5G7pVdm8X1OTG87JK0,11562
38
39
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
39
40
  sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
40
41
  sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
41
- sibi_dst/utils/base.py,sha256=W501bJFjpgElPBo9Xp7SkgFj-oGPXXfFE25Br0dZqxc,25470
42
- sibi_dst/utils/boilerplate/__init__.py,sha256=998ptGqawJl79WZA-UEeTyBhvc-ClENzXrMaCSWsrL4,295
42
+ sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
43
+ sibi_dst/utils/boilerplate/__init__.py,sha256=zgkQ50-cKmRugOz1bHqhjVXb3Hb8rsIwN7d5-kVsRls,370
43
44
  sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
44
45
  sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
45
46
  sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
46
47
  sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
48
+ sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
47
49
  sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
48
- sibi_dst/utils/clickhouse_writer.py,sha256=JCjLfPfsDDAvoMJeh0uVqVL5Je6mPcZn-G_EL9Pk6ms,10364
50
+ sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
49
51
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
50
- sibi_dst/utils/dask_utils.py,sha256=FURwrNqij6ptxFhI4v7yaGkyOIIyW9lSPpMfE9-kxHY,1970
52
+ sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
51
53
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
52
54
  sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
53
55
  sibi_dst/utils/data_wrapper.py,sha256=axHOmCG9cBJgjf5m8jpzsCCZzXJgynGs44rGe6FUrzk,29906
@@ -91,6 +93,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
91
93
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
92
94
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
93
95
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
94
- sibi_dst-2025.9.4.dist-info/METADATA,sha256=LKtGXXgxpOR9pL7rgBuGpySdppqMGi674oH_18tVKec,2710
95
- sibi_dst-2025.9.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
96
- sibi_dst-2025.9.4.dist-info/RECORD,,
96
+ sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
97
+ sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
98
+ sibi_dst-2025.9.6.dist-info/RECORD,,