sibi-dst 2025.9.4__py3-none-any.whl → 2025.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/tests/test_baseclass.py +403 -0
- sibi_dst/utils/base.py +0 -254
- sibi_dst/utils/boilerplate/__init__.py +4 -1
- sibi_dst/utils/boilerplate/hybrid_data_loader.py +144 -0
- sibi_dst/utils/clickhouse_writer.py +138 -13
- sibi_dst/utils/dask_utils.py +1 -1
- {sibi_dst-2025.9.4.dist-info → sibi_dst-2025.9.6.dist-info}/METADATA +1 -1
- {sibi_dst-2025.9.4.dist-info → sibi_dst-2025.9.6.dist-info}/RECORD +9 -7
- {sibi_dst-2025.9.4.dist-info → sibi_dst-2025.9.6.dist-info}/WHEEL +0 -0
sibi_dst/tests/test_baseclass.py
ADDED
@@ -0,0 +1,403 @@
+import asyncio
+import json
+import threading
+from typing import Any, Dict
+from unittest.mock import MagicMock
+
+import fsspec
+
+from sibi_dst.utils import Logger
+from sibi_dst.utils import ManagedResource
+from sibi_dst.utils.base import _QueueSSE  # Replace 'your_module' with actual module name
+
+
+# ------------------------------ Test Fixtures ------------------------------
+
+class TestResource(ManagedResource):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.cleanup_called = False
+        self.acleanup_called = False
+
+    def _cleanup(self) -> None:
+        self.cleanup_called = True
+        super()._cleanup()
+
+    async def _acleanup(self) -> None:
+        self.acleanup_called = True
+        await super()._acleanup()
+
+
+class MockSSESink:
+    def __init__(self):
+        self.events = []
+        self.closed = False
+
+    async def send(self, event: str, data: Dict[str, Any]) -> None:
+        self.events.append({"event": event, "data": data})
+
+    async def aclose(self) -> None:
+        self.closed = True
+
+
+class MockSyncSSESink:
+    def __init__(self):
+        self.events = []
+        self.closed = False
+
+    def send(self, event: str, data: Dict[str, Any]) -> None:
+        self.events.append({"event": event, "data": data})
+
+    def close(self) -> None:
+        self.closed = True
+
+
+# ------------------------------ Mock fsspec filesystem ------------------------------
+
+class MockFileSystem(fsspec.AbstractFileSystem):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.closed = False
+
+    def close(self):
+        self.closed = True
+
+
+# ------------------------------ Utility for Event Loop ------------------------------
+
+def run_async_test(coro):
+    """Run async test safely in different environments."""
+    try:
+        # Try to get existing event loop (for Jupyter/IPython)
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # In Jupyter, create a new task
+            task = loop.create_task(coro)
+            return task
+        else:
+            return loop.run_until_complete(coro)
+    except RuntimeError:
+        # No event loop running, use asyncio.run()
+        return asyncio.run(coro)
+
+
+# ------------------------------ Lifecycle Tests ------------------------------
+
+def test_double_close_no_error():
+    """Test that calling close() multiple times doesn't raise errors."""
+    resource = TestResource()
+    resource.close()
+    resource.close()  # Should not raise
+    assert resource.closed
+
+
+def test_double_aclose_no_error():
+    """Test that calling aclose() multiple times doesn't raise errors."""
+    async def test():
+        resource = TestResource()
+        await resource.aclose()
+        await resource.aclose()  # Should not raise
+        assert resource.closed
+
+    run_async_test(test())
+
+
+def test_context_manager_sync():
+    """Test sync context manager behavior."""
+    with TestResource() as resource:
+        assert not resource.closed
+    assert resource.closed
+    assert resource.cleanup_called
+
+
+def test_context_manager_async():
+    """Test async context manager behavior."""
+    async def test():
+        async with TestResource() as resource:
+            assert not resource.closed
+        assert resource.closed
+        assert resource.acleanup_called
+
+    run_async_test(test())
+
+
+# ------------------------------ SSE Emission Tests ------------------------------
+
+def test_auto_sse_creation():
+    """Test automatic SSE creation when auto_sse=True."""
+    resource = TestResource(auto_sse=True)
+    sse = resource.get_sse()
+    assert sse is not None
+    assert isinstance(sse, _QueueSSE)
+    assert resource._owns_sse
+
+
+def test_sse_emission_with_async_sink():
+    """Test SSE emission with async send method."""
+    async def test():
+        sink = MockSSESink()
+        resource = TestResource(sse=sink)
+
+        await resource.emit("test_event", key="value")
+
+        assert len(sink.events) == 1
+        assert sink.events[0]["event"] == "test_event"
+        assert sink.events[0]["data"] == {"key": "value"}
+
+    run_async_test(test())
+
+
+def test_sse_emission_with_sync_sink():
+    """Test SSE emission with sync send method wrapped in async."""
+    sink = MockSyncSSESink()
+    resource = TestResource(sse=sink)
+
+    async def test():
+        await resource.emit("test_event", key="value")
+
+        assert len(sink.events) == 1
+        assert sink.events[0]["event"] == "test_event"
+        assert sink.events[0]["data"] == {"key": "value"}
+
+    run_async_test(test())
+
+
+def test_sse_put_method_support():
+    """Test SSE emission with put method."""
+    class PutSink:
+        def __init__(self):
+            self.items = []
+
+        async def put(self, item: Dict[str, Any]) -> None:
+            self.items.append(item)
+
+    async def test():
+        sink = PutSink()
+        resource = TestResource(sse=sink)
+
+        await resource.emit("test_event", key="value")
+
+        assert len(sink.items) == 1
+        item = sink.items[0]
+        assert item["event"] == "test_event"
+        assert json.loads(item["data"]) == {"key": "value"}
+
+    run_async_test(test())
+
+
+def test_sse_no_emitter_no_error():
+    """Test that emit on resource without emitter doesn't raise."""
+    resource = TestResource()
+    # Should not raise error
+    async def test():
+        await resource.emit("test_event", key="value")
+
+    run_async_test(test())
+
+
+def test_sse_emission_after_close():
+    """Test that emit after close is no-op."""
+    async def test():
+        sink = MockSSESink()
+        resource = TestResource(sse=sink)
+
+        await resource.aclose()
+        await resource.emit("test_event", key="value")  # Should not raise
+
+        assert len(sink.events) == 0
+
+    run_async_test(test())
+
+
+# ------------------------------ Cleanup Interplay Tests ------------------------------
+
+def test_sync_cleanup_called_on_sync_close():
+    """Test that sync cleanup is called during sync close."""
+    resource = TestResource()
+    resource.close()
+    assert resource.cleanup_called
+    assert not resource.acleanup_called
+
+
+def test_async_cleanup_called_on_async_close():
+    """Test that async cleanup is called during async close."""
+    async def test():
+        resource = TestResource()
+        await resource.aclose()
+        assert resource.acleanup_called
+        assert not resource.cleanup_called
+
+    run_async_test(test())
+
+
+# ------------------------------ Logger Tests ------------------------------
+
+def test_logger_ownership():
+    """Test that logger is owned when not provided externally."""
+    resource = TestResource()
+    assert resource._owns_logger
+    assert resource.logger is not None
+
+
+def test_external_logger_not_owned():
+    """Test that external logger is not owned."""
+    external_logger = Logger.default_logger("test")
+    resource = TestResource(logger=external_logger)
+    assert not resource._owns_logger
+    assert resource.logger is external_logger
+
+
+def test_logger_level_configuration():
+    """Test logger level configuration based on verbose/debug flags."""
+    # Default (warning level)
+    resource = TestResource()
+    assert hasattr(resource.logger, 'level')
+
+    # Verbose (info level)
+    resource = TestResource(verbose=True)
+    assert hasattr(resource.logger, 'level')
+
+    # Debug (debug level)
+    resource = TestResource(debug=True)
+    assert hasattr(resource.logger, 'level')
+
+
+# ------------------------------ Lazy Instantiation Tests ------------------------------
+
+def test_lazy_fs_instantiation():
+    """Test lazy filesystem instantiation via factory."""
+    fs_instance = MockFileSystem()
+    factory_called = [False]
+
+    def fs_factory():
+        factory_called[0] = True
+        return fs_instance
+
+    resource = TestResource(fs_factory=fs_factory)
+    assert not factory_called[0]  # Not called yet
+
+    fs = resource._ensure_fs()
+    assert factory_called[0]
+    assert fs is fs_instance
+    assert resource.fs is fs_instance
+
+
+def test_lazy_sse_instantiation():
+    """Test lazy SSE instantiation via factory."""
+    sink_instance = MockSSESink()
+    factory_called = [False]
+
+    def sse_factory():
+        factory_called[0] = True
+        return sink_instance
+
+    resource = TestResource(sse_factory=sse_factory)
+    assert not factory_called[0]  # Not called yet
+
+    sse = resource._ensure_sse()
+    assert factory_called[0]
+    assert sse is sink_instance
+    assert resource._sse is sink_instance
+
+
+def test_lazy_fs_not_called_if_fs_provided():
+    """Test that factory is not called if fs is provided directly."""
+    fs_instance = MockFileSystem()
+    factory = MagicMock()
+
+    resource = TestResource(fs=fs_instance, fs_factory=factory)
+    fs = resource._ensure_fs()
+
+    assert fs is fs_instance
+    factory.assert_not_called()
+
+
+def test_lazy_sse_not_called_if_sse_provided():
+    """Test that factory is not called if sse is provided directly."""
+    sink_instance = MockSSESink()
+    factory = MagicMock()
+
+    resource = TestResource(sse=sink_instance, sse_factory=factory)
+    sse = resource._ensure_sse()
+
+    assert sse is sink_instance
+    factory.assert_not_called()
+
+
+# ------------------------------ Thread Safety Tests ------------------------------
+
+def test_thread_safe_close():
+    """Test that close operations are thread-safe."""
+    resource = TestResource()
+
+    results = []
+    errors = []
+
+    def close_resource():
+        try:
+            resource.close()
+            results.append("success")
+        except Exception as e:
+            errors.append(str(e))
+            results.append(f"error: {e}")
+
+    # Start multiple threads trying to close simultaneously
+    threads = [threading.Thread(target=close_resource) for _ in range(5)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    # Debug information
+    print(f"Results: {results}")
+    print(f"Errors: {errors}")
+    print(f"Resource closed: {resource.closed}")
+
+    # Should have at least one success (the first one) and no exceptions
+    success_count = results.count("success")
+    error_count = len([r for r in results if r.startswith("error")])
+
+    # At least one should succeed
+    assert success_count >= 1, f"Expected at least 1 success, got {success_count}"
+    # No errors should occur
+    assert error_count == 0, f"Expected 0 errors, got {error_count}"
+    # Resource should be closed
+    assert resource.closed, "Resource should be closed"
+
+
+# ------------------------------ Individual Test Functions ------------------------------
+
+# You can now run individual tests like this:
+if __name__ == "__main__":
+    # Run individual tests
+    test_double_close_no_error()
+    print("✓ test_double_close_no_error passed")
+
+    test_sync_cleanup_called_on_sync_close()
+    print("✓ test_sync_cleanup_called_on_sync_close passed")
+
+    test_logger_ownership()
+    print("✓ test_logger_ownership passed")
+
+    test_external_logger_not_owned()
+    print("✓ test_external_logger_not_owned passed")
+
+    test_lazy_fs_instantiation()
+    print("✓ test_lazy_fs_instantiation passed")
+
+    test_lazy_sse_instantiation()
+    print("✓ test_lazy_sse_instantiation passed")
+
+    test_lazy_fs_not_called_if_fs_provided()
+    print("✓ test_lazy_fs_not_called_if_fs_provided passed")
+
+    test_lazy_sse_not_called_if_sse_provided()
+    print("✓ test_lazy_sse_not_called_if_sse_provided passed")
+
+    test_thread_safe_close()
+    print("✓ test_thread_safe_close passed")
+
+    test_auto_sse_creation()
+    print("✓ test_auto_sse_creation passed")
+
+    print("All tests completed!")
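Note on running these tests: the async cases are driven through the run_async_test helper above so the module works under plain python, pytest, or an already-running Jupyter event loop. A minimal sketch of the same double-aclose check expressed with pytest-asyncio instead (pytest and the pytest-asyncio plugin are assumed here; neither is declared anywhere in this diff):

import pytest

@pytest.mark.asyncio
async def test_double_aclose_no_error_pytest():
    resource = TestResource()      # TestResource fixture class from the file above
    await resource.aclose()
    await resource.aclose()        # second aclose() must be a silent no-op
    assert resource.closed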
sibi_dst/utils/base.py
CHANGED
@@ -441,257 +441,3 @@ class ManagedResource(abc.ABC):
         except Exception:
             pass
 
-## Before SSE handling
-
-# import abc
-# import threading
-# import weakref
-# from typing import Self, Optional, Callable
-#
-# import fsspec
-#
-# from sibi_dst.utils import Logger
-#
-#
-# class ManagedResource(abc.ABC):
-#     """
-#     Boilerplate ABC for components that manage a logger and an optional fsspec filesystem,
-#     with sync/async lifecycle helpers, lazy FS creation via an optional factory, and
-#     configurable cleanup-error logging.
-#     """
-#
-#     def __init__(
-#             self,
-#             *,
-#             verbose: bool = False,
-#             debug: bool = False,
-#             log_cleanup_errors: bool = True,
-#             logger: Optional[Logger] = None,
-#             fs: Optional[fsspec.AbstractFileSystem] = None,
-#             fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None,
-#             **_: object,
-#     ) -> None:
-#         # ---- Declared upfront for type checkers
-#         self.logger: Logger
-#         self.fs: Optional[fsspec.AbstractFileSystem] = None
-#         self._fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None
-#         self._owns_logger: bool = False
-#         self._owns_fs: bool = False
-#         self._is_closed: bool = False
-#         self._closing: bool = False
-#         self._close_lock = threading.RLock()
-#
-#         self.verbose = verbose
-#         self.debug = debug
-#         self._log_cleanup_errors = log_cleanup_errors
-#
-#         # ---- Logger ownership
-#         if logger is None:
-#             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-#             self._owns_logger = True
-#             level = Logger.DEBUG if self.debug else (Logger.INFO if self.verbose else Logger.WARNING)
-#             self.logger.set_level(level)
-#         else:
-#             self.logger = logger
-#             self._owns_logger = False  # do not mutate external logger
-#
-#         # ---- FS ownership & lazy creation
-#         if fs is not None:
-#             self.fs = fs
-#             self._owns_fs = False
-#             self._fs_factory = None
-#         elif fs_factory is not None:
-#             # Lazy: don't create until first use
-#             self._fs_factory = fs_factory
-#             self._owns_fs = True  # we will own it *if* created
-#             self.fs = None
-#         else:
-#             self.fs = None
-#             self._owns_fs = False
-#             self._fs_factory = None
-#
-#         # Register a GC-time finalizer that does not capture self
-#         self_ref = weakref.ref(self)
-#         self._finalizer = weakref.finalize(self, self._finalize_static, self_ref)
-#
-#         if self.debug:
-#             try:
-#                 self.logger.debug("Component %s initialized. %s", self.__class__.__name__, repr(self))
-#             except Exception:
-#                 pass
-#
-#     # ---------- Introspection ----------
-#     @property
-#     def is_closed(self) -> bool:
-#         return self._is_closed
-#
-#     @property
-#     def closed(self) -> bool:  # alias
-#         return self._is_closed
-#
-#     def __repr__(self) -> str:
-#         class_name = self.__class__.__name__
-#         logger_status = "own" if self._owns_logger else "external"
-#         if self.fs is None and self._fs_factory is not None:
-#             fs_status = "own(lazy)"
-#         elif self.fs is None:
-#             fs_status = "none"
-#         else:
-#             fs_status = "own" if self._owns_fs else "external"
-#         return (f"<{class_name} debug={self.debug} verbose={self.verbose} "
-#                 f"log_cleanup_errors={self._log_cleanup_errors} "
-#                 f"logger={logger_status} fs={fs_status}>")
-#
-#     # ---------- Subclass hooks ----------
-#     def _cleanup(self) -> None:
-#         """Sync cleanup for resources created BY THE SUBCLASS."""
-#         return
-#
-#     async def _acleanup(self) -> None:
-#         """Async cleanup for resources created BY THE SUBCLASS."""
-#         return
-#
-#     # ---------- FS helpers ----------
-#     def _ensure_fs(self) -> Optional[fsspec.AbstractFileSystem]:
-#         """Create the FS lazily if a factory was provided. Return fs (or None)."""
-#         if self.fs is None and self._fs_factory is not None:
-#             created = self._fs_factory()
-#             if not isinstance(created, fsspec.AbstractFileSystem):
-#                 raise TypeError(f"fs_factory() must return fsspec.AbstractFileSystem, got {type(created)!r}")
-#             self.fs = created
-#             # _owns_fs already True when factory is present
-#         return self.fs
-#
-#     def require_fs(self) -> fsspec.AbstractFileSystem:
-#         """Return a filesystem or raise if not configured/creatable."""
-#         fs = self._ensure_fs()
-#         if fs is None:
-#             raise RuntimeError(
-#                 f"{self.__class__.__name__}: filesystem is required but not configured"
-#             )
-#         return fs
-#
-#     # ---------- Shared shutdown helpers (no logging; safe for late shutdown) ----------
-#     def _release_owned_fs(self) -> None:
-#         if self._owns_fs:
-#             # ensure creation state is respected even if never used
-#             _ = self.fs or None  # no-op; if never created, nothing to close
-#             if self.fs is not None:
-#                 close = getattr(self.fs, "close", None)
-#                 try:
-#                     if callable(close):
-#                         close()
-#                 finally:
-#                     self.fs = None
-#
-#     def _shutdown_logger(self) -> None:
-#         if self._owns_logger:
-#             try:
-#                 self.logger.shutdown()
-#             except Exception:
-#                 pass
-#
-#     def _shutdown_owned_resources(self) -> None:
-#         self._release_owned_fs()
-#         self._shutdown_logger()
-#
-#     # ---------- Public lifecycle (sync) ----------
-#     def close(self) -> None:
-#         with self._close_lock:
-#             if self._is_closed or self._closing:
-#                 return
-#             self._closing = True
-#
-#         try:
-#             self._cleanup()
-#         except Exception:
-#             # Only include traceback when debug=True
-#             if self._log_cleanup_errors:
-#                 try:
-#                     self.logger.error(
-#                         "Error during %s._cleanup()", self.__class__.__name__,
-#                         exc_info=self.debug
-#                     )
-#                 except Exception:
-#                     pass
-#             raise
-#         finally:
-#             with self._close_lock:
-#                 self._is_closed = True
-#                 self._closing = False
-#             self._shutdown_owned_resources()
-#             if self.debug:
-#                 try:
-#                     self.logger.debug("Component %s closed.", self.__class__.__name__)
-#                 except Exception:
-#                     pass
-#
-#     # ---------- Public lifecycle (async) ----------
-#     async def aclose(self) -> None:
-#         with self._close_lock:
-#             if self._is_closed or self._closing:
-#                 return
-#             self._closing = True
-#
-#         try:
-#             await self._acleanup()
-#         except Exception:
-#             # Only include traceback when debug=True
-#             if self._log_cleanup_errors:
-#                 try:
-#                     self.logger.error(
-#                         "Error during %s._acleanup()", self.__class__.__name__,
-#                         exc_info=self.debug
-#                     )
-#                 except Exception:
-#                     pass
-#             raise
-#         finally:
-#             with self._close_lock:
-#                 self._is_closed = True
-#                 self._closing = False
-#             self._shutdown_owned_resources()
-#             if self.debug:
-#                 try:
-#                     self.logger.debug("Async component %s closed.", self.__class__.__name__)
-#                 except Exception:
-#                     pass
-#
-#     # ---------- Context managers ----------
-#     def __enter__(self) -> Self:
-#         return self
-#
-#     def __exit__(self, exc_type, exc, tb) -> bool:
-#         self.close()
-#         return False  # propagate exceptions
-#
-#     async def __aenter__(self) -> Self:
-#         return self
-#
-#     async def __aexit__(self, exc_type, exc, tb) -> bool:
-#         await self.aclose()
-#         return False
-#
-#     # ---------- Finalizer ( at Garbage Collection-time absolutely silent) ----------
-#     @staticmethod
-#     def _finalize_static(ref: "weakref.ReferenceType[ManagedResource]") -> None:
-#         obj = ref()
-#         if obj is None:
-#             return
-#         # No logging here; interpreter may be tearing down.
-#         # Best-effort silent cleanup; avoid locks and context managers.
-#         try:
-#             if not obj._is_closed:
-#                 try:
-#                     obj._cleanup()
-#                 except Exception:
-#                     pass
-#                 obj._is_closed = True
-#             try:
-#                 obj._shutdown_owned_resources()
-#             except Exception:
-#                 pass
-#         except Exception:
-#             # do not show anything at garbage collection time
-#             pass
-#
sibi_dst/utils/boilerplate/__init__.py
CHANGED
@@ -2,10 +2,13 @@ from .base_parquet_artifact import BaseParquetArtifact
 from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
+from .hybrid_data_loader import HybridDataLoader
+
 __all__ = [
     "BaseDataCube",
     "BaseParquetArtifact",
     "make_attacher",
-    "BaseParquetReader"
+    "BaseParquetReader",
+    "HybridDataLoader",
 ]
 
sibi_dst/utils/boilerplate/hybrid_data_loader.py
ADDED
@@ -0,0 +1,144 @@
+import dask.dataframe as dd
+import datetime
+import pandas as pd
+from typing import Optional
+from sibi_dst.utils import Logger
+from sibi_dst.utils.dask_utils import dask_is_empty
+
+today = datetime.date.today()
+yesterday = today - datetime.timedelta(days=1)
+TODAY_STR = today.strftime('%Y-%m-%d')
+YESTERDAY_STR = yesterday.strftime('%Y-%m-%d')
+
+
+class HybridDataLoader:
+    """
+    A generic data loader that orchestrates loading from a historical
+    source and an optional live source.
+    """
+
+    def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
+        self.start_date = self._validate_date_format(start_date)
+        self.end_date = self._validate_date_format(end_date)
+        self.historical_reader = historical_reader
+        self.live_reader = live_reader
+        self.date_field = date_field
+
+        self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
+        self.debug = kwargs.get('debug', False)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+        # Validate date range
+        self._validate_date_range()
+
+        # Determine loading strategy
+        self._should_read_live = self.end_date == TODAY_STR
+        self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
+        self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)
+
+    def _validate_date_format(self, date_str: str) -> str:
+        """Validate that date string is in correct format."""
+        try:
+            datetime.datetime.strptime(date_str, '%Y-%m-%d')
+            return date_str
+        except ValueError:
+            raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
+
+    def _validate_date_range(self):
+        """Validate that start date is not after end date."""
+        start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
+        end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
+        if end < start:
+            raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")
+
+    def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
+        """Forces the historical dataframe schema to match the live one."""
+        self.logger.debug("Aligning historical schema to match live schema.")
+        historical_cols = set(historical_df.columns)
+        live_cols = set(live_df.columns)
+
+        # Add missing columns to historical dataframe
+        for col in live_cols - historical_cols:
+            historical_df[col] = None
+
+        # Reorder columns to match live dataframe
+        return historical_df[list(live_df.columns)]
+
+    def _create_empty_dataframe(self) -> dd.DataFrame:
+        """Create an empty dask dataframe with proper structure."""
+        return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+    async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
+        """Load today's data from the live reader."""
+        self.logger.debug(f"Loading today's live data...")
+        date_filter = {f"{self.date_field}__date": TODAY_STR}
+        filters = {**kwargs, **date_filter}
+
+        try:
+            today_df = await self.live_reader(
+                logger=self.logger,
+                debug=self.debug
+            ).aload(**filters)
+            return today_df
+        except Exception as e:
+            self.logger.error(f"Failed to load today's data: {e}")
+            if not self.debug:
+                return None
+            raise
+
+    async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
+        """Load historical data from the historical reader."""
+        self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
+
+        try:
+            return await self.historical_reader(
+                parquet_start_date=start_date,
+                parquet_end_date=end_date,
+                logger=self.logger,
+                debug=self.debug
+            ).aload(**kwargs)
+        except Exception as e:
+            self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
+            if not self.debug:
+                return self._create_empty_dataframe()
+            raise
+
+    async def aload(self, **kwargs) -> dd.DataFrame:
+        """
+        Loads data from the historical source and, if required, the live source,
+        then concatenates them.
+        """
+        # Case 1: Only today's data requested
+        if self._is_single_today:
+            today_df = await self._load_today_data(**kwargs)
+            return today_df if today_df is not None else self._create_empty_dataframe()
+
+        # Case 2: Pure historical data (end date is not today)
+        if not self._should_read_live:
+            return await self._load_historical_data(self.start_date, self.end_date, **kwargs)
+
+        # Case 3: Mixed historical + live scenario (end date is today)
+        # Load historical data up to yesterday
+        historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
+
+        # Load today's data
+        today_df = await self._load_today_data(**kwargs)
+
+        # Combine dataframes
+        if today_df is not None and not dask_is_empty(today_df):
+            # Align schemas if needed
+            if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
+                try:
+                    historical_df = self._align_schema_to_live(historical_df, today_df)
+                except Exception as e:
+                    self.logger.warning(f"Failed to align schemas: {e}")
+
+            return dd.concat([historical_df, today_df], ignore_index=True)
+        else:
+            return historical_df
+
+    def __repr__(self):
+        return (f"HybridDataLoader(start_date='{self.start_date}', "
+                f"end_date='{self.end_date}', "
+                f"loading_live={self._should_read_live})")
+
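HybridDataLoader chooses its strategy from the date range: live-only when start and end are both today, historical-only when the end date is in the past, and historical-up-to-yesterday plus live-for-today otherwise. A hedged usage sketch follows; the two reader classes are hypothetical placeholders, standing in for any classes whose instances accept the keyword arguments shown above and expose an async aload():

import asyncio
import datetime
from sibi_dst.utils.boilerplate import HybridDataLoader

loader = HybridDataLoader(
    start_date="2025-09-01",
    end_date=datetime.date.today().strftime("%Y-%m-%d"),  # ends today -> historical + live
    historical_reader=OrdersParquetReader,  # hypothetical parquet-backed reader
    live_reader=OrdersLiveReader,           # hypothetical live/ORM-backed reader
    date_field="order_date",                # hypothetical date column
    debug=True,
)
ddf = asyncio.run(loader.aload())           # extra keyword arguments are forwarded to the readers as filters
print(ddf.head())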
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
 import pandas as pd
 import dask.dataframe as dd
 import clickhouse_connect
+import numpy as np
 
 from . import ManagedResource
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
     - Optional overwrite (drop + recreate)
     - Partitioned, batched inserts
     - Per-thread clients to avoid session conflicts
+    - Proper PyArrow dtype handling
     """
 
     # Default dtype mapping (pandas/dask → ClickHouse)
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
            return
 
        # lazily fill missing values per-partition (no global compute)
-
+        # Use the new method that ensures correct types for ClickHouse
+        df = df.map_partitions(
+            type(self)._process_partition_for_clickhouse_compatible,
+            meta=df._meta
+        )
 
        # (re)create table
        ow = self.overwrite if overwrite is None else bool(overwrite)
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
            self.logger.info(f"Dropped table {self.table} (overwrite=True)")
 
-        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
        self._command(create_sql)
        self.logger.info(f"Ensured table {self.table} exists")
 
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
        return ", ".join(pieces)
 
    def _map_dtype(self, dtype: Any) -> str:
+        dtype_str = str(dtype).lower()
+        # Handle PyArrow dtypes
+        if "[pyarrow]" in dtype_str:
+            if "int64" in dtype_str:
+                return "Int64"
+            elif "int32" in dtype_str:
+                return "Int32"
+            elif "float64" in dtype_str or "double" in dtype_str:
+                return "Float64"
+            elif "float32" in dtype_str:
+                return "Float32"
+            elif "bool" in dtype_str:
+                return "UInt8"
+            elif "timestamp" in dtype_str:  # PyArrow timestamp
+                return "DateTime"
+            elif "string" in dtype_str:  # PyArrow string
+                return "String"
+            else:
+                return "String"  # fallback
+
        # Handle pandas extension dtypes explicitly
        if isinstance(dtype, pd.Int64Dtype):
            return "Int64"
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
            return "Float64"
        if isinstance(dtype, pd.StringDtype):
            return "String"
-        if "datetime64" in
+        if "datetime64" in dtype_str:
            return "DateTime"
 
        return self.DTYPE_MAP.get(str(dtype), "String")
 
    def _should_mark_nullable(self, dtype: Any) -> bool:
-
+        dtype_str = str(dtype).lower()
+        # PyArrow types are generally nullable, but let's be specific
+        if "[pyarrow]" in dtype_str:
+            # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
+            base_type = dtype_str.replace("[pyarrow]", "")
+            if base_type in ["string", "large_string"] or "timestamp" in base_type:
+                return True
+            # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
+            # Let's default to not nullable for numeric unless explicitly needed
+            return False  # Conservative for PyArrow numerics
+
        if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
            return True
-        if "datetime64" in
+        if "datetime64" in dtype_str:
            return True
        # object/category almost always nullable
-        if
+        if dtype_str in ("object", "category", "string"):
            return True
        return False
 
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
        # Ensure column ordering is stable
        cols = list(pdf.columns)
 
+        # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
+        # This is the key step to prevent the numpy.datetime64 error
+        pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
+
        # Split into batches (to avoid giant single insert)
        for start in range(0, len(pdf), self.insert_chunksize):
            batch = pdf.iloc[start:start + self.insert_chunksize]
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
    def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
        client = self._get_client()
        # clickhouse-connect supports insert_df
+        # The df passed here should now have compatible datetime types
        client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
 
-    # ------------- missing values (lazy) -------------
+    # ------------- missing values & type conversion (lazy) -------------
 
    @staticmethod
-    def
-
+    def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process a partition to fill missing values and ensure initial data types are consistent.
+        This is the first step of data preparation.
+        """
+        pdf = pdf.copy()  # Avoid modifying original
+
        for col in pdf.columns:
            s = pdf[col]
-
+            dtype_str = str(s.dtype).lower()
+
+            # --- Handle PyArrow dtypes ---
+            if "[pyarrow]" in dtype_str:
+                try:
+                    if "string" in dtype_str:
+                        # Convert PyArrow string to object, fillna with empty string
+                        pdf[col] = s.astype('object').fillna("")
+                    elif "timestamp" in dtype_str:
+                        # Convert PyArrow timestamp to pandas datetime, NaT for nulls
+                        pdf[col] = pd.to_datetime(s, errors='coerce')  # errors='coerce' handles conversion issues
+                    elif "int" in dtype_str:
+                        # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
+                        pdf[col] = s.fillna(0)
+                    elif "float" in dtype_str or "double" in dtype_str:
+                        pdf[col] = s.fillna(0.0)
+                    elif "bool" in dtype_str:
+                        pdf[col] = s.fillna(False)  # Or pd.NA if you prefer
+                    else:
+                        # Fallback: convert to object and then to string
+                        pdf[col] = s.astype('object').astype(str).fillna("")
+                except Exception as e:
+                    # If conversion fails, fall back to object and string
+                    pdf[col] = s.astype('object').astype(str).fillna("")
+
+            # --- Handle standard pandas dtypes ---
+            elif pd.api.types.is_integer_dtype(s.dtype):
                if pd.api.types.is_extension_array_dtype(s.dtype):
                    pdf[col] = s.fillna(pd.NA)
                else:
                    pdf[col] = s.fillna(0)
            elif pd.api.types.is_bool_dtype(s.dtype):
-                pdf[col] = s.fillna(pd.NA)
+                pdf[col] = s.fillna(pd.NA)  # Or False
            elif pd.api.types.is_float_dtype(s.dtype):
                pdf[col] = s.fillna(0.0)
            elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # Datetimes - leave as is for now, will be handled in final step
                pass
            else:
-
+                # For object/string/category columns, ensure they're strings
+                pdf[col] = s.astype(str).fillna("")
+
        return pdf
 
+    def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Final conversion step: Ensure datetime columns are in a format compatible
+        with clickhouse-connect driver. Specifically, convert numpy.datetime64 to
+        pandas.Timestamp or Python datetime objects.
+        This is called just before insertion.
+        """
+        df = df.copy()
+        for col in df.columns:
+            s = df[col]
+            # Check if the column is datetime-like
+            if pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # --- Robust conversion to ensure compatibility ---
+                # 1. Convert to pandas datetime explicitly
+                df[col] = pd.to_datetime(s, utc=True)  # Ensures timezone handling
+
+                # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
+                # This is often sufficient, but let's be extra sure about the object type
+                # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp())
+                # The pd.to_datetime should handle this, but accessing .dt accessor reinforces it.
+                # If there are still issues, we can force object conversion:
+                # df[col] = df[col].dt.to_pydatetime()  # Converts to numpy array of datetime64 or None
+                # But pd.Timestamp is better. Let's try accessing .dt to ensure it's proper:
+                try:
+                    _ = df[col].dt  # Accessing .dt confirms it's datetime-like
+                except:
+                    # If .dt fails, it means conversion wasn't clean, force it
+                    self.logger.debug(f"Forcing datetime conversion for column {col}")
+                    df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
+
+                # --- Final check and explicit conversion if needed ---
+                # If the error persists, we might need to explicitly convert the array elements.
+                # Let's add a check for the first non-null element in a sample:
+                sample_series = df[col].dropna()
+                if len(sample_series) > 0:
+                    first_val = sample_series.iloc[0]
+                    if isinstance(first_val, np.datetime64):
+                        self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
+                        # Force conversion to object array of pandas.Timestamp or None
+                        def convert_val(v):
+                            if pd.isna(v):
+                                return None
+                            if isinstance(v, np.datetime64):
+                                # Convert numpy.datetime64 to pandas.Timestamp
+                                return pd.Timestamp(v)
+                            return v
+                        df[col] = df[col].apply(convert_val)
+
+        return df
+
+
    # ------------- low-level helpers -------------
 
    def _get_client(self):
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
        finally:
            if hasattr(self._tlocal, "client"):
                delattr(self._tlocal, "client")
-
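The writer's extra datetime pass exists because, as the comments above note, the insert path expects pandas.Timestamp (or None) values rather than raw numpy.datetime64 objects. A small standalone illustration of the value-level fallback that convert_val() performs (illustrative only, not part of the package):

import numpy as np
import pandas as pd

raw = np.datetime64("2025-09-05T12:30:00")
ts = pd.Timestamp(raw)                     # pandas.Timestamp wraps the raw value and exposes .timestamp()
print(ts, ts.timestamp())
print(None if pd.isna(np.datetime64("NaT")) else "unreachable")   # NaT maps to None, as in convert_val()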
sibi_dst/utils/dask_utils.py
CHANGED
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
     k = min(max(sample, 1), ddf.npartitions)
     probes = dask.compute(*[
         ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ])
+    ], scheduler="threads")
 
     if any(_to_int_safe(n) > 0 for n in probes):
         return False
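Pinning scheduler="threads" keeps dask_is_empty's cheap per-partition length probes on the local thread pool even when another scheduler (for example a distributed client) has been registered as the global default. A small sketch of the same probe pattern outside the library:

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=2)
# Compute a handful of partition lengths without touching the global scheduler choice.
probes = dask.compute(
    *[ddf.get_partition(i).map_partitions(len) for i in range(ddf.npartitions)],
    scheduler="threads",
)
print(probes)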
{sibi_dst-2025.9.4.dist-info → sibi_dst-2025.9.6.dist-info}/RECORD
CHANGED
@@ -35,19 +35,21 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
 sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
 sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sibi_dst/tests/test_baseclass.py,sha256=5huAwjWo_SOEZR2_0y5w9qUmw5G7pVdm8X1OTG87JK0,11562
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
 sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
 sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
-sibi_dst/utils/base.py,sha256=
-sibi_dst/utils/boilerplate/__init__.py,sha256=
+sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
+sibi_dst/utils/boilerplate/__init__.py,sha256=zgkQ50-cKmRugOz1bHqhjVXb3Hb8rsIwN7d5-kVsRls,370
 sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
 sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
 sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=
+sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
-sibi_dst/utils/dask_utils.py,sha256=
+sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
 sibi_dst/utils/data_wrapper.py,sha256=axHOmCG9cBJgjf5m8jpzsCCZzXJgynGs44rGe6FUrzk,29906
@@ -91,6 +93,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.
-sibi_dst-2025.9.
-sibi_dst-2025.9.
+sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
+sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.6.dist-info/RECORD,,
{sibi_dst-2025.9.4.dist-info → sibi_dst-2025.9.6.dist-info}/WHEEL
File without changes