sibi-dst 2025.9.4.tar.gz → 2025.9.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/PKG-INFO +1 -1
  2. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/pyproject.toml +1 -1
  3. sibi_dst-2025.9.5/sibi_dst/tests/test_baseclass.py +403 -0
  4. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/base.py +0 -254
  5. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/__init__.py +4 -1
  6. sibi_dst-2025.9.5/sibi_dst/utils/boilerplate/hybrid_data_loader.py +144 -0
  7. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/README.md +0 -0
  8. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/__init__.py +0 -0
  9. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/__init__.py +0 -0
  10. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  11. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  12. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/_df_helper.py +0 -0
  13. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  14. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  15. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/__init__.py +0 -0
  16. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  20. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  21. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  22. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  23. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  24. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  25. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  26. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  27. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/core/__init__.py +0 -0
  28. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/core/_defaults.py +0 -0
  29. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  30. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/core/_params_config.py +0 -0
  31. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/core/_query_config.py +0 -0
  32. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/df_helper/data_cleaner.py +0 -0
  33. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/geopy_helper/__init__.py +0 -0
  34. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  35. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/geopy_helper/utils.py +0 -0
  36. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/__init__.py +0 -0
  37. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  38. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  39. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  40. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  41. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  42. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  43. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/osmnx_helper/utils.py +0 -0
  44. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/tests/__init__.py +0 -0
  45. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  46. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/__init__.py +0 -0
  47. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/async_utils.py +0 -0
  48. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  49. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  50. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  51. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  52. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/business_days.py +0 -0
  53. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/clickhouse_writer.py +0 -0
  54. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/credentials.py +0 -0
  55. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/dask_utils.py +0 -0
  56. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/data_from_http_source.py +0 -0
  57. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/data_utils.py +0 -0
  58. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/data_wrapper.py +0 -0
  59. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/date_utils.py +0 -0
  60. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/df_utils.py +0 -0
  61. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/file_age_checker.py +0 -0
  62. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/file_utils.py +0 -0
  63. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/filepath_generator.py +0 -0
  64. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/iceberg_saver.py +0 -0
  65. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/log_utils.py +0 -0
  66. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/manifest_manager.py +0 -0
  67. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/parquet_saver.py +0 -0
  68. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/periods.py +0 -0
  69. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/phone_formatter.py +0 -0
  70. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/progress/__init__.py +0 -0
  71. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/progress/jobs.py +0 -0
  72. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/progress/sse_runner.py +0 -0
  73. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/storage_config.py +0 -0
  74. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/storage_hive.py +0 -0
  75. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/storage_manager.py +0 -0
  76. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/update_planner.py +0 -0
  77. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/webdav_client.py +0 -0
  78. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/write_gatekeeper.py +0 -0
  79. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/__init__.py +0 -0
  80. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/__init__.py +0 -0
  81. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  82. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  83. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  84. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  85. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  86. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  87. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  88. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  89. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  90. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  91. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  92. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  93. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  94. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  95. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  96. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  97. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/utils/__init__.py +0 -0
  98. {sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 2025.9.4
+ Version: 2025.9.5
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
{sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "2025.9.4"
+ version = "2025.9.5"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
sibi_dst-2025.9.5/sibi_dst/tests/test_baseclass.py
@@ -0,0 +1,403 @@
+ import asyncio
+ import json
+ import threading
+ from typing import Any, Dict
+ from unittest.mock import MagicMock
+
+ import fsspec
+
+ from sibi_dst.utils import Logger
+ from sibi_dst.utils import ManagedResource
+ from sibi_dst.utils.base import _QueueSSE  # Replace 'your_module' with actual module name
+
+
+ # ------------------------------ Test Fixtures ------------------------------
+
+ class TestResource(ManagedResource):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.cleanup_called = False
+         self.acleanup_called = False
+
+     def _cleanup(self) -> None:
+         self.cleanup_called = True
+         super()._cleanup()
+
+     async def _acleanup(self) -> None:
+         self.acleanup_called = True
+         await super()._acleanup()
+
+
+ class MockSSESink:
+     def __init__(self):
+         self.events = []
+         self.closed = False
+
+     async def send(self, event: str, data: Dict[str, Any]) -> None:
+         self.events.append({"event": event, "data": data})
+
+     async def aclose(self) -> None:
+         self.closed = True
+
+
+ class MockSyncSSESink:
+     def __init__(self):
+         self.events = []
+         self.closed = False
+
+     def send(self, event: str, data: Dict[str, Any]) -> None:
+         self.events.append({"event": event, "data": data})
+
+     def close(self) -> None:
+         self.closed = True
+
+
+ # ------------------------------ Mock fsspec filesystem ------------------------------
+
+ class MockFileSystem(fsspec.AbstractFileSystem):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.closed = False
+
+     def close(self):
+         self.closed = True
+
+
+ # ------------------------------ Utility for Event Loop ------------------------------
+
+ def run_async_test(coro):
+     """Run async test safely in different environments."""
+     try:
+         # Try to get existing event loop (for Jupyter/IPython)
+         loop = asyncio.get_event_loop()
+         if loop.is_running():
+             # In Jupyter, create a new task
+             task = loop.create_task(coro)
+             return task
+         else:
+             return loop.run_until_complete(coro)
+     except RuntimeError:
+         # No event loop running, use asyncio.run()
+         return asyncio.run(coro)
+
+
+ # ------------------------------ Lifecycle Tests ------------------------------
+
+ def test_double_close_no_error():
+     """Test that calling close() multiple times doesn't raise errors."""
+     resource = TestResource()
+     resource.close()
+     resource.close()  # Should not raise
+     assert resource.closed
+
+
+ def test_double_aclose_no_error():
+     """Test that calling aclose() multiple times doesn't raise errors."""
+     async def test():
+         resource = TestResource()
+         await resource.aclose()
+         await resource.aclose()  # Should not raise
+         assert resource.closed
+
+     run_async_test(test())
+
+
+ def test_context_manager_sync():
+     """Test sync context manager behavior."""
+     with TestResource() as resource:
+         assert not resource.closed
+     assert resource.closed
+     assert resource.cleanup_called
+
+
+ def test_context_manager_async():
+     """Test async context manager behavior."""
+     async def test():
+         async with TestResource() as resource:
+             assert not resource.closed
+         assert resource.closed
+         assert resource.acleanup_called
+
+     run_async_test(test())
+
+
+ # ------------------------------ SSE Emission Tests ------------------------------
+
+ def test_auto_sse_creation():
+     """Test automatic SSE creation when auto_sse=True."""
+     resource = TestResource(auto_sse=True)
+     sse = resource.get_sse()
+     assert sse is not None
+     assert isinstance(sse, _QueueSSE)
+     assert resource._owns_sse
+
+
+ def test_sse_emission_with_async_sink():
+     """Test SSE emission with async send method."""
+     async def test():
+         sink = MockSSESink()
+         resource = TestResource(sse=sink)
+
+         await resource.emit("test_event", key="value")
+
+         assert len(sink.events) == 1
+         assert sink.events[0]["event"] == "test_event"
+         assert sink.events[0]["data"] == {"key": "value"}
+
+     run_async_test(test())
+
+
+ def test_sse_emission_with_sync_sink():
+     """Test SSE emission with sync send method wrapped in async."""
+     sink = MockSyncSSESink()
+     resource = TestResource(sse=sink)
+
+     async def test():
+         await resource.emit("test_event", key="value")
+
+         assert len(sink.events) == 1
+         assert sink.events[0]["event"] == "test_event"
+         assert sink.events[0]["data"] == {"key": "value"}
+
+     run_async_test(test())
+
+
+ def test_sse_put_method_support():
+     """Test SSE emission with put method."""
+     class PutSink:
+         def __init__(self):
+             self.items = []
+
+         async def put(self, item: Dict[str, Any]) -> None:
+             self.items.append(item)
+
+     async def test():
+         sink = PutSink()
+         resource = TestResource(sse=sink)
+
+         await resource.emit("test_event", key="value")
+
+         assert len(sink.items) == 1
+         item = sink.items[0]
+         assert item["event"] == "test_event"
+         assert json.loads(item["data"]) == {"key": "value"}
+
+     run_async_test(test())
+
+
+ def test_sse_no_emitter_no_error():
+     """Test that emit on resource without emitter doesn't raise."""
+     resource = TestResource()
+     # Should not raise error
+     async def test():
+         await resource.emit("test_event", key="value")
+
+     run_async_test(test())
+
+
+ def test_sse_emission_after_close():
+     """Test that emit after close is no-op."""
+     async def test():
+         sink = MockSSESink()
+         resource = TestResource(sse=sink)
+
+         await resource.aclose()
+         await resource.emit("test_event", key="value")  # Should not raise
+
+         assert len(sink.events) == 0
+
+     run_async_test(test())
+
+
+ # ------------------------------ Cleanup Interplay Tests ------------------------------
+
+ def test_sync_cleanup_called_on_sync_close():
+     """Test that sync cleanup is called during sync close."""
+     resource = TestResource()
+     resource.close()
+     assert resource.cleanup_called
+     assert not resource.acleanup_called
+
+
+ def test_async_cleanup_called_on_async_close():
+     """Test that async cleanup is called during async close."""
+     async def test():
+         resource = TestResource()
+         await resource.aclose()
+         assert resource.acleanup_called
+         assert not resource.cleanup_called
+
+     run_async_test(test())
+
+
+ # ------------------------------ Logger Tests ------------------------------
+
+ def test_logger_ownership():
+     """Test that logger is owned when not provided externally."""
+     resource = TestResource()
+     assert resource._owns_logger
+     assert resource.logger is not None
+
+
+ def test_external_logger_not_owned():
+     """Test that external logger is not owned."""
+     external_logger = Logger.default_logger("test")
+     resource = TestResource(logger=external_logger)
+     assert not resource._owns_logger
+     assert resource.logger is external_logger
+
+
+ def test_logger_level_configuration():
+     """Test logger level configuration based on verbose/debug flags."""
+     # Default (warning level)
+     resource = TestResource()
+     assert hasattr(resource.logger, 'level')
+
+     # Verbose (info level)
+     resource = TestResource(verbose=True)
+     assert hasattr(resource.logger, 'level')
+
+     # Debug (debug level)
+     resource = TestResource(debug=True)
+     assert hasattr(resource.logger, 'level')
+
+
+ # ------------------------------ Lazy Instantiation Tests ------------------------------
+
+ def test_lazy_fs_instantiation():
+     """Test lazy filesystem instantiation via factory."""
+     fs_instance = MockFileSystem()
+     factory_called = [False]
+
+     def fs_factory():
+         factory_called[0] = True
+         return fs_instance
+
+     resource = TestResource(fs_factory=fs_factory)
+     assert not factory_called[0]  # Not called yet
+
+     fs = resource._ensure_fs()
+     assert factory_called[0]
+     assert fs is fs_instance
+     assert resource.fs is fs_instance
+
+
+ def test_lazy_sse_instantiation():
+     """Test lazy SSE instantiation via factory."""
+     sink_instance = MockSSESink()
+     factory_called = [False]
+
+     def sse_factory():
+         factory_called[0] = True
+         return sink_instance
+
+     resource = TestResource(sse_factory=sse_factory)
+     assert not factory_called[0]  # Not called yet
+
+     sse = resource._ensure_sse()
+     assert factory_called[0]
+     assert sse is sink_instance
+     assert resource._sse is sink_instance
+
+
+ def test_lazy_fs_not_called_if_fs_provided():
+     """Test that factory is not called if fs is provided directly."""
+     fs_instance = MockFileSystem()
+     factory = MagicMock()
+
+     resource = TestResource(fs=fs_instance, fs_factory=factory)
+     fs = resource._ensure_fs()
+
+     assert fs is fs_instance
+     factory.assert_not_called()
+
+
+ def test_lazy_sse_not_called_if_sse_provided():
+     """Test that factory is not called if sse is provided directly."""
+     sink_instance = MockSSESink()
+     factory = MagicMock()
+
+     resource = TestResource(sse=sink_instance, sse_factory=factory)
+     sse = resource._ensure_sse()
+
+     assert sse is sink_instance
+     factory.assert_not_called()
+
+
+ # ------------------------------ Thread Safety Tests ------------------------------
+
+ def test_thread_safe_close():
+     """Test that close operations are thread-safe."""
+     resource = TestResource()
+
+     results = []
+     errors = []
+
+     def close_resource():
+         try:
+             resource.close()
+             results.append("success")
+         except Exception as e:
+             errors.append(str(e))
+             results.append(f"error: {e}")
+
+     # Start multiple threads trying to close simultaneously
+     threads = [threading.Thread(target=close_resource) for _ in range(5)]
+     for t in threads:
+         t.start()
+     for t in threads:
+         t.join()
+
+     # Debug information
+     print(f"Results: {results}")
+     print(f"Errors: {errors}")
+     print(f"Resource closed: {resource.closed}")
+
+     # Should have at least one success (the first one) and no exceptions
+     success_count = results.count("success")
+     error_count = len([r for r in results if r.startswith("error")])
+
+     # At least one should succeed
+     assert success_count >= 1, f"Expected at least 1 success, got {success_count}"
+     # No errors should occur
+     assert error_count == 0, f"Expected 0 errors, got {error_count}"
+     # Resource should be closed
+     assert resource.closed, "Resource should be closed"
+
+
+ # ------------------------------ Individual Test Functions ------------------------------
+
+ # You can now run individual tests like this:
+ if __name__ == "__main__":
+     # Run individual tests
+     test_double_close_no_error()
+     print("✓ test_double_close_no_error passed")
+
+     test_sync_cleanup_called_on_sync_close()
+     print("✓ test_sync_cleanup_called_on_sync_close passed")
+
+     test_logger_ownership()
+     print("✓ test_logger_ownership passed")
+
+     test_external_logger_not_owned()
+     print("✓ test_external_logger_not_owned passed")
+
+     test_lazy_fs_instantiation()
+     print("✓ test_lazy_fs_instantiation passed")
+
+     test_lazy_sse_instantiation()
+     print("✓ test_lazy_sse_instantiation passed")
+
+     test_lazy_fs_not_called_if_fs_provided()
+     print("✓ test_lazy_fs_not_called_if_fs_provided passed")
+
+     test_lazy_sse_not_called_if_sse_provided()
+     print("✓ test_lazy_sse_not_called_if_sse_provided passed")
+
+     test_thread_safe_close()
+     print("✓ test_thread_safe_close passed")
+
+     test_auto_sse_creation()
+     print("✓ test_auto_sse_creation passed")
+
+     print("All tests completed!")
{sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/base.py
@@ -441,257 +441,3 @@ class ManagedResource(abc.ABC):
  except Exception:
  pass

- ## Before SSE handling
-
- # import abc
- # import threading
- # import weakref
- # from typing import Self, Optional, Callable
- #
- # import fsspec
- #
- # from sibi_dst.utils import Logger
- #
- #
- # class ManagedResource(abc.ABC):
- #     """
- #     Boilerplate ABC for components that manage a logger and an optional fsspec filesystem,
- #     with sync/async lifecycle helpers, lazy FS creation via an optional factory, and
- #     configurable cleanup-error logging.
- #     """
- #
- #     def __init__(
- #         self,
- #         *,
- #         verbose: bool = False,
- #         debug: bool = False,
- #         log_cleanup_errors: bool = True,
- #         logger: Optional[Logger] = None,
- #         fs: Optional[fsspec.AbstractFileSystem] = None,
- #         fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None,
- #         **_: object,
- #     ) -> None:
- #         # ---- Declared upfront for type checkers
- #         self.logger: Logger
- #         self.fs: Optional[fsspec.AbstractFileSystem] = None
- #         self._fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None
- #         self._owns_logger: bool = False
- #         self._owns_fs: bool = False
- #         self._is_closed: bool = False
- #         self._closing: bool = False
- #         self._close_lock = threading.RLock()
- #
- #         self.verbose = verbose
- #         self.debug = debug
- #         self._log_cleanup_errors = log_cleanup_errors
- #
- #         # ---- Logger ownership
- #         if logger is None:
- #             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
- #             self._owns_logger = True
- #             level = Logger.DEBUG if self.debug else (Logger.INFO if self.verbose else Logger.WARNING)
- #             self.logger.set_level(level)
- #         else:
- #             self.logger = logger
- #             self._owns_logger = False  # do not mutate external logger
- #
- #         # ---- FS ownership & lazy creation
- #         if fs is not None:
- #             self.fs = fs
- #             self._owns_fs = False
- #             self._fs_factory = None
- #         elif fs_factory is not None:
- #             # Lazy: don't create until first use
- #             self._fs_factory = fs_factory
- #             self._owns_fs = True  # we will own it *if* created
- #             self.fs = None
- #         else:
- #             self.fs = None
- #             self._owns_fs = False
- #             self._fs_factory = None
- #
- #         # Register a GC-time finalizer that does not capture self
- #         self_ref = weakref.ref(self)
- #         self._finalizer = weakref.finalize(self, self._finalize_static, self_ref)
- #
- #         if self.debug:
- #             try:
- #                 self.logger.debug("Component %s initialized. %s", self.__class__.__name__, repr(self))
- #             except Exception:
- #                 pass
- #
- #     # ---------- Introspection ----------
- #     @property
- #     def is_closed(self) -> bool:
- #         return self._is_closed
- #
- #     @property
- #     def closed(self) -> bool:  # alias
- #         return self._is_closed
- #
- #     def __repr__(self) -> str:
- #         class_name = self.__class__.__name__
- #         logger_status = "own" if self._owns_logger else "external"
- #         if self.fs is None and self._fs_factory is not None:
- #             fs_status = "own(lazy)"
- #         elif self.fs is None:
- #             fs_status = "none"
- #         else:
- #             fs_status = "own" if self._owns_fs else "external"
- #         return (f"<{class_name} debug={self.debug} verbose={self.verbose} "
- #                 f"log_cleanup_errors={self._log_cleanup_errors} "
- #                 f"logger={logger_status} fs={fs_status}>")
- #
- #     # ---------- Subclass hooks ----------
- #     def _cleanup(self) -> None:
- #         """Sync cleanup for resources created BY THE SUBCLASS."""
- #         return
- #
- #     async def _acleanup(self) -> None:
- #         """Async cleanup for resources created BY THE SUBCLASS."""
- #         return
- #
- #     # ---------- FS helpers ----------
- #     def _ensure_fs(self) -> Optional[fsspec.AbstractFileSystem]:
- #         """Create the FS lazily if a factory was provided. Return fs (or None)."""
- #         if self.fs is None and self._fs_factory is not None:
- #             created = self._fs_factory()
- #             if not isinstance(created, fsspec.AbstractFileSystem):
- #                 raise TypeError(f"fs_factory() must return fsspec.AbstractFileSystem, got {type(created)!r}")
- #             self.fs = created
- #             # _owns_fs already True when factory is present
- #         return self.fs
- #
- #     def require_fs(self) -> fsspec.AbstractFileSystem:
- #         """Return a filesystem or raise if not configured/creatable."""
- #         fs = self._ensure_fs()
- #         if fs is None:
- #             raise RuntimeError(
- #                 f"{self.__class__.__name__}: filesystem is required but not configured"
- #             )
- #         return fs
- #
- #     # ---------- Shared shutdown helpers (no logging; safe for late shutdown) ----------
- #     def _release_owned_fs(self) -> None:
- #         if self._owns_fs:
- #             # ensure creation state is respected even if never used
- #             _ = self.fs or None  # no-op; if never created, nothing to close
- #             if self.fs is not None:
- #                 close = getattr(self.fs, "close", None)
- #                 try:
- #                     if callable(close):
- #                         close()
- #                 finally:
- #                     self.fs = None
- #
- #     def _shutdown_logger(self) -> None:
- #         if self._owns_logger:
- #             try:
- #                 self.logger.shutdown()
- #             except Exception:
- #                 pass
- #
- #     def _shutdown_owned_resources(self) -> None:
- #         self._release_owned_fs()
- #         self._shutdown_logger()
- #
- #     # ---------- Public lifecycle (sync) ----------
- #     def close(self) -> None:
- #         with self._close_lock:
- #             if self._is_closed or self._closing:
- #                 return
- #             self._closing = True
- #
- #         try:
- #             self._cleanup()
- #         except Exception:
- #             # Only include traceback when debug=True
- #             if self._log_cleanup_errors:
- #                 try:
- #                     self.logger.error(
- #                         "Error during %s._cleanup()", self.__class__.__name__,
- #                         exc_info=self.debug
- #                     )
- #                 except Exception:
- #                     pass
- #             raise
- #         finally:
- #             with self._close_lock:
- #                 self._is_closed = True
- #                 self._closing = False
- #             self._shutdown_owned_resources()
- #             if self.debug:
- #                 try:
- #                     self.logger.debug("Component %s closed.", self.__class__.__name__)
- #                 except Exception:
- #                     pass
- #
- #     # ---------- Public lifecycle (async) ----------
- #     async def aclose(self) -> None:
- #         with self._close_lock:
- #             if self._is_closed or self._closing:
- #                 return
- #             self._closing = True
- #
- #         try:
- #             await self._acleanup()
- #         except Exception:
- #             # Only include traceback when debug=True
- #             if self._log_cleanup_errors:
- #                 try:
- #                     self.logger.error(
- #                         "Error during %s._acleanup()", self.__class__.__name__,
- #                         exc_info=self.debug
- #                     )
- #                 except Exception:
- #                     pass
- #             raise
- #         finally:
- #             with self._close_lock:
- #                 self._is_closed = True
- #                 self._closing = False
- #             self._shutdown_owned_resources()
- #             if self.debug:
- #                 try:
- #                     self.logger.debug("Async component %s closed.", self.__class__.__name__)
- #                 except Exception:
- #                     pass
- #
- #     # ---------- Context managers ----------
- #     def __enter__(self) -> Self:
- #         return self
- #
- #     def __exit__(self, exc_type, exc, tb) -> bool:
- #         self.close()
- #         return False  # propagate exceptions
- #
- #     async def __aenter__(self) -> Self:
- #         return self
- #
- #     async def __aexit__(self, exc_type, exc, tb) -> bool:
- #         await self.aclose()
- #         return False
- #
- #     # ---------- Finalizer ( at Garbage Collection-time absolutely silent) ----------
- #     @staticmethod
- #     def _finalize_static(ref: "weakref.ReferenceType[ManagedResource]") -> None:
- #         obj = ref()
- #         if obj is None:
- #             return
- #         # No logging here; interpreter may be tearing down.
- #         # Best-effort silent cleanup; avoid locks and context managers.
- #         try:
- #             if not obj._is_closed:
- #                 try:
- #                     obj._cleanup()
- #                 except Exception:
- #                     pass
- #                 obj._is_closed = True
- #                 try:
- #                     obj._shutdown_owned_resources()
- #                 except Exception:
- #                     pass
- #         except Exception:
- #             # do not show anything at garbage collection time
- #             pass
- #
{sibi_dst-2025.9.4 → sibi_dst-2025.9.5}/sibi_dst/utils/boilerplate/__init__.py
@@ -2,10 +2,13 @@ from .base_parquet_artifact import BaseParquetArtifact
  from .base_data_cube import BaseDataCube
  from .base_attacher import make_attacher
  from .base_parquet_reader import BaseParquetReader
+ from .hybrid_data_loader import HybridDataLoader
+
  __all__ = [
      "BaseDataCube",
      "BaseParquetArtifact",
      "make_attacher",
-     "BaseParquetReader"
+     "BaseParquetReader",
+     "HybridDataLoader",
  ]

sibi_dst-2025.9.5/sibi_dst/utils/boilerplate/hybrid_data_loader.py
@@ -0,0 +1,144 @@
+ import dask.dataframe as dd
+ import datetime
+ import pandas as pd
+ from typing import Optional
+ from sibi_dst.utils import Logger
+ from sibi_dst.utils.dask_utils import dask_is_empty
+
+ today = datetime.date.today()
+ yesterday = today - datetime.timedelta(days=1)
+ TODAY_STR = today.strftime('%Y-%m-%d')
+ YESTERDAY_STR = yesterday.strftime('%Y-%m-%d')
+
+
+ class HybridDataLoader:
+     """
+     A generic data loader that orchestrates loading from a historical
+     source and an optional live source.
+     """
+
+     def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
+         self.start_date = self._validate_date_format(start_date)
+         self.end_date = self._validate_date_format(end_date)
+         self.historical_reader = historical_reader
+         self.live_reader = live_reader
+         self.date_field = date_field
+
+         self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
+         self.debug = kwargs.get('debug', False)
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+         # Validate date range
+         self._validate_date_range()
+
+         # Determine loading strategy
+         self._should_read_live = self.end_date == TODAY_STR
+         self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
+         self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)
+
+     def _validate_date_format(self, date_str: str) -> str:
+         """Validate that date string is in correct format."""
+         try:
+             datetime.datetime.strptime(date_str, '%Y-%m-%d')
+             return date_str
+         except ValueError:
+             raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
+
+     def _validate_date_range(self):
+         """Validate that start date is not after end date."""
+         start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
+         end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
+         if end < start:
+             raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")
+
+     def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
+         """Forces the historical dataframe schema to match the live one."""
+         self.logger.debug("Aligning historical schema to match live schema.")
+         historical_cols = set(historical_df.columns)
+         live_cols = set(live_df.columns)
+
+         # Add missing columns to historical dataframe
+         for col in live_cols - historical_cols:
+             historical_df[col] = None
+
+         # Reorder columns to match live dataframe
+         return historical_df[list(live_df.columns)]
+
+     def _create_empty_dataframe(self) -> dd.DataFrame:
+         """Create an empty dask dataframe with proper structure."""
+         return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+     async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
+         """Load today's data from the live reader."""
+         self.logger.debug(f"Loading today's live data...")
+         date_filter = {f"{self.date_field}__date": TODAY_STR}
+         filters = {**kwargs, **date_filter}
+
+         try:
+             today_df = await self.live_reader(
+                 logger=self.logger,
+                 debug=self.debug
+             ).aload(**filters)
+             return today_df
+         except Exception as e:
+             self.logger.error(f"Failed to load today's data: {e}")
+             if not self.debug:
+                 return None
+             raise
+
+     async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
+         """Load historical data from the historical reader."""
+         self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
+
+         try:
+             return await self.historical_reader(
+                 parquet_start_date=start_date,
+                 parquet_end_date=end_date,
+                 logger=self.logger,
+                 debug=self.debug
+             ).aload(**kwargs)
+         except Exception as e:
+             self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
+             if not self.debug:
+                 return self._create_empty_dataframe()
+             raise
+
+     async def aload(self, **kwargs) -> dd.DataFrame:
+         """
+         Loads data from the historical source and, if required, the live source,
+         then concatenates them.
+         """
+         # Case 1: Only today's data requested
+         if self._is_single_today:
+             today_df = await self._load_today_data(**kwargs)
+             return today_df if today_df is not None else self._create_empty_dataframe()
+
+         # Case 2: Pure historical data (end date is not today)
+         if not self._should_read_live:
+             return await self._load_historical_data(self.start_date, self.end_date, **kwargs)
+
+         # Case 3: Mixed historical + live scenario (end date is today)
+         # Load historical data up to yesterday
+         historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
+
+         # Load today's data
+         today_df = await self._load_today_data(**kwargs)
+
+         # Combine dataframes
+         if today_df is not None and not dask_is_empty(today_df):
+             # Align schemas if needed
+             if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
+                 try:
+                     historical_df = self._align_schema_to_live(historical_df, today_df)
+                 except Exception as e:
+                     self.logger.warning(f"Failed to align schemas: {e}")
+
+             return dd.concat([historical_df, today_df], ignore_index=True)
+         else:
+             return historical_df
+
+     def __repr__(self):
+         return (f"HybridDataLoader(start_date='{self.start_date}', "
+                 f"end_date='{self.end_date}', "
+                 f"loading_live={self._should_read_live})")
+
File without changes
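Note: the new HybridDataLoader (re-exported from sibi_dst.utils.boilerplate) composes a historical reader, constructed with parquet_start_date/parquet_end_date, and a live reader, both expected to expose an async aload(). When the requested end date is today it loads history up to yesterday, pulls today from the live source filtered on <date_field>__date, aligns schemas, and concatenates the two frames. The following is a minimal usage sketch with hypothetical in-memory readers; the Fake* classes are illustrative stand-ins, not part of sibi-dst, where the historical reader would presumably be a parquet-backed reader such as a BaseParquetReader subclass.

import asyncio
import datetime

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.boilerplate import HybridDataLoader

TODAY = datetime.date.today().strftime("%Y-%m-%d")


class FakeHistoricalReader:
    """Hypothetical stand-in for a parquet-backed historical reader."""

    def __init__(self, parquet_start_date: str, parquet_end_date: str, **kwargs):
        self.start, self.end = parquet_start_date, parquet_end_date

    async def aload(self, **filters) -> dd.DataFrame:
        pdf = pd.DataFrame({"order_date": [self.start], "total": [10.0]})
        return dd.from_pandas(pdf, npartitions=1)


class FakeLiveReader:
    """Hypothetical stand-in for a live (database or API) reader."""

    def __init__(self, **kwargs):
        pass

    async def aload(self, **filters) -> dd.DataFrame:
        # Receives the injected filter, e.g. {"order_date__date": TODAY}.
        pdf = pd.DataFrame({"order_date": [TODAY], "total": [5.0]})
        return dd.from_pandas(pdf, npartitions=1)


async def main() -> None:
    loader = HybridDataLoader(
        start_date="2025-01-01",
        end_date=TODAY,                      # end date == today: historical + live
        historical_reader=FakeHistoricalReader,
        live_reader=FakeLiveReader,
        date_field="order_date",
        debug=True,
    )
    df = await loader.aload()                # extra kwargs are passed through as filters
    print(df.compute())


if __name__ == "__main__":
    asyncio.run(main())

If the end date is earlier than today only the historical reader is used, and a range of exactly today is served by the live reader alone.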