sibi-dst 2025.8.7__tar.gz → 2025.8.8__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (98)
  1. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/PKG-INFO +3 -2
  2. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/pyproject.toml +3 -2
  3. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_df_helper.py +105 -89
  4. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_artifact.py +11 -10
  5. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_reader.py +4 -0
  6. sibi_dst-2025.8.8/sibi_dst/df_helper/backends/parquet/_parquet_options.py +565 -0
  7. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
  8. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
  9. sibi_dst-2025.8.8/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +56 -0
  10. sibi_dst-2025.8.8/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +50 -0
  11. sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/__init__.py +6 -0
  12. sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
  13. sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
  14. sibi_dst-2025.8.8/sibi_dst/utils/data_wrapper.py +277 -0
  15. sibi_dst-2025.8.8/sibi_dst/utils/iceberg_saver.py +126 -0
  16. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/log_utils.py +0 -346
  17. sibi_dst-2025.8.8/sibi_dst/utils/parquet_saver.py +224 -0
  18. sibi_dst-2025.8.8/sibi_dst/utils/progress/__init__.py +5 -0
  19. sibi_dst-2025.8.8/sibi_dst/utils/progress/jobs.py +82 -0
  20. sibi_dst-2025.8.8/sibi_dst/utils/progress/sse_runner.py +82 -0
  21. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_hive.py +38 -1
  22. sibi_dst-2025.8.8/sibi_dst/utils/update_planner.py +801 -0
  23. sibi_dst-2025.8.7/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -275
  24. sibi_dst-2025.8.7/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -128
  25. sibi_dst-2025.8.7/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -154
  26. sibi_dst-2025.8.7/sibi_dst/utils/data_wrapper.py +0 -518
  27. sibi_dst-2025.8.7/sibi_dst/utils/parquet_saver.py +0 -123
  28. sibi_dst-2025.8.7/sibi_dst/utils/update_planner.py +0 -300
  29. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/README.md +0 -0
  30. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/__init__.py +0 -0
  31. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/__init__.py +0 -0
  32. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  33. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  34. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/__init__.py +0 -0
  35. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  36. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  37. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  38. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  39. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  40. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  41. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/__init__.py +0 -0
  42. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_defaults.py +0 -0
  43. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  44. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_params_config.py +0 -0
  45. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_query_config.py +0 -0
  46. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/data_cleaner.py +0 -0
  47. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/__init__.py +0 -0
  48. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  49. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/utils.py +0 -0
  50. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/__init__.py +0 -0
  51. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  52. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  53. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  54. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  55. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  56. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  57. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/utils.py +0 -0
  58. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/tests/__init__.py +0 -0
  59. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  60. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/__init__.py +0 -0
  61. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/async_utils.py +0 -0
  62. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/base.py +0 -0
  63. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/business_days.py +0 -0
  64. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/clickhouse_writer.py +0 -0
  65. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/credentials.py +0 -0
  66. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/data_from_http_source.py +0 -0
  67. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/data_utils.py +0 -0
  68. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/date_utils.py +0 -0
  69. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/df_utils.py +0 -0
  70. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/file_age_checker.py +0 -0
  71. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/file_utils.py +0 -0
  72. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/filepath_generator.py +0 -0
  73. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/manifest_manager.py +0 -0
  74. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/periods.py +0 -0
  75. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/phone_formatter.py +0 -0
  76. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_config.py +0 -0
  77. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_manager.py +0 -0
  78. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/webdav_client.py +0 -0
  79. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/__init__.py +0 -0
  80. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/__init__.py +0 -0
  81. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  82. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  83. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  84. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  85. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  86. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  87. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  88. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  89. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  90. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  91. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  92. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  93. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  94. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  95. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  96. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  97. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/utils/__init__.py +0 -0
  98. {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/utils/log_utils.py +0 -0

{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.7
+Version: 2025.8.8
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -19,10 +19,11 @@ Requires-Dist: pandas (>=2.3.1,<3.0.0)
 Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
-Requires-Dist: rich (>=14.0.0,<15.0.0)
 Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
 Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
+Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
 Requires-Dist: tqdm (>=4.67.1,<5.0.0)
 Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
 Description-Content-Type: text/markdown

{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.7"
+version = "2025.8.8"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -21,9 +21,10 @@ pydantic = "^2.11.7"
 sqlalchemy = "^2.0.41"
 pymysql = "^1.1.1"
 pyarrow = "^20.0.0"
-rich = "^14.0.0"
 opentelemetry-exporter-otlp = "^1.35.0"
 opentelemetry-sdk = "^1.35.0"
+pyiceberg = {extras = ["hive", "s3fs"], version = "^0.9.1"}
+sse-starlette = "^3.0.2"

 [tool.poetry.group.dev]
 optional = true

{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_df_helper.py
@@ -16,6 +16,15 @@ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromD

 T = TypeVar("T", bound=BaseModel)

+def _is_dask_df(x) -> bool:
+    return isinstance(x, dd.DataFrame)
+
+def _maybe_persist(df, persist: bool):
+    return df.persist() if persist and _is_dask_df(df) else df
+
+def _maybe_compute(df, as_pandas: bool):
+    return df.compute() if as_pandas and _is_dask_df(df) else df
+

 # ---- Backend Strategy Pattern ----
 class BaseBackend:
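
For illustration, a minimal sketch of how these new module-level helpers behave, assuming the private functions are importable from sibi_dst.df_helper._df_helper (internal API, used here only to clarify the persist/compute switch):

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.df_helper._df_helper import _maybe_compute, _maybe_persist  # assumed internal path

    pdf = pd.DataFrame({"x": range(4)})
    ddf = dd.from_pandas(pdf, npartitions=2)

    # Only Dask frames are persisted or computed; pandas objects pass through untouched.
    assert _maybe_persist(pdf, persist=True) is pdf
    assert isinstance(_maybe_compute(ddf, as_pandas=True), pd.DataFrame)
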
@@ -23,13 +32,13 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
-        self.total_records = helper.total_records
+        self.total_records = -1

     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
         raise NotImplementedError

     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-        return self.load(**options)
+        return await asyncio.to_thread(self.load, **options)


 class SqlAlchemyBackend(BaseBackend):
@@ -48,7 +57,7 @@ class SqlAlchemyBackend(BaseBackend):
             self.total_records, result = db_loader.build_and_load()
             return self.total_records, result
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


@@ -56,53 +65,57 @@ class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files(**options)
-            if self._is_empty(df):
-                return -1, self._empty_like(df)
-            nrows = self._row_count(df)
-            if nrows == 0:
-                self.logger.debug("No records after filters; returning empty DataFrame.")
+            if not self.helper._has_any_rows(df):
+                self.total_records = 0
                 return 0, self._empty_like(df)

-            df = df.persist()
-            self.total_records = self._row_count(df) or -1
+            # Let DfHelper decide about persist
+            self.total_records = -1  # unknown without full count
             return self.total_records, df

         except Exception as e:
             self.total_records = -1  # Reset total_records on failure
-            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

-    def _is_empty(self, ddf) -> bool:
-        """True if no rows across all partitions."""
-        try:
-            # head with npartitions=-1 walks partitions until it gets n rows
-            return ddf.head(1, npartitions=-1).shape[0] == 0
-        except Exception:
-            return True
-
-    def _row_count(self, ddf) -> int:
-        """Reliable row count for Dask DataFrame."""
-        return int(ddf.map_partitions(len).sum().compute())
-
-    def _empty_like(self, ddf):
-        """Return an empty Dask DF with the SAME columns/dtypes."""
+    @staticmethod
+    def _empty_like(ddf):
         empty_pdf = ddf._meta.iloc[0:0]
         return dd.from_pandas(empty_pdf, npartitions=1)


 class HttpBackend(BaseBackend):
     def load(self, **options):
-        # Will raise NotImplementedError from helper.backend_http if sync not supported
-        return self.helper.backend_http.fetch_data(**options)
+        # Avoid event-loop problems in sync code paths.
+        # If someone calls .load() on an async backend, make it explicit.
+        raise RuntimeError(
+            "HttpBackend.load() is sync but this backend is async-only. "
+            "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
+        )

     async def aload(self, **options):
         if not self.helper.backend_http:
-            self.logger.warning("HTTP plugin not configured properly.")
+            self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
             self.total_records = -1
             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
         result = await self.helper.backend_http.fetch_data(**options)
-        self.total_records = len(result)
-        return self.total_records, result
+
+        # Normalize to DataFrame if the plugin returns list/dict
+        if isinstance(result, (list, dict)):
+            pdf = pd.DataFrame(result)
+            ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
+            self.total_records = len(pdf)
+            return self.total_records, ddf
+
+        if isinstance(result, pd.DataFrame):
+            self.total_records = len(result)
+            ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
+            return self.total_records, ddf
+
+        # Fallback
+        self.total_records = -1
+        return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class DfHelper(ManagedResource):
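
Because the HTTP backend is now async-only, a hedged sketch of the intended call path follows; the import path and the helper's construction are assumptions, since neither is shown in this diff:

    import asyncio

    from sibi_dst.df_helper import DfHelper  # assumed public import path

    async def fetch_over_http(helper: DfHelper):
        # The synchronous helper.load() path now surfaces a RuntimeError for this backend;
        # the async path normalizes list/dict/pandas results into a Dask DataFrame.
        return await helper.aload(as_pandas=True)

    # df = asyncio.run(fetch_over_http(http_helper))  # http_helper construction omitted
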
@@ -119,6 +132,7 @@ class DfHelper(ManagedResource):
     }

     default_config: Dict[str, Any] = None
+    logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}

     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
@@ -155,24 +169,25 @@
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             active_config.close()

     async def _acleanup(self):
         self.logger.warning(
-            "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+            "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
+            extra=self.logger_extra,
         )
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "aclose"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             await active_config.aclose()

     # ---------- config helpers ----------
@@ -183,55 +198,49 @@

     # ---------- load/aload ----------
     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-        #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
-        df = df.persist() if persist else df
-        return df.compute() if as_pandas else df
+        df = _maybe_persist(df, persist)
+        return _maybe_compute(df, as_pandas)
+
+    async def aload(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        timeout: Optional[float] = None,
+        **options
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        # 1) Async load if available, else run sync load in a thread.
+        if hasattr(self.backend_strategy, "aload"):
+            load_awaitable = self.backend_strategy.aload(**options)
+        else:
+            # Run ONLY the backend load step in a thread to avoid event-loop blocking.
+            load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)

-    async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.total_records, df = await self.backend_strategy.aload(**options)
-        df = self._process_loaded_data(df)
-        df = self._post_process_df(df)
-        df = df.persist() if persist else df
-        return df.compute() if as_pandas else df
-
-    async def load_async(
-        self,
-        *,
-        persist: bool = False,
-        as_pandas: bool = False,
-        prefer_native: bool = False,
-        **options,
-    ):
-        """
-        Async load that prefers native async backends when available,
-        otherwise runs the sync `load()` in a worker thread via asyncio.to_thread.
-
-        Args:
-            persist: same as `load`
-            as_pandas: same as `load`
-            prefer_native: if True and the backend overrides `aload`, use it.
-                otherwise force thread offload of `load()`.
-            **options: forwarded to `load` / `aload`
-        """
-        # If the backend provided an override for `aload`, use it
-        if prefer_native and type(self.backend_strategy).aload is not BaseBackend.aload:
-            return await self.aload(persist=persist, as_pandas=as_pandas, **options)
-
-        # Fall back to offloading the sync path to a thread
-        return await asyncio.to_thread(
-            self.load,
-            persist=persist,
-            as_pandas=as_pandas,
-            **options,
-        )
+        total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
+        self.total_records = total
+
+        # 2) Post-processing steps are sync; offload to threads.
+        df = await asyncio.to_thread(self._process_loaded_data, df)
+        df = await asyncio.to_thread(self._post_process_df, df)
+
+        # 3) Persist and compute can block; offload when needed.
+        if persist and _is_dask_df(df):
+            df = await asyncio.to_thread(df.persist)
+
+        if as_pandas and _is_dask_df(df):
+            # Allow separate timeout for compute if desired; reuse same timeout here.
+            compute_awaitable = asyncio.to_thread(df.compute)
+            return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
+
+        return df

     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-        self.logger.debug("Post-processing DataFrame.")
+        self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
         df_params = self._backend_params.df_params
         if not df_params:
             return df
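
A hedged sketch of driving the consolidated async entry point; the helper construction is omitted because it is not part of this diff:

    import asyncio

    async def load_all(helper):
        # timeout (in seconds) bounds the backend load and, when as_pandas=True, the final compute
        return await helper.aload(persist=True, as_pandas=True, timeout=120)

    # df = asyncio.run(load_all(my_helper))
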
@@ -242,7 +251,7 @@
         if fieldnames:
             valid = [f for f in fieldnames if f in df.columns]
             if len(valid) < len(fieldnames):
-                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
             df = df[valid]
         if column_names:
             if len(df.columns) != len(column_names):
@@ -255,7 +264,7 @@
                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
             df = df.set_index(index_col)

-        self.logger.debug("Post-processing complete.")
+        self.logger.debug("Post-processing complete.", extra=self.logger_extra)
         return df

     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
@@ -264,7 +273,7 @@
             return df
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             return df
-        self.logger.debug("Applying rename mapping if necessary.")
+        self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
         if rename_map:
             df = df.rename(columns=rename_map)
@@ -274,15 +283,15 @@
     def save_to_parquet(self, df: dd.DataFrame, **kwargs):
         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
         path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
-        parquet_filename = kwargs.pop("parquet_filename" or self._backend_params.parquet_filename if self.backend_parquet else None)
+        parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
         if not parquet_filename:
             raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-        if len(df.head(1)) == 0:
-            self.logger.warning("Skipping save: The provided DataFrame is empty.")
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
             return

         with ParquetSaver(
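
With the corrected `parquet_filename` default above, an explicit save might look like the following sketch; the path, filename, and filesystem are illustrative assumptions:

    import fsspec

    fs = fsspec.filesystem("file")  # any fsspec-compatible filesystem
    helper.save_to_parquet(
        df,
        parquet_filename="snapshot.parquet",
        parquet_storage_path="/tmp/sibi_dst_artifacts",
        fs=fs,
    )
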
@@ -296,15 +305,15 @@
         ) as saver:
             saver.save_to_parquet(parquet_filename)

-        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)

     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-        if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
-            self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
             return
         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
-        self.logger.debug("Save to ClickHouse completed.")
+        self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)

     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
@@ -322,13 +331,20 @@
         field_map = self._backend_params.field_map or {}
         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
         if len(reverse_map) != len(field_map):
-            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
         mapped_field = reverse_map.get(dt_field, dt_field)
         if start_date == end_date:
             kwargs[f"{mapped_field}__date"] = start_date
         else:
             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-        self.logger.debug(f"Period load generated filters: {kwargs}")
+        self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
         return kwargs

+    @staticmethod
+    def _has_any_rows(ddf: dd.DataFrame) -> bool:
+        try:
+            return bool(ddf.head(1, npartitions=-1).shape[0])
+        except Exception:
+            return False
+
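
A hedged sketch of calling the period loader whose filter generation is shown above; the field name and dates are illustrative, and how the generated filters are consumed afterwards is not part of this diff:

    # A one-day window collapses to a `created_at__date` filter,
    # a longer window to `created_at__date__range`, per the hunk above.
    same_day = helper.load_period(dt_field="created_at", start="2025-08-01", end="2025-08-01")
    week = helper.load_period(dt_field="created_at", start="2025-08-01", end="2025-08-07")
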
 

{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_artifact.py
@@ -23,6 +23,7 @@ class ParquetArtifact(ManagedResource):

     _global_lock = threading.RLock()
     _active_runs: set[tuple[str, str]] = set()
+    logger_extra = {"sibi_dst_component": __name__}

     def __init__(self, **kwargs: Any):
         # Merge defaults from ManagedResource and caller kwargs
@@ -49,7 +50,7 @@
     # ---------- lazy members ----------
     @cached_property
     def mmanifest(self) -> MissingManifestManager:
-        self.logger.info("Initializing MissingManifestManager...")
+        self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
         manifest_path = self._build_manifest_path()

         # ensure manifest directory exists
@@ -66,16 +67,16 @@
         )

         if not mgr._safe_exists(mgr.manifest_path):
-            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
             mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)

         return mgr

     @cached_property
     def update_planner(self) -> UpdatePlanner:
-        self.logger.info("Initializing UpdatePlanner...")
+        self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
         skipped_files = self.mmanifest.load_existing() or []

         cfg = {
@@ -91,7 +92,7 @@

     @cached_property
     def data_wrapper(self) -> DataWrapper:
-        self.logger.info("Initializing DataWrapper...")
+        self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)

         # Ensure the planner has a plan
         if getattr(self.update_planner, "plan", None) is None:
@@ -170,7 +171,7 @@
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
-                    f"Run already in progress for {key}; skipping this invocation."
+                    f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
                 )
                 return
             ParquetArtifact._active_runs.add(key)
@@ -182,7 +183,7 @@
             plan = getattr(self.update_planner, "plan", None)
             if plan is None or (hasattr(plan, "empty") and plan.empty):
                 # Planning uses Pandas; this is safe to check.
-                self.logger.info("No updates needed. Skipping Parquet generation.")
+                self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
                 return

             # Print plan once per run
@@ -286,7 +287,7 @@

         final_kwargs.update(period_params)
         self.logger.debug(
-            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
         )

         # Delegate to generator (handles cache invalidation + forwarding knobs)
@@ -297,7 +298,7 @@
         """Ensure the directory exists across fsspec backends."""
         with self._lock:
             if not self.fs.exists(path):
-                self.logger.info(f"Creating directory: {path}")
+                self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
                 try:
                     self.fs.makedirs(path, exist_ok=True)
                 except TypeError:
@@ -317,4 +318,4 @@
             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
                 self.data_wrapper.close()
         except Exception as e:
-            self.logger.warning(f"Error during resource cleanup: {e}")
+            self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)

{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_reader.py
@@ -70,6 +70,10 @@ class ParquetReader(DfHelper):
         self.df = super().load(**kwargs)
         return self.df

+    async def aload(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+        self.df = await super().aload(**kwargs)
+        return self.df
+
     def directory_exists(self):
         try:
             info = self.fs.info(self.parquet_storage_path)
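
The new ParquetReader.aload simply awaits DfHelper.aload and caches the result on self.df; a minimal async sketch, with the reader's construction omitted because it is unchanged by this diff:

    import asyncio

    async def read_parquet(reader):
        # as_pandas=True materializes the Dask result; omit it to keep a Dask DataFrame
        return await reader.aload(as_pandas=True)

    # df = asyncio.run(read_parquet(my_reader))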