sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,544 @@
+from __future__ import annotations
+
+import datetime
+import datetime as dt
+import re
+from concurrent.futures import ThreadPoolExecutor, wait
+from typing import (
+    List,
+    Optional,
+    Dict,
+    Tuple,
+    Set,
+    Iterator,
+    ClassVar,
+    Any,
+    Callable,
+)
+
+import pandas as pd
+
+from sibi_flux.core import ManagedResource
+from sibi_flux.utils.date_utils._file_age_checker import FileAgeChecker
+
+
+class UpdatePlanner(ManagedResource):
+    """
+    Represents an update planner for maintaining and managing updates to data stored in a
+    specific parquet storage path. The planner organizes data updates based on configured
+    heuristics, date ranges, and user-defined settings for prioritization and execution.
+
+    The class handles various configurations such as partitioning, update thresholds, and
+    progress visualization. It supports hive-style partitioning and provides mechanisms to
+    generate, review, and execute update plans.
+
+    :ivar DEFAULT_PRIORITY_MAP: Default priority levels assigned to update scenarios. Each key
+        corresponds to an update condition, and integer values represent priority levels.
+    :type DEFAULT_PRIORITY_MAP: Dict[str, int]
+    :ivar DEFAULT_MAX_AGE_MINUTES: Default maximum age (in minutes) for outdated files before
+        requiring update.
+    :type DEFAULT_MAX_AGE_MINUTES: int
+    :ivar DEFAULT_HISTORY_DAYS_THRESHOLD: Default period (in days) used as a history
+        threshold for updates.
+    :type DEFAULT_HISTORY_DAYS_THRESHOLD: int
+    :ivar DATA_FILE_PATTERNS: Supported file patterns used to identify data files in
+        the storage path.
+    :type DATA_FILE_PATTERNS: Tuple[str, ...]
+    :ivar CONTROL_BASENAMES: Set of control filenames typically used to manage data
+        updates, such as success markers or metadata files.
+    :type CONTROL_BASENAMES: Set[str]
+    :ivar HIVE_PARTITION_RE: Regular expression pattern to detect hive-style partitioning
+        patterns within file paths.
+    :type HIVE_PARTITION_RE: re.Pattern
+    :ivar data_path: Path to the parquet storage, ensuring any updates are scoped within
+        this directory.
+    :type data_path: str
+    :ivar description: Brief description of the planner or its purpose.
+    :type description: str
+    :ivar reverse_order: Whether to reverse the order of processing update tasks.
+    :type reverse_order: bool
+    :ivar show_progress: Flag to enable or disable progress reporting during updates.
+    :type show_progress: bool
+    :ivar overwrite: Indicates whether existing data should be forcibly overwritten
+        during updates.
+    :type overwrite: bool
+    :ivar ignore_missing: Flag to ignore missing data instead of reporting it as an error.
+    :type ignore_missing: bool
+    :ivar history_days_threshold: Custom threshold for history-based update prioritization.
+    :type history_days_threshold: int
+    :ivar max_age_minutes: Custom maximum allowable age for files in minutes before
+        an update is required.
+    :type max_age_minutes: int
+    :ivar priority_map: Map of custom priority levels for various update scenarios. Modifiable
+        by the user to override default priorities.
+    :type priority_map: Dict[str, int]
+    :ivar hive_style: Indicates if hive-style partitioning should be enabled.
+    :type hive_style: bool
+    :ivar partition_on: List of fields used for partitioning the data.
+    :type partition_on: List[str]
+    :ivar max_threads: Maximum number of threads to use for concurrent operations.
+    :type max_threads: int
+    :ivar timeout: Timeout (in seconds) applied for typical operations.
+    :type timeout: float
+    :ivar list_timeout: Timeout (in seconds) for listing files in storage.
+    :type list_timeout: float
+    :ivar total_timeout: Maximum duration allowed for operations to complete.
+    :type total_timeout: float
+    :ivar reference_date: The reference date used to anchor historical updates or
+        determine thresholds.
+    :type reference_date: dt.date
+    :ivar check_completeness: Configuration for enforcing data completeness checks.
+    :type check_completeness: bool
+    :ivar require_success_marker: Whether updates require the presence of a success marker
+        file for validation.
+    :type require_success_marker: bool
+    :ivar list_granularity: Granularity level used for organizing file listings (e.g., daily
+        or monthly).
+    :type list_granularity: str
+    :ivar data_file_suffixes: Supported file suffixes for identifying valid input files during
+        update planning.
+    :type data_file_suffixes: Tuple[str, ...]
+    """
+
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
+        "file_is_recent": 0,
+        "missing_ignored": 0,
+        "overwrite_forced": 1,
+        "incomplete": 1,
+        "create_missing": 2,
+        "missing_in_history": 3,
+        "stale_in_history": 4,
+        "future": 99,
+    }
+
+    DEFAULT_MAX_AGE_MINUTES: int = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+
+    DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (
+        ".parquet",
+        ".orc",
+        ".csv",
+        ".json",
+    )
+    CONTROL_BASENAMES: ClassVar[Set[str]] = {
+        "_SUCCESS",
+        "_metadata",
+        "_common_metadata",
+    }
+
+    HIVE_PARTITION_RE: ClassVar[re.Pattern] = re.compile(r"([^/=]+)=([^/]+)")
+
+    logger_extra = {"sibi_flux_component": __name__}
+
+    def __init__(
+        self,
+        parquet_storage_path: str,
+        *,
+        partition_on: Optional[List[str]] = None,
+        description: str = "Update Planner",
+        reference_date: str | dt.date | None = None,
+        history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+        max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+        overwrite: bool = False,
+        ignore_missing: bool = False,
+        custom_priority_map: Optional[Dict[str, int]] = None,
+        reverse_order: bool = False,
+        show_progress: bool = False,
+        hive_style: bool = False,
+        skipped: Optional[List[str | dt.date]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # ---- core config ----
+        self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
+        self.description: str = description
+        self.reverse_order: bool = reverse_order
+        self.show_progress: bool = show_progress
+        self.overwrite: bool = overwrite
+        self.ignore_missing: bool = ignore_missing
+        self.history_days_threshold: int = history_days_threshold
+        self.max_age_minutes: int = max_age_minutes
+        self.priority_map: Dict[str, int] = (
+            dict(custom_priority_map)
+            if custom_priority_map
+            else dict(self.DEFAULT_PRIORITY_MAP)
+        )
+
+        self.hive_style: bool = hive_style
+        self.partition_on: List[str] = list(
+            (partition_on or ["partition_date"])
+            if self.hive_style
+            else ["year", "month", "day"]
+        )
+
+        self.max_threads: int = int(kwargs.get("max_threads", 3))
+        self.timeout: float = float(kwargs.get("timeout", 30.0))
+        self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
+        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
+
+        # ---- date window ----
+        self.start_date = kwargs.get("parquet_start_date")
+        self.end_date = kwargs.get("parquet_end_date")
+
+        # ---- reference date ----
+        if reference_date is not None:
+            self.reference_date: dt.date = pd.to_datetime(reference_date).date()
+        else:
+            self.reference_date = dt.date.today()
+
+        # ---- completeness/heuristics ----
+        self.check_completeness: bool = bool(kwargs.get("check_completeness", False))
+        self.require_success_marker: bool = bool(
+            kwargs.get("require_success_marker", False)
+        )
+        self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
+        self.data_file_suffixes: Tuple[str, ...] = tuple(
+            kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS)
+        )
+
+        # ---- clock for tests ----
+        self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (
+            lambda: dt.datetime.now(datetime.UTC)
+        )
+
+        # ---- skipped (back-compat) ----
+        self.skipped = list(skipped or kwargs.get("skipped", []) or [])
+        self.skipped_paths: Set[str] = {
+            p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)
+        }
+        self.skipped_dates: Set[dt.date] = {
+            p for p in self.skipped if isinstance(p, dt.date)
+        }
+
+        if not getattr(self, "fs", None):
+            raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+
+        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.plan: pd.DataFrame = pd.DataFrame()
+        self.df_req: pd.DataFrame = pd.DataFrame()
+        self._printed_this_run: bool = False
+
+    @property
+    def skipped(self) -> List[str | dt.date]:
+        return [*sorted(self.skipped_paths), *sorted(self.skipped_dates)]
+
+    @skipped.setter
+    def skipped(self, value: List[str | dt.date]) -> None:
+        self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
+        self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
+
+    # --------------------- Public API ---------------------
+    def generate_plan(
+        self,
+        start: str | dt.date | None = None,
+        end: str | dt.date | None = None,
+        freq: str = "D",
+    ) -> pd.DataFrame:
+        start = start or self.start_date
+        end = end or self.end_date
+        if start is None or end is None:
+            raise ValueError(
+                "start and end must be provided (or set via parquet_* kwargs)."
+            )
+
+        sd = pd.to_datetime(start).date()
+        ed = pd.to_datetime(end).date()
+        if sd > ed:
+            raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
+
+        self.logger.info(
+            f"Generating update plan for {self.description} from {sd} to {ed}",
+            extra=self._log_extra(),
+        )
+        self._generate_plan(sd, ed, freq=freq)
+        return self.df_req
+
+    def show_update_plan(self) -> None:
+        if not self.has_plan() or self._printed_this_run:
+            return
+        try:
+            from rich.console import Console
+            from rich.table import Table
+
+            console = Console()
+            table = Table(
+                title=f"Update Plan for {self.data_path} [{'Hive' if 'partition_date' in self.partition_on else 'Legacy'}]",
+                show_header=True,
+                header_style="bold magenta",
+                expand=True,
+                pad_edge=False,
+            )
+            for col in self.plan.columns:
+                table.add_column(col, justify="left", overflow="fold")
+            # Use itertuples for better performance and type safety
+            for row in self.plan.itertuples(index=False):
+                # Each row is a namedtuple whose field names match the column names;
+                # iterate over the columns so the cell order matches the table header.
+                table.add_row(*(str(getattr(row, c)) for c in self.plan.columns))
+            console.print(table)
+        except Exception:
+            self.logger.debug(
+                f"Update Plan:\n{self.plan.head(50)}", extra=self._log_extra()
+            )
+        self._printed_this_run = True
+
+    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
+        if not self.has_plan():
+            return
+        req = self.plan[self.plan["update_required"]]
+        for priority in sorted(req["update_priority"].unique()):
+            dates = (
+                req[req["update_priority"] == priority]
+                .sort_values(by="date", ascending=not self.reverse_order)["date"]
+                .tolist()
+            )
+            if dates:
+                yield int(priority), dates
+
+    def has_plan(self) -> bool:
+        return not self.plan.empty
+
+    def required_count(self) -> int:
+        return len(self.df_req)
+
+    def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
+        dates: List[dt.date] = pd.date_range(
+            start=start, end=end, freq=freq
+        ).date.tolist()
+        history_start = self.reference_date - dt.timedelta(
+            days=self.history_days_threshold
+        )
+        rows: List[Dict[str, Any]] = []
+
+        if "partition_date" in self.partition_on:
+            caches: Dict[dt.date, Dict[str, Any]] = self._list_prefix(self.data_path)
+        else:
+            caches = {}
+            months = list(
+                self._iter_month_starts(
+                    self._month_floor(start), self._month_floor(end)
+                )
+            )
+            with ThreadPoolExecutor(max_workers=max(1, self.max_threads)) as ex:
+                future_to_unit = {
+                    ex.submit(self._list_prefix, self._month_prefix(m)): m
+                    for m in months
+                }
+                done, _ = wait(
+                    future_to_unit.keys(), timeout=self.total_timeout or None
+                )
+                for fut in done:
+                    m = future_to_unit[fut]
+                    try:
+                        caches[m] = fut.result(timeout=self.list_timeout or None)
+                    except Exception:
+                        caches[m] = {}
+
+        for d in dates:
+            if d > self.reference_date:
+                rows.append(self._row_future(d))
+                continue
+            if self._is_skipped(d):
+                rows.append(self._make_row(d, history_start, False, None))
+                continue
+
+            cache = (
+                caches
+                if "partition_date" in self.partition_on
+                else caches.get(d.replace(day=1), {})
+            )
+            exists, age_min, incomplete = self._summarize_partition(d, cache)
+            if incomplete and not self.overwrite:
+                rows.append(self._row_incomplete(d, age_min))
+            else:
+                rows.append(self._make_row(d, history_start, exists, age_min))
+
+        df = pd.DataFrame.from_records(rows)
+        if not df.empty:
+            df["date"] = pd.to_datetime(df["date"]).dt.date
+            df["update_priority"] = df["update_priority"].astype(int)
+            self.plan = df.sort_values(
+                by=["update_priority", "date"],
+                ascending=[True, not self.reverse_order],
+                kind="mergesort",
+            ).reset_index(drop=True)
+            self.df_req = self.plan[self.plan["update_required"]].copy()
+
+    def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
+        try:
+            items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
+        except Exception:
+            return {}
+
+        out: Dict[dt.date, Dict[str, Any]] = {}
+        for path, info in items.items():
+            d: Optional[dt.date] = None
+            if "partition_date" in self.partition_on:
+                parts = self._extract_partitions(path)
+                if "partition_date" in parts:
+                    try:
+                        d = dt.date.fromisoformat(parts["partition_date"])
+                    except Exception:
+                        continue
+            else:
+                segs = path.strip("/").split("/")
+                if len(segs) >= 3:
+                    try:
+                        y, m, dd = int(segs[-3]), int(segs[-2]), int(segs[-1])
+                        d = dt.date(y, m, dd)
+                    except Exception:
+                        continue
+            if d is None:
+                continue
+
+            rec = out.setdefault(
+                d, {"files": [], "has_success": False, "newest_ts": None}
+            )
+            base = path.rsplit("/", 1)[-1]
+            if base == "_SUCCESS":
+                rec["has_success"] = True
+            if self._is_data_file(path):
+                rec["files"].append(path)
+                ts = self._extract_mtime(info)
+                if ts and (rec["newest_ts"] is None or ts > rec["newest_ts"]):
+                    rec["newest_ts"] = ts
+        return out
+
+    def _extract_partitions(self, path: str) -> Dict[str, str]:
+        out: Dict[str, str] = {}
+        for seg in path.strip("/").split("/"):
+            m = self.HIVE_PARTITION_RE.match(seg)
+            if m:
+                out[m.group(1)] = m.group(2)
+        return out
+
+    def _summarize_partition(
+        self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]
+    ) -> Tuple[bool, Optional[float], bool]:
+        rec = cache.get(d, {})
+        files = rec.get("files", [])
+        exists = bool(files)
+        if not exists:
+            return False, None, False
+        has_success = rec.get("has_success", False)
+        newest_ts = rec.get("newest_ts")
+        age_min = None
+        if newest_ts:
+            now = self._utcnow().replace(tzinfo=None)
+            ts = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+            age_min = max(0.0, (now - ts).total_seconds() / 60.0)
+        incomplete = (
+            self.check_completeness and self.require_success_marker and not has_success
+        )
+        return exists, age_min, incomplete
+
+    def _make_row(
+        self, d: dt.date, history_start: dt.date, exists: bool, age_min: Optional[float]
+    ) -> Dict[str, Any]:
+        within_history = history_start <= d <= self.reference_date
+        category, update_required = "unknown", False
+        if self.overwrite:
+            category, update_required = "overwrite_forced", True
+        elif within_history:
+            if not exists:
+                category, update_required = "missing_in_history", True
+            elif age_min is not None and age_min > self.max_age_minutes:
+                category, update_required = "stale_in_history", True
+            else:
+                category = "file_is_recent"
+        elif not exists and not self.ignore_missing:
+            category, update_required = "create_missing", True
+        else:
+            category = "missing_ignored" if not exists else "file_is_recent"
+        return {
+            "date": d,
+            "file_exists": exists,
+            "file_age_minutes": age_min,
+            "update_category": category,
+            "update_priority": self.priority_map.get(category, 99),
+            "update_required": update_required,
+            "description": self.description,
+        }
+
+    def _row_future(self, d: dt.date) -> Dict[str, Any]:
+        return {
+            "date": d,
+            "file_exists": False,
+            "file_age_minutes": None,
+            "update_category": "future",
+            "update_priority": self.priority_map.get("future", 99),
+            "update_required": False,
+            "description": self.description,
+        }
+
+    def _row_incomplete(self, d: dt.date, age_min: Optional[float]) -> Dict[str, Any]:
+        return {
+            "date": d,
+            "file_exists": True,
+            "file_age_minutes": age_min,
+            "update_category": "incomplete",
+            "update_priority": self.priority_map.get("incomplete", 1),
+            "update_required": True,
+            "description": self.description,
+        }
+
+    @staticmethod
+    def _ensure_trailing_slash(path: str) -> str:
+        return path.rstrip("/") + "/"
+
+    @staticmethod
+    def _month_floor(d: dt.date) -> dt.date:
+        return d.replace(day=1)
+
+    @staticmethod
+    def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+        cur = start.replace(day=1)
+        while cur <= end:
+            yield cur
+            y, m = cur.year, cur.month
+            cur = dt.date(y + 1, 1, 1) if m == 12 else dt.date(y, m + 1, 1)
+
+    def _month_prefix(self, month_start: dt.date) -> str:
+        return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+
+    def _is_data_file(self, path: str) -> bool:
+        base = path.rsplit("/", 1)[-1]
+        if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+            return False
+        return any(base.lower().endswith(suf) for suf in self.data_file_suffixes)
+
+    @staticmethod
+    def _extract_mtime(info: Dict[str, Any]) -> Optional[dt.datetime]:
+        mtime = (
+            info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+        )
+        if isinstance(mtime, (int, float)):
+            return dt.datetime.fromtimestamp(mtime, datetime.UTC)
+        if isinstance(mtime, str):
+            try:
+                return pd.to_datetime(mtime, utc=True).to_pydatetime()
+            except Exception:
+                return None
+        if isinstance(mtime, dt.datetime):
+            return mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+        return None
+
+    def _is_skipped(self, d: dt.date) -> bool:
+        if "partition_date" in self.partition_on:
+            canonical_path = f"{self.data_path}partition_date={d.isoformat()}/"
+        else:
+            canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+        return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)
+
+    def _log_extra(self, **overrides) -> Dict[str, Any]:
+        base = {
+            "sibi_flux_component": self.logger_extra.get(
+                "sibi_flux_component", "warehouse.update_planner"
+            ),
+            "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+            "dataclass": self.description,
+            "action_module_name": "update_plan",
+        }
+        base.update(overrides)
+        return base
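
The planner above is driven by a small public surface: construct it against an fsspec filesystem, call generate_plan() for a date window, then drain get_tasks_by_priority() in ascending priority order. The sketch below is a minimal, hypothetical usage example, not part of the package: it assumes the fs (and debug/logger) keyword arguments are consumed by ManagedResource, which is only implied here by the getattr(self, "fs", None) check and the self.logger/self.debug references in __init__.

# Hedged usage sketch for UpdatePlanner; the fs kwarg forwarded to
# ManagedResource via **kwargs is an assumption, not shown in this diff.
import fsspec

from sibi_flux.artifacts.parquet_engine.planner import UpdatePlanner

fs = fsspec.filesystem("file")  # any fsspec filesystem; S3/MinIO in production

planner = UpdatePlanner(
    "/data/warehouse/my_dataset/",  # illustrative parquet_storage_path
    hive_style=True,                # expect partition_date=YYYY-MM-DD/ layout
    history_days_threshold=30,      # dates inside this window may be refreshed
    max_age_minutes=1440,           # files older than one day count as stale
    fs=fs,                          # assumed to be picked up by ManagedResource
)

# generate_plan() stores the full plan and returns only rows with update_required=True.
required = planner.generate_plan(start="2025-11-01", end="2025-11-30")
print(f"{planner.required_count()} partitions need updating")
planner.show_update_plan()  # rich table if available, debug log otherwise

# Consume work in priority order (lower number = more urgent).
for priority, dates in planner.get_tasks_by_priority():
    print(priority, dates)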
@@ -0,0 +1,131 @@
+from typing import Optional, Any
+from pydantic import SecretStr
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class SibiBaseSettings(BaseSettings):
+    """Base settings class with common configuration."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env", env_file_encoding="utf-8", extra="ignore"
+    )
+
+
+class FsSettings(SibiBaseSettings):
+    """Common filesystem settings."""
+
+    fs_type: str = "s3"
+    fs_path: str = "s3://dev-bucket/warehouse"
+    fs_key: str = "minio"
+    fs_secret: SecretStr = SecretStr("minio123")
+    fs_endpoint: str = "http://localhost:9000"
+    fs_token: Optional[SecretStr] = None
+    fs_region: str = "us-east-1"
+
+    def to_fsspec_options(self) -> dict[str, Any]:
+        """Convert settings to an fsspec-compatible options dict."""
+        if self.fs_type == "s3":
+            opts = {
+                "key": self.fs_key,
+                "secret": self.fs_secret.get_secret_value() if self.fs_secret else None,
+                "skip_instance_cache": True,
+                "use_listings_cache": False,
+                "client_kwargs": {
+                    "endpoint_url": self.fs_endpoint,
+                    "region_name": self.fs_region,
+                },
+                "config_kwargs": {
+                    "signature_version": "s3v4",
+                    "s3": {"addressing_style": "path"},
+                },
+            }
+            if self.fs_token:
+                opts["token"] = self.fs_token.get_secret_value()
+            return opts
+        return {}
+
+
+class WebDavSettings(SibiBaseSettings):
+    """Common WebDAV settings."""
+
+    fs_type: str = "webdav"
+    fs_verify_ssl: bool = False
+    fs_endpoint: str = "http://localhost:8080"
+    fs_key: str = "user"
+    fs_secret: SecretStr = SecretStr("pass")
+    fs_token: Optional[SecretStr] = None
+
+    # AWS-specific config, often mixed into the WebDAV context in legacy setups
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+    region_name: Optional[str] = None
+    session_token: Optional[str] = None
+    endpoint_url: Optional[str] = None
+
+    model_config = SettingsConfigDict(env_prefix="WEBDAV_")
+
+    def to_fsspec_options(self) -> dict[str, Any]:
+        verify = self.fs_verify_ssl
+        opts = {
+            "base_url": self.fs_endpoint,
+            "username": self.fs_key,
+            "password": self.fs_secret.get_secret_value() if self.fs_secret else "",
+            "verify": verify,
+        }
+        if self.fs_token:
+            opts["token"] = self.fs_token.get_secret_value()
+        return opts
+
+
+# --- Database Base Settings ---
+
+
+class DatabaseSettings(SibiBaseSettings):
+    """Generic SQL Database settings."""
+
+    db_url: str = "sqlite:///:memory:"
+
+
+class ClickhouseBaseSettings(SibiBaseSettings):
+    """Base settings for ClickHouse connection."""
+
+    host: str = "localhost"
+    port: int = 8123
+    database: str = "default"
+    user: str = "default"
+    password: SecretStr = SecretStr("secret")
+
+    def to_legacy_dict(self) -> dict[str, Any]:
+        return {
+            "host": self.host,
+            "port": self.port,
+            "dbname": self.database,
+            "user": self.user,
+            "password": self.password.get_secret_value() if self.password else None,
+        }
+
+
+class RedisBaseSettings(SibiBaseSettings):
+    """Base settings for Redis connection."""
+
+    host: str = "localhost"
+    port: int = 6379
+    db: int = 0
+    password: Optional[SecretStr] = None
+
+    def to_legacy_dict(self) -> dict[str, Any]:
+        return {
+            "host": self.host,
+            "port": self.port,
+            "db": self.db,
+            "password": self.password.get_secret_value() if self.password else None,
+        }
+
+
+class DbPoolSettings(SibiBaseSettings):
+    """Base settings for SQLAlchemy connection pooling."""
+
+    db_pool_size: int = 5
+    db_max_overflow: int = 10
+    db_pool_timeout: int = 30
+    db_pool_recycle: int = 1800
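
The to_fsspec_options() helpers return keyword arguments shaped for fsspec's S3 and WebDAV backends. A brief sketch of how FsSettings might be wired into s3fs through fsspec follows; it is illustrative only and assumes s3fs is installed, that an S3-compatible endpoint (such as the MinIO defaults above) is reachable, and that environment variables follow pydantic-settings' default naming (fs_endpoint reads FS_ENDPOINT, while WebDavSettings applies the WEBDAV_ prefix).

# Hedged sketch: handing FsSettings options to an fsspec S3 filesystem.
import fsspec

from sibi_flux.conf.settings import FsSettings

settings = FsSettings()  # values come from the environment / .env file

if settings.fs_type == "s3":
    # to_fsspec_options() yields key/secret plus client_kwargs and config_kwargs
    # that s3fs accepts directly.
    fs = fsspec.filesystem("s3", **settings.to_fsspec_options())
    print(fs.ls(settings.fs_path))  # e.g. list the warehouse prefix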
@@ -0,0 +1,5 @@
+from .managed_resource import ManagedResource
+
+__all__ = [
+    "ManagedResource",
+]
@@ -0,0 +1,3 @@
+from ._managed_resource import ManagedResource
+
+__all__ = ["ManagedResource"]