sibi_flux-2025.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/artifacts/parquet_engine/planner.py
@@ -0,0 +1,544 @@
from __future__ import annotations

import datetime
import datetime as dt
import re
from concurrent.futures import ThreadPoolExecutor, wait
from typing import (
    List,
    Optional,
    Dict,
    Tuple,
    Set,
    Iterator,
    ClassVar,
    Any,
    Callable,
)

import pandas as pd

from sibi_flux.core import ManagedResource
from sibi_flux.utils.date_utils._file_age_checker import FileAgeChecker


class UpdatePlanner(ManagedResource):
    """
    Represents an update planner for maintaining and managing updates to data stored in a
    specific parquet storage path. The planner organizes data updates based on configured
    heuristics, date ranges, and user-defined settings for prioritization and execution.

    The class handles various configurations such as partitioning, update thresholds, and
    progress visualization. It supports hive-style partitioning and provides mechanisms to
    generate, review, and execute update plans.

    :ivar DEFAULT_PRIORITY_MAP: Default priority levels assigned to update scenarios. Each key
        corresponds to an update condition, and integer values represent priority levels.
    :type DEFAULT_PRIORITY_MAP: Dict[str, int]
    :ivar DEFAULT_MAX_AGE_MINUTES: Default maximum age (in minutes) for outdated files before
        requiring update.
    :type DEFAULT_MAX_AGE_MINUTES: int
    :ivar DEFAULT_HISTORY_DAYS_THRESHOLD: Default period (in days) used as a history
        threshold for updates.
    :type DEFAULT_HISTORY_DAYS_THRESHOLD: int
    :ivar DATA_FILE_PATTERNS: Supported file patterns used to identify data files in
        the storage path.
    :type DATA_FILE_PATTERNS: Tuple[str, ...]
    :ivar CONTROL_BASENAMES: Set of control filenames typically used to manage data
        updates, such as success markers or metadata files.
    :type CONTROL_BASENAMES: Set[str]
    :ivar HIVE_PARTITION_RE: Regular expression pattern to detect hive-style partitioning
        patterns within file paths.
    :type HIVE_PARTITION_RE: re.Pattern
    :ivar data_path: Path to the parquet storage, ensuring any updates are scoped within
        this directory.
    :type data_path: str
    :ivar description: Brief description of the planner or its purpose.
    :type description: str
    :ivar reverse_order: Whether to reverse the order of processing update tasks.
    :type reverse_order: bool
    :ivar show_progress: Flag to enable or disable progress reporting during updates.
    :type show_progress: bool
    :ivar overwrite: Indicates whether existing data should be forcibly overwritten
        during updates.
    :type overwrite: bool
    :ivar ignore_missing: Flag to ignore missing data instead of reporting it as an error.
    :type ignore_missing: bool
    :ivar history_days_threshold: Custom threshold for history-based update prioritization.
    :type history_days_threshold: int
    :ivar max_age_minutes: Custom maximum allowable age for files in minutes before
        an update is required.
    :type max_age_minutes: int
    :ivar priority_map: Map of custom priority levels for various update scenarios. Modifiable
        by the user to override default priorities.
    :type priority_map: Dict[str, int]
    :ivar hive_style: Indicates if hive-style partitioning should be enabled.
    :type hive_style: bool
    :ivar partition_on: List of fields used for partitioning the data.
    :type partition_on: List[str]
    :ivar max_threads: Maximum number of threads to use for concurrent operations.
    :type max_threads: int
    :ivar timeout: Timeout (in seconds) applied for typical operations.
    :type timeout: float
    :ivar list_timeout: Timeout (in seconds) for listing files in storage.
    :type list_timeout: float
    :ivar total_timeout: Maximum duration allowed for operations to complete.
    :type total_timeout: float
    :ivar reference_date: The reference date used to anchor historical updates or
        determine thresholds.
    :type reference_date: dt.date
    :ivar check_completeness: Configuration for enforcing data completeness checks.
    :type check_completeness: bool
    :ivar require_success_marker: Whether updates require the presence of a success marker
        file for validation.
    :type require_success_marker: bool
    :ivar list_granularity: Granularity level used for organizing file listings (e.g., daily
        or monthly).
    :type list_granularity: str
    :ivar data_file_suffixes: Supported file suffixes for identifying valid input files during
        update planning.
    :type data_file_suffixes: Tuple[str, ...]
    """

    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
        "file_is_recent": 0,
        "missing_ignored": 0,
        "overwrite_forced": 1,
        "incomplete": 1,
        "create_missing": 2,
        "missing_in_history": 3,
        "stale_in_history": 4,
        "future": 99,
    }

    DEFAULT_MAX_AGE_MINUTES: int = 1440
    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

    DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (
        ".parquet",
        ".orc",
        ".csv",
        ".json",
    )
    CONTROL_BASENAMES: ClassVar[Set[str]] = {
        "_SUCCESS",
        "_metadata",
        "_common_metadata",
    }

    HIVE_PARTITION_RE: ClassVar[re.Pattern] = re.compile(r"([^/=]+)=([^/]+)")

    logger_extra = {"sibi_flux_component": __name__}

    def __init__(
        self,
        parquet_storage_path: str,
        *,
        partition_on: Optional[List[str]] = None,
        description: str = "Update Planner",
        reference_date: str | dt.date | None = None,
        history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
        max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
        overwrite: bool = False,
        ignore_missing: bool = False,
        custom_priority_map: Optional[Dict[str, int]] = None,
        reverse_order: bool = False,
        show_progress: bool = False,
        hive_style: bool = False,
        skipped: Optional[List[str | dt.date]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # ---- core config ----
        self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
        self.description: str = description
        self.reverse_order: bool = reverse_order
        self.show_progress: bool = show_progress
        self.overwrite: bool = overwrite
        self.ignore_missing: bool = ignore_missing
        self.history_days_threshold: int = history_days_threshold
        self.max_age_minutes: int = max_age_minutes
        self.priority_map: Dict[str, int] = (
            dict(custom_priority_map)
            if custom_priority_map
            else dict(self.DEFAULT_PRIORITY_MAP)
        )

        self.hive_style: bool = hive_style
        self.partition_on: List[str] = list(
            partition_on or ["partition_date"]
            if self.hive_style
            else ["year", "month", "day"]
        )

        self.max_threads: int = int(kwargs.get("max_threads", 3))
        self.timeout: float = float(kwargs.get("timeout", 30.0))
        self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))

        # ---- date window ----
        self.start_date = kwargs.get("parquet_start_date")
        self.end_date = kwargs.get("parquet_end_date")

        # ---- reference date ----
        if reference_date is not None:
            self.reference_date: dt.date = pd.to_datetime(reference_date).date()
        else:
            self.reference_date = dt.date.today()

        # ---- completeness/heuristics ----
        self.check_completeness: bool = bool(kwargs.get("check_completeness", False))
        self.require_success_marker: bool = bool(
            kwargs.get("require_success_marker", False)
        )
        self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
        self.data_file_suffixes: Tuple[str, ...] = tuple(
            kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS)
        )

        # ---- clock for tests ----
        self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (
            lambda: dt.datetime.now(datetime.UTC)
        )

        # ---- skipped (back-compat) ----
        self.skipped = list(skipped or kwargs.get("skipped", []) or [])
        self.skipped_paths: Set[str] = {
            p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)
        }
        self.skipped_dates: Set[dt.date] = {
            p for p in self.skipped if isinstance(p, dt.date)
        }

        if not getattr(self, "fs", None):
            raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")

        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
        self.plan: pd.DataFrame = pd.DataFrame()
        self.df_req: pd.DataFrame = pd.DataFrame()
        self._printed_this_run: bool = False

    @property
    def skipped(self) -> List[str | dt.date]:
        return [*sorted(self.skipped_paths), *sorted(self.skipped_dates)]

    @skipped.setter
    def skipped(self, value: List[str | dt.date]) -> None:
        self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
        self.skipped_dates = {p for p in value if isinstance(p, dt.date)}

    # --------------------- Public API ---------------------
    def generate_plan(
        self,
        start: str | dt.date | None = None,
        end: str | dt.date | None = None,
        freq: str = "D",
    ) -> pd.DataFrame:
        start = start or self.start_date
        end = end or self.end_date
        if start is None or end is None:
            raise ValueError(
                "start and end must be provided (or set via parquet_* kwargs)."
            )

        sd = pd.to_datetime(start).date()
        ed = pd.to_datetime(end).date()
        if sd > ed:
            raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

        self.logger.info(
            f"Generating update plan for {self.description} from {sd} to {ed}",
            extra=self._log_extra(),
        )
        self._generate_plan(sd, ed, freq=freq)
        return self.df_req

    def show_update_plan(self) -> None:
        if not self.has_plan() or self._printed_this_run:
            return
        try:
            from rich.console import Console
            from rich.table import Table

            console = Console()
            table = Table(
                title=f"Update Plan for {self.data_path} [{'Hive' if 'partition_date' in self.partition_on else 'Legacy'}]",
                show_header=True,
                header_style="bold magenta",
                expand=True,
                pad_edge=False,
            )
            for col in self.plan.columns:
                table.add_column(col, justify="left", overflow="fold")
            # Use itertuples for better performance and type safety
            for row in self.plan.itertuples(index=False):
                # row is a namedtuple, access fields by name matches column names
                # logic: we iterate columns to ensure order matches table header
                table.add_row(*(str(getattr(row, c)) for c in self.plan.columns))
            console.print(table)
        except Exception:
            self.logger.debug(
                f"Update Plan:\n{self.plan.head(50)}", extra=self._log_extra()
            )
        self._printed_this_run = True

    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
        if not self.has_plan():
            return
        req = self.plan[self.plan["update_required"]]
        for priority in sorted(req["update_priority"].unique()):
            dates = (
                req[req["update_priority"] == priority]
                .sort_values(by="date", ascending=not self.reverse_order)["date"]
                .tolist()
            )
            if dates:
                yield int(priority), dates

    def has_plan(self) -> bool:
        return not self.plan.empty

    def required_count(self) -> int:
        return len(self.df_req)

    def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
        dates: List[dt.date] = pd.date_range(
            start=start, end=end, freq=freq
        ).date.tolist()
        history_start = self.reference_date - dt.timedelta(
            days=self.history_days_threshold
        )
        rows: List[Dict[str, Any]] = []

        if "partition_date" in self.partition_on:
            caches: Dict[dt.date, Dict[str, Any]] = self._list_prefix(self.data_path)
        else:
            caches = {}
            months = list(
                self._iter_month_starts(
                    self._month_floor(start), self._month_floor(end)
                )
            )
            with ThreadPoolExecutor(max_workers=max(1, self.max_threads)) as ex:
                future_to_unit = {
                    ex.submit(self._list_prefix, self._month_prefix(m)): m
                    for m in months
                }
                done, _ = wait(
                    future_to_unit.keys(), timeout=self.total_timeout or None
                )
                for fut in done:
                    m = future_to_unit[fut]
                    try:
                        caches[m] = fut.result(timeout=self.list_timeout or None)
                    except Exception:
                        caches[m] = {}

        for d in dates:
            if d > self.reference_date:
                rows.append(self._row_future(d))
                continue
            if self._is_skipped(d):
                rows.append(self._make_row(d, history_start, False, None))
                continue

            cache = (
                caches
                if "partition_date" in self.partition_on
                else caches.get(d.replace(day=1), {})
            )
            exists, age_min, incomplete = self._summarize_partition(d, cache)
            if incomplete and not self.overwrite:
                rows.append(self._row_incomplete(d, age_min))
            else:
                rows.append(self._make_row(d, history_start, exists, age_min))

        df = pd.DataFrame.from_records(rows)
        if not df.empty:
            df["date"] = pd.to_datetime(df["date"]).dt.date
            df["update_priority"] = df["update_priority"].astype(int)
            self.plan = df.sort_values(
                by=["update_priority", "date"],
                ascending=[True, not self.reverse_order],
                kind="mergesort",
            ).reset_index(drop=True)
            self.df_req = self.plan[self.plan["update_required"]].copy()

    def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
        try:
            items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
        except Exception:
            return {}

        out: Dict[dt.date, Dict[str, Any]] = {}
        for path, info in items.items():
            d: Optional[dt.date] = None
            if "partition_date" in self.partition_on:
                parts = self._extract_partitions(path)
                if "partition_date" in parts:
                    try:
                        d = dt.date.fromisoformat(parts["partition_date"])
                    except Exception:
                        continue
            else:
                segs = path.strip("/").split("/")
                if len(segs) >= 3:
                    try:
                        y, m, dd = int(segs[-3]), int(segs[-2]), int(segs[-1])
                        d = dt.date(y, m, dd)
                    except Exception:
                        continue
            if d is None:
                continue

            rec = out.setdefault(
                d, {"files": [], "has_success": False, "newest_ts": None}
            )
            base = path.rsplit("/", 1)[-1]
            if base == "_SUCCESS":
                rec["has_success"] = True
            if self._is_data_file(path):
                rec["files"].append(path)
                ts = self._extract_mtime(info)
                if ts and (rec["newest_ts"] is None or ts > rec["newest_ts"]):
                    rec["newest_ts"] = ts
        return out

    def _extract_partitions(self, path: str) -> Dict[str, str]:
        out: Dict[str, str] = {}
        for seg in path.strip("/").split("/"):
            m = self.HIVE_PARTITION_RE.match(seg)
            if m:
                out[m.group(1)] = m.group(2)
        return out

    def _summarize_partition(
        self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]
    ) -> Tuple[bool, Optional[float], bool]:
        rec = cache.get(d, {})
        files = rec.get("files", [])
        exists = bool(files)
        if not exists:
            return False, None, False
        has_success = rec.get("has_success", False)
        newest_ts = rec.get("newest_ts")
        age_min = None
        if newest_ts:
            now = self._utcnow().replace(tzinfo=None)
            ts = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
            age_min = max(0.0, (now - ts).total_seconds() / 60.0)
        incomplete = (
            self.check_completeness and self.require_success_marker and not has_success
        )
        return exists, age_min, incomplete

    def _make_row(
        self, d: dt.date, history_start: dt.date, exists: bool, age_min: Optional[float]
    ) -> Dict[str, Any]:
        within_history = history_start <= d <= self.reference_date
        category, update_required = "unknown", False
        if self.overwrite:
            category, update_required = "overwrite_forced", True
        elif within_history:
            if not exists:
                category, update_required = "missing_in_history", True
            elif age_min is not None and age_min > self.max_age_minutes:
                category, update_required = "stale_in_history", True
            else:
                category = "file_is_recent"
        elif not exists and not self.ignore_missing:
            category, update_required = "create_missing", True
        else:
            category = "missing_ignored" if not exists else "file_is_recent"
        return {
            "date": d,
            "file_exists": exists,
            "file_age_minutes": age_min,
            "update_category": category,
            "update_priority": self.priority_map.get(category, 99),
            "update_required": update_required,
            "description": self.description,
        }

    def _row_future(self, d: dt.date) -> Dict[str, Any]:
        return {
            "date": d,
            "file_exists": False,
            "file_age_minutes": None,
            "update_category": "future",
            "update_priority": self.priority_map.get("future", 99),
            "update_required": False,
            "description": self.description,
        }

    def _row_incomplete(self, d: dt.date, age_min: Optional[float]) -> Dict[str, Any]:
        return {
            "date": d,
            "file_exists": True,
            "file_age_minutes": age_min,
            "update_category": "incomplete",
            "update_priority": self.priority_map.get("incomplete", 1),
            "update_required": True,
            "description": self.description,
        }

    @staticmethod
    def _ensure_trailing_slash(path: str) -> str:
        return path.rstrip("/") + "/"

    @staticmethod
    def _month_floor(d: dt.date) -> dt.date:
        return d.replace(day=1)

    @staticmethod
    def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
        cur = start.replace(day=1)
        while cur <= end:
            yield cur
            y, m = cur.year, cur.month
            cur = dt.date(y + 1, 1, 1) if m == 12 else dt.date(y, m + 1, 1)

    def _month_prefix(self, month_start: dt.date) -> str:
        return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"

    def _is_data_file(self, path: str) -> bool:
        base = path.rsplit("/", 1)[-1]
        if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
            return False
        return any(base.lower().endswith(suf) for suf in self.data_file_suffixes)

    @staticmethod
    def _extract_mtime(info: Dict[str, Any]) -> Optional[dt.datetime]:
        mtime = (
            info.get("mtime") or info.get("LastModified") or info.get("last_modified")
        )
        if isinstance(mtime, (int, float)):
            return dt.datetime.fromtimestamp(mtime, datetime.UTC)
        if isinstance(mtime, str):
            try:
                return pd.to_datetime(mtime, utc=True).to_pydatetime()
            except Exception:
                return None
        if isinstance(mtime, dt.datetime):
            return mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
        return None

    def _is_skipped(self, d: dt.date) -> bool:
        if "partition_date" in self.partition_on:
            canonical_path = f"{self.data_path}partition_date={d.isoformat()}/"
        else:
            canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
        return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)

    def _log_extra(self, **overrides) -> Dict[str, Any]:
        base = {
            "sibi_flux_component": self.logger_extra.get(
                "sibi_flux_component", "warehouse.update_planner"
            ),
            "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
            "dataclass": self.description,
            "action_module_name": "update_plan",
        }
        base.update(overrides)
        return base
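Below is a minimal usage sketch of the UpdatePlanner shown above; it is not part of the wheel contents. It assumes the ManagedResource base class (sibi_flux/core/managed_resource/_managed_resource.py, not expanded in this diff) accepts an fsspec filesystem via an fs= keyword and supplies self.logger and self.debug, and the bucket path and credentials are placeholders.

# Hypothetical usage of UpdatePlanner; fs= handling by ManagedResource is assumed.
import fsspec

from sibi_flux.artifacts.parquet_engine.planner import UpdatePlanner

# Placeholder MinIO-style credentials and bucket; adjust to your environment.
fs = fsspec.filesystem(
    "s3",
    key="minio",
    secret="minio123",
    client_kwargs={"endpoint_url": "http://localhost:9000"},
)

planner = UpdatePlanner(
    "s3://dev-bucket/warehouse/orders/",  # hypothetical dataset path
    description="orders",
    hive_style=True,                      # expects partition_date=YYYY-MM-DD/ folders
    history_days_threshold=30,
    max_age_minutes=1440,
    fs=fs,                                # assumed to be consumed by ManagedResource
)

# Build the plan for a date window and render it with rich (if installed).
planner.generate_plan(start="2025-11-01", end="2025-11-30")
planner.show_update_plan()

# Iterate the dates that need refreshing, most urgent priority first.
for priority, dates in planner.get_tasks_by_priority():
    print(priority, dates)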
sibi_flux/conf/settings.py
@@ -0,0 +1,131 @@
from typing import Optional, Any
from pydantic import SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict


class SibiBaseSettings(BaseSettings):
    """Base settings class with common configuration."""

    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )


class FsSettings(SibiBaseSettings):
    """Common filesystem settings."""

    fs_type: str = "s3"
    fs_path: str = "s3://dev-bucket/warehouse"
    fs_key: str = "minio"
    fs_secret: SecretStr = SecretStr("minio123")
    fs_endpoint: str = "http://localhost:9000"
    fs_token: Optional[SecretStr] = None
    fs_region: str = "us-east-1"

    def to_fsspec_options(self) -> dict[str, Any]:
        """Convert settings to fsspec compatible options dict."""
        if self.fs_type == "s3":
            opts = {
                "key": self.fs_key,
                "secret": self.fs_secret.get_secret_value() if self.fs_secret else None,
                "skip_instance_cache": True,
                "use_listings_cache": False,
                "client_kwargs": {
                    "endpoint_url": self.fs_endpoint,
                    "region_name": self.fs_region,
                },
                "config_kwargs": {
                    "signature_version": "s3v4",
                    "s3": {"addressing_style": "path"},
                },
            }
            if self.fs_token:
                opts["token"] = self.fs_token.get_secret_value()
            return opts
        return {}


class WebDavSettings(SibiBaseSettings):
    """Common WebDAV settings."""

    fs_type: str = "webdav"
    fs_verify_ssl: bool = False
    fs_endpoint: str = "http://localhost:8080"
    fs_key: str = "user"
    fs_secret: SecretStr = SecretStr("pass")
    fs_token: Optional[SecretStr] = None

    # AWS specific config often mixed in WebDAV context in legacy
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    region_name: Optional[str] = None
    session_token: Optional[str] = None
    endpoint_url: Optional[str] = None

    model_config = SettingsConfigDict(env_prefix="WEBDAV_")

    def to_fsspec_options(self) -> dict[str, Any]:
        verify = self.fs_verify_ssl
        opts = {
            "base_url": self.fs_endpoint,
            "username": self.fs_key,
            "password": self.fs_secret.get_secret_value() if self.fs_secret else "",
            "verify": verify,
        }
        if self.fs_token:
            opts["token"] = self.fs_token.get_secret_value()
        return opts


# --- Database Base Settings ---


class DatabaseSettings(SibiBaseSettings):
    """Generic SQL Database settings."""

    db_url: str = "sqlite:///:memory:"


class ClickhouseBaseSettings(SibiBaseSettings):
    """Base settings for ClickHouse connection."""

    host: str = "localhost"
    port: int = 8123
    database: str = "default"
    user: str = "default"
    password: SecretStr = SecretStr("secret")

    def to_legacy_dict(self) -> dict[str, Any]:
        return {
            "host": self.host,
            "port": self.port,
            "dbname": self.database,
            "user": self.user,
            "password": self.password.get_secret_value() if self.password else None,
        }


class RedisBaseSettings(SibiBaseSettings):
    """Base settings for Redis connection."""

    host: str = "localhost"
    port: int = 6379
    db: int = 0
    password: Optional[SecretStr] = None

    def to_legacy_dict(self) -> dict[str, Any]:
        return {
            "host": self.host,
            "port": self.port,
            "db": self.db,
            "password": self.password.get_secret_value() if self.password else None,
        }


class DbPoolSettings(SibiBaseSettings):
    """Base settings for SQLAlchemy connection pooling."""

    db_pool_size: int = 5
    db_max_overflow: int = 10
    db_pool_timeout: int = 30
    db_pool_recycle: int = 1800
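And a short sketch of how the FsSettings block above can feed fsspec; again, this is illustrative and not part of the wheel. Field values are normally read from the environment or a .env file (pydantic-settings maps fs_key to FS_KEY, and so on), the defaults above point at a local MinIO instance, and the listing call is only an example.

# Hypothetical wiring of FsSettings into an fsspec S3 filesystem.
import fsspec

from sibi_flux.conf.settings import FsSettings

# Reads FS_KEY, FS_SECRET, FS_ENDPOINT, ... from the environment / .env,
# falling back to the MinIO-oriented defaults defined above.
settings = FsSettings()

fs = fsspec.filesystem(settings.fs_type, **settings.to_fsspec_options())
print(fs.ls(settings.fs_path))  # e.g. list the configured warehouse bucket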