sibi-flux 2025.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_dst/__init__.py
ADDED
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+import importlib
+import sys as _sys
+import warnings
+
+warnings.warn(
+    "sibi_dst is deprecated; use sibi_flux instead.",
+    FutureWarning,
+    stacklevel=2,
+)
+
+# Canonical package
+_CANON = "sibi_flux"
+
+# Make `import skt` behave like `import sibi_flux`
+_pkg = importlib.import_module(_CANON)
+
+# Populate this module’s namespace with the canonical package attributes
+globals().update(_pkg.__dict__)
+
+# Expose a clean public surface
+__all__ = getattr(_pkg, "__all__", [])
+__version__ = getattr(_pkg, "__version__", "0.0.0")
+
+# Critical: alias module entries so `import skt.df_helper` maps to `sibi_flux.df_helper`
+_sys.modules[__name__] = _pkg
+_sys.modules[__name__ + ".__init__"] = _pkg # defensive (rarely needed)
+
+
+def __getattr__(name: str):
+    # Lazy-load submodules on demand: skt.df_helper, skt.osmnx, etc.
+    try:
+        mod = importlib.import_module(f"{_CANON}.{name}")
+    except ModuleNotFoundError as e:
+        raise AttributeError(f"module '{__name__}' has no attribute '{name}'") from e
+    _sys.modules[f"{__name__}.{name}"] = mod
+    return mod
+
+
+def __dir__():
+    # Combine canonical attrs + subpackages for nicer autocomplete
+    base = set(globals().keys())
+    return sorted(base)
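Taken together, `sibi_dst/__init__.py` is a deprecation shim: importing it emits a `FutureWarning`, copies the `sibi_flux` namespace, and rewrites `sys.modules` so submodule imports resolve against the canonical package. A minimal usage sketch of the intended behaviour (illustrative only, not part of the package):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        import sibi_dst  # emits the FutureWarning and installs the alias

    import sibi_flux

    # The shim replaces its own sys.modules entry, so both names are expected
    # to point at the same module object and expose the same public surface.
    print(sibi_dst is sibi_flux, sibi_dst.__version__ == sibi_flux.__version__)
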
sibi_flux/__init__.py
ADDED
@@ -0,0 +1,49 @@
+from __future__ import annotations
+from importlib.metadata import version as _pkg_version, PackageNotFoundError
+
+try:
+    __version__ = _pkg_version("sibi-flux")
+except PackageNotFoundError: # pragma: no cover
+    __version__ = "0.0.0"
+
+
+# Core Exports
+from sibi_flux.logger import Logger
+from sibi_flux.core import ManagedResource
+from sibi_flux.datacube import Datacube, DatacubeConfig
+from sibi_flux.dataset import Dataset
+from sibi_flux.pipelines import BasePipeline
+
+# Helper Exports
+from sibi_flux.df_helper import DfHelper
+from sibi_flux.df_enricher import AsyncDfEnricher as DfEnricher
+from sibi_flux.df_validator._df_validator import DfValidator
+
+# Artifacts
+from sibi_flux.artifacts import ParquetArtifact, BaseArtifact as Artifact
+from sibi_flux.readers.parquet import ParquetReader
+
+# Utilities (Sub-packages)
+from sibi_flux import dask_cluster
+from sibi_flux.utils import boilerplate, parquet_saver, clickhouse_writer
+
+
+__all__ = [
+    "__version__",
+    "Logger",
+    "ManagedResource",
+    "Datacube",
+    "DatacubeConfig",
+    "Dataset",
+    "BasePipeline",
+    "DfHelper",
+    "DfEnricher",
+    "DfValidator",
+    "ParquetArtifact",
+    "Artifact",
+    "ParquetReader",
+    "dask_cluster",
+    "boilerplate",
+    "parquet_saver",
+    "clickhouse_writer",
+]
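The package root therefore re-exports the main entry points, so downstream code can import them directly; a short illustrative snippet using names from the `__all__` list above:

    import sibi_flux
    from sibi_flux import DfHelper, ParquetArtifact, Logger

    # Everything listed in __all__ is reachable from the top-level namespace.
    print(sibi_flux.__version__)
    print(sorted(sibi_flux.__all__))
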
sibi_flux/artifacts/base.py
ADDED
@@ -0,0 +1,166 @@
+"""
+Artifact Base module.
+
+Defines the base classes for system artifacts.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Dict, Mapping, Optional, Type
+from datetime import date, datetime
+
+import pandas as pd
+import dask.dataframe as dd
+
+# Correct internal import for the base class
+from .parquet import ParquetArtifact
+
+DateLike = str | date | datetime | None
+
+
+def _validate_and_format_date(name: str, value: DateLike) -> Optional[str]:
+    """
+    Normalize date-like input into a canonical string '%Y-%m-%d'.
+
+    - None -> None
+    - str/date/datetime -> parse with pandas.to_datetime, take .date(), return '%Y-%m-%d'
+    - else -> raise TypeError
+    """
+    if value is None:
+        return None
+    if isinstance(value, (str, date, datetime)):
+        try:
+            return pd.to_datetime(value).date().strftime("%Y-%m-%d")
+        except Exception as e:
+            raise ValueError(f"{name} must be a valid date; got {value!r}") from e
+    raise TypeError(f"{name} must be str, date, datetime, or None; got {type(value)}")
+
+
+class BaseArtifact(ParquetArtifact):
+    """
+    Generic base class for Parquet-based artifacts with enhanced date handling and async support.
+
+    Features:
+    - Automatic date normalization to '%Y-%m-%d' strings.
+    - Async loading support via `aload`.
+    - Lifecycle hooks (before/after load).
+    - Params serialization.
+
+    The `config` class attribute can be used to set default configuration values.
+    """
+
+    config: Mapping[str, Any] = {}
+
+    # Typed attributes for IDE support (populated in __init__)
+    parquet_start_date: Optional[str]
+    parquet_end_date: Optional[str]
+    provider_class: Optional[Type[Any]]
+    class_params: Dict[str, Any]
+    df: Optional[pd.DataFrame | dd.DataFrame] = None
+
+    def __init__(
+        self,
+        **kwargs: Any,
+    ) -> None:
+        # 1. Merge defaults from class config with runtime kwargs
+        merged = {**self.config, **kwargs}
+
+        # 2. Extract and normalize date fields *before* passing to super
+        # This ensures ParquetArtifact receives clean strings if it validates them.
+        p_start = merged.get("parquet_start_date")
+        p_end = merged.get("parquet_end_date")
+
+        self.parquet_start_date = _validate_and_format_date(
+            "parquet_start_date", p_start
+        )
+        self.parquet_end_date = _validate_and_format_date("parquet_end_date", p_end)
+
+        # Update merged dict with normalized values to ensure consistency
+        if self.parquet_start_date:
+            merged["parquet_start_date"] = self.parquet_start_date
+        if self.parquet_end_date:
+            merged["parquet_end_date"] = self.parquet_end_date
+
+        # 3. Initialize parent (ParquetArtifact handles config validation and resource setup)
+        super().__init__(**merged)
+
+        # 4. Store additional wrapper config
+        self.provider_class = merged.get("provider_class", None)
+
+        # Default class_params from instance attributes if not provided
+        self.class_params = merged.get("class_params", None) or {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": getattr(self, "verbose", False),
+        }
+
+        # 5. Validation Logic
+        if self.parquet_start_date and self.parquet_end_date:
+            if self.parquet_start_date > self.parquet_end_date:
+                raise ValueError(
+                    f"parquet_start_date {self.parquet_start_date} "
+                    f"cannot be after parquet_end_date {self.parquet_end_date}"
+                )

+    # -------- Lifecycle Hooks (No-op defaults) --------
+
+    def before_load(self, **kwargs: Any) -> None:
+        """Hook called synchronously before loading data."""
+        return None
+
+    def after_load(self, **kwargs: Any) -> None:
+        """Hook called synchronously after loading data."""
+        return None
+
+    async def abefore_load(self, **kwargs: Any) -> None:
+        """Hook called asynchronously before loading data."""
+        return None
+
+    async def aafter_load(self, **kwargs: Any) -> None:
+        """Hook called asynchronously after loading data."""
+        return None
+
+    # -------- Public API --------
+
+    def load(self, **kwargs: Any) -> pd.DataFrame | dd.DataFrame:
+        """
+        Synchronous load with hooks.
+        Delegates actual loading to super().load().
+        """
+        self.before_load(**kwargs)
+        self.df = super().load(**kwargs)
+        self.after_load(**kwargs)
+        return self.df
+
+    async def aload(self, **kwargs: Any) -> pd.DataFrame | dd.DataFrame:
+        """
+        Asynchronous load with hooks.
+        Runs the blocking super().load() in a thread.
+        """
+        await self.abefore_load(**kwargs)
+        # Run blocking IO in a separate thread to avoid blocking the event loop
+        df = await asyncio.to_thread(super().load, **kwargs)
+        self.df = df
+        await self.aafter_load(**kwargs)
+        return self.df
+
+    def has_date_window(self) -> bool:
+        """Check if a date window is defined."""
+        return bool(self.parquet_start_date or self.parquet_end_date)
+
+    def date_window(self) -> tuple[Optional[str], Optional[str]]:
+        """Return the (start, end) date window tuple."""
+        return self.parquet_start_date, self.parquet_end_date
+
+    def to_params(self) -> Dict[str, Any]:
+        """Serialize configuration parameters."""
+        return {
+            "parquet_start_date": self.parquet_start_date,
+            "parquet_end_date": self.parquet_end_date,
+            "provider_class": self.provider_class,
+            "class_params": dict(self.class_params),
+            # Add useful metadata from parent
+            "parquet_storage_path": getattr(self, "_storage_path", None),
+        }
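`BaseArtifact` layers date normalization, no-op lifecycle hooks, and an async `aload` wrapper on top of `ParquetArtifact`. A rough sketch of how a subclass might use those hooks; the storage path and filename are placeholders, and it assumes `ManagedResource` supplies default `fs`/`logger` when none are passed:

    import asyncio

    from sibi_flux.artifacts import BaseArtifact


    class SalesArtifact(BaseArtifact):
        # Class-level defaults; merged with runtime kwargs in __init__.
        config = {
            "parquet_storage_path": "s3://my-bucket/sales/",  # hypothetical path
            "parquet_filename": "sales.parquet",              # hypothetical name
        }

        def before_load(self, **kwargs):
            self.logger.info(f"Loading window {self.date_window()}")


    async def main():
        art = SalesArtifact(
            parquet_start_date="2025-01-01",  # normalized to 'YYYY-MM-DD' strings
            parquet_end_date="2025-01-31",
        )
        return await art.aload()  # super().load() runs in a worker thread

    # asyncio.run(main())
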
sibi_flux/artifacts/parquet.py
ADDED
@@ -0,0 +1,360 @@
+"""
+Parquet Artifact module.
+
+Defines the ParquetArtifact class for orchestrating Parquet dataset generation.
+"""
+
+from __future__ import annotations
+
+import datetime as dt
+import threading
+from functools import cached_property
+from typing import Any, Dict, Type, TypeVar, Optional, Set, List
+
+from pydantic import BaseModel, Field, ConfigDict, ValidationError
+
+from sibi_flux.core import ManagedResource
+from sibi_flux.utils.date_utils import DateUtils
+
+from sibi_flux.artifacts.parquet_engine import (
+    Executor as DataWrapper,
+    UpdatePlanner,
+    MissingManifestManager,
+)
+
+T = TypeVar("T")
+
+
+class ArtifactConfig(BaseModel):
+    """
+    Strict configuration schema for ParquetArtifact.
+    Validates arguments immediately upon initialization.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
+
+    parquet_storage_path: str
+    parquet_filename: str
+
+    provider_class: Optional[Type] = None
+    partition_on: List[str] = Field(default_factory=lambda: ["partition_date"])
+    hive_style: bool = True
+    overwrite: bool = False
+
+    show_progress: bool = False
+    timeout: float = 30.0
+    max_threads: int = 3
+
+    max_retries: int = 3
+    backoff_base: float = 2.0
+
+    load_params: Dict[str, Any] = Field(default_factory=dict)
+
+
+class ParquetArtifact(ManagedResource):
+    """
+    Orchestrates the generation of a single date-partitioned Parquet dataset.
+
+    Features:
+    - Validated configuration via Pydantic.
+    - Thread-safe concurrency control (process-local).
+    - Lazy initialization of resources (manifests, planners).
+    - Safe resource cleanup.
+    """
+
+    _global_lock = threading.RLock()
+    _active_runs: Set[tuple[str, str]] = set()
+    logger_extra = {"sibi_flux_component": __name__}
+
+    def __init__(self, **kwargs: Any):
+        """
+        Initializes the ParquetArtifact.
+
+        Args:
+            **kwargs: Configuration parameters validated against ArtifactConfig.
+        """
+        try:
+            self.config = ArtifactConfig(**kwargs)
+        except ValidationError as e:
+            raise ValueError(f"Invalid ParquetArtifact configuration: {e}") from e
+
+        self.all_kwargs = self.config.model_dump()
+
+        super().__init__(**self.all_kwargs)
+
+        self._lock = threading.RLock()
+
+        self._storage_path = self.config.parquet_storage_path
+        self._provider_class = self.config.provider_class
+
+        self.logger_extra.update(
+            {
+                "artifact_storage_path": self._storage_path,
+            }
+        )
+
+    def _invalidate_cached(self, *names: str) -> None:
+        """
+        Invalidate cached properties by name.
+
+        SAFEGUARD: Tries to call .save() on objects (like manifests)
+        before removing them from memory to prevent state loss.
+        """
+        for name in names:
+            if name in self.__dict__:
+                obj = self.__dict__[name]
+
+                if hasattr(obj, "save") and callable(obj.save):
+                    try:
+                        self.logger.debug(
+                            f"Saving state for '{name}' before cache invalidation...",
+                            extra=self.logger_extra,
+                        )
+                        obj.save()
+                    except Exception as e:
+                        self.logger.warning(
+                            f"Failed to save '{name}' state during invalidation: {e}",
+                            extra=self.logger_extra,
+                        )
+
+                del self.__dict__[name]
+
+    def _build_manifest_path(self) -> str:
+        """Constructs the path for the missing data manifest."""
+        base = self._storage_path.rstrip("/") + "/"
+        return f"{base}_manifests/missing.parquet"
+
+    @cached_property
+    def mmanifest(self) -> MissingManifestManager:
+        self.logger.debug(
+            "Initializing MissingManifestManager...", extra=self.logger_extra
+        )
+        manifest_path = self._build_manifest_path()
+
+        manifest_dir = (
+            manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+        )
+        self.ensure_directory_exists(manifest_dir)
+
+        mgr = MissingManifestManager(
+            fs=self.fs,
+            manifest_path=manifest_path,
+            clear_existing=self.config.overwrite,
+            debug=self.debug,
+            logger=self.logger,
+            overwrite=self.config.overwrite,
+        )
+
+        if hasattr(mgr, "_safe_exists") and not mgr._safe_exists(mgr.manifest_path):
+            self.logger.debug(
+                f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra
+            )
+            mgr.save()
+
+        return mgr
+
+    @cached_property
+    def update_planner(self) -> UpdatePlanner:
+        self.logger.debug("Initializing UpdatePlanner...", extra=self.logger_extra)
+        skipped_files: List[Any] = self.mmanifest.load_existing() or []
+
+        cfg = {
+            **self.all_kwargs,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "description": getattr(self._provider_class, "__name__", "DataWrapper"),
+            "skipped": list(skipped_files),
+            "mmanifest": self.mmanifest,
+            "partition_on": self.config.partition_on,
+            "hive_style": self.config.hive_style,
+        }
+        return UpdatePlanner(**cfg)
+
+    @cached_property
+    def data_wrapper(self) -> DataWrapper:
+        self.logger.debug("Initializing DataWrapper...", extra=self.logger_extra)
+
+        _ = self.update_planner.plan
+
+        class_params = {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
+        }
+
+        cfg = {
+            "data_path": self._storage_path,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "verbose": self.verbose,
+            "provider_class": self._provider_class,
+            "class_params": class_params,
+            "load_params": self.config.load_params,
+            "mmanifest": self.mmanifest,
+            "update_planner": self.update_planner,
+            "date_field": self.all_kwargs.get("date_field"),
+            "show_progress": self.config.show_progress,
+            "timeout": self.config.timeout,
+            "max_threads": self.config.max_threads,
+        }
+        return DataWrapper(**cfg)
+
+    def load(self, **kwargs: Any):
+        """
+        Directly load data using the configured provider_class.
+        Bypasses planning/manifest process.
+        """
+        if not self._provider_class:
+            raise ValueError("provider_class is not configured.")
+
+        params = {
+            "backend": "parquet",
+            "fs": self.fs,
+            "logger": self.logger,
+            "debug": self.debug,
+            "parquet_storage_path": self._storage_path,
+            "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+            "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            "partition_on": self.config.partition_on,
+            **(self.all_kwargs.get("class_params") or {}),
+        }
+
+        with self._provider_class(**params) as instance:
+            return instance.load(**kwargs)
+
+    def generate_parquet(self, **kwargs: Any) -> None:
+        """
+        Generate or update the Parquet dataset according to the plan.
+        - Merges runtime kwargs.
+        - Invalidates dependent cached properties safely.
+        - Guards against duplicate concurrent runs.
+        """
+        self.all_kwargs.update(kwargs)
+
+        self._invalidate_cached("update_planner", "data_wrapper")
+        if "overwrite" in kwargs and kwargs["overwrite"]:
+            self._invalidate_cached("mmanifest")
+
+        key = self._storage_path
+        with ParquetArtifact._global_lock:
+            if key in ParquetArtifact._active_runs:
+                self.logger.info(
+                    f"Run active for {key}; skipping.", extra=self.logger_extra
+                )
+                return
+            ParquetArtifact._active_runs.add(key)
+
+        try:
+            self.ensure_directory_exists(self._storage_path)
+
+            plan = self.update_planner.plan
+            if plan is None or (hasattr(plan, "empty") and plan.empty):
+                self.logger.info("No updates needed.", extra=self.logger_extra)
+                return
+
+            if self.config.show_progress and not getattr(
+                self.update_planner, "_printed_this_run", False
+            ):
+                try:
+                    self.update_planner.show_update_plan()
+                    setattr(self.update_planner, "_printed_this_run", True)
+                except Exception:
+                    pass
+
+            dw_retry_kwargs = {
+                "max_retries": self.config.max_retries,
+                "backoff_base": self.config.backoff_base,
+            }
+            if "backoff_jitter" in self.all_kwargs:
+                dw_retry_kwargs["backoff_jitter"] = self.all_kwargs["backoff_jitter"]
+            if "backoff_max" in self.all_kwargs:
+                dw_retry_kwargs["backoff_max"] = self.all_kwargs["backoff_max"]
+
+            with self._lock:
+                dw = self.data_wrapper
+                if hasattr(dw, "process"):
+                    dw.process(**dw_retry_kwargs)
+
+                if self.config.show_progress and hasattr(
+                    dw, "show_benchmark_summary"
+                ):
+                    dw.show_benchmark_summary()
+
+        finally:
+            with ParquetArtifact._global_lock:
+                ParquetArtifact._active_runs.discard(key)
+
+    def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+        """
+        High-level entry point to update Parquet for a standard or custom period.
+        """
+        final_kwargs = {**self.all_kwargs, **kwargs}
+        period_params: Dict[str, dt.date] = {}
+
+        if period == "itd":
+            start_str = final_kwargs.get("history_begins_on")
+            if not start_str:
+                raise ValueError("Period 'itd' requires 'history_begins_on'.")
+            period_params = {
+                "parquet_start_date": dt.datetime.strptime(
+                    start_str, "%Y-%m-%d"
+                ).date(),
+                "parquet_end_date": dt.date.today(),
+            }
+        elif period == "ytd":
+            period_params = {
+                "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                "parquet_end_date": dt.date.today(),
+            }
+        elif period == "custom":
+            p = final_kwargs
+            s = p.get("start_on") or p.get("start_date") or p.get("start")
+            e = p.get("end_on") or p.get("end_date") or p.get("end")
+
+            if not s or not e:
+                raise ValueError("Period 'custom' requires 'start_on' and 'end_on'.")
+
+            period_params = {
+                "parquet_start_date": dt.datetime.strptime(str(s), "%Y-%m-%d").date(),
+                "parquet_end_date": dt.datetime.strptime(str(e), "%Y-%m-%d").date(),
+            }
+        else:
+            try:
+                s, e = DateUtils.parse_period(period=period)
+                period_params = {"parquet_start_date": s, "parquet_end_date": e}
+            except Exception as e:
+                raise ValueError(f"Failed to parse period '{period}': {e}") from e
+
+        final_kwargs.update(period_params)
+        self.generate_parquet(**final_kwargs)
+
+    def ensure_directory_exists(self, path: str) -> None:
+        """Robust directory creation handling various fs backends."""
+        with self._lock:
+            if not self.fs.exists(path):
+                try:
+                    self.fs.makedirs(path, exist_ok=True)
+                except TypeError:
+                    # Fallback for backends not supporting exist_ok
+                    try:
+                        self.fs.makedirs(path)
+                    except FileExistsError:
+                        pass
+
+    def _cleanup(self) -> None:
+        """
+        Cleanup resources.
+        Calls _invalidate_cached to ensure manifests are saved before closing.
+        """
+        try:
+            self._invalidate_cached("mmanifest")
+
+            if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.data_wrapper.close()
+        except Exception as e:
+            self.logger.warning(
+                f"Error during ParquetArtifact cleanup: {e}", extra=self.logger_extra
+            )