sibi-flux 2025.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_dst/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from __future__ import annotations
+
+ import importlib
+ import sys as _sys
+ import warnings
+
+ warnings.warn(
+     "sibi_dst is deprecated; use sibi_flux instead.",
+     FutureWarning,
+     stacklevel=2,
+ )
+
+ # Canonical package
+ _CANON = "sibi_flux"
+
+ # Make `import sibi_dst` behave like `import sibi_flux`
+ _pkg = importlib.import_module(_CANON)
+
+ # Populate this module’s namespace with the canonical package attributes
+ globals().update(_pkg.__dict__)
+
+ # Expose a clean public surface
+ __all__ = getattr(_pkg, "__all__", [])
+ __version__ = getattr(_pkg, "__version__", "0.0.0")
+
+ # Critical: alias module entries so `import sibi_dst.df_helper` maps to `sibi_flux.df_helper`
+ _sys.modules[__name__] = _pkg
+ _sys.modules[__name__ + ".__init__"] = _pkg  # defensive (rarely needed)
+
+
+ def __getattr__(name: str):
+     # Lazy-load submodules on demand: sibi_dst.df_helper, sibi_dst.osmnx_helper, etc.
+     try:
+         mod = importlib.import_module(f"{_CANON}.{name}")
+     except ModuleNotFoundError as e:
+         raise AttributeError(f"module '{__name__}' has no attribute '{name}'") from e
+     _sys.modules[f"{__name__}.{name}"] = mod
+     return mod
+
+
+ def __dir__():
+     # Combine canonical attrs + subpackages for nicer autocomplete
+     base = set(globals().keys())
+     return sorted(base)
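
Taken together, the shim above means that importing the legacy name emits a deprecation warning and then resolves to the canonical package via the sys.modules aliasing. A minimal sketch of the expected behavior (assuming both import names resolve on the current path; this example is not part of the package):

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import sibi_dst  # triggers the FutureWarning defined above

assert any(issubclass(w.category, FutureWarning) for w in caught)

import sibi_flux
# Because sibi_dst replaces its own sys.modules entry with the canonical package,
# both names are expected to refer to the same module object.
assert sibi_dst is sibi_flux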
sibi_flux/__init__.py ADDED
@@ -0,0 +1,49 @@
+ from __future__ import annotations
+ from importlib.metadata import version as _pkg_version, PackageNotFoundError
+
+ try:
+     __version__ = _pkg_version("sibi-flux")
+ except PackageNotFoundError:  # pragma: no cover
+     __version__ = "0.0.0"
+
+
+ # Core Exports
+ from sibi_flux.logger import Logger
+ from sibi_flux.core import ManagedResource
+ from sibi_flux.datacube import Datacube, DatacubeConfig
+ from sibi_flux.dataset import Dataset
+ from sibi_flux.pipelines import BasePipeline
+
+ # Helper Exports
+ from sibi_flux.df_helper import DfHelper
+ from sibi_flux.df_enricher import AsyncDfEnricher as DfEnricher
+ from sibi_flux.df_validator._df_validator import DfValidator
+
+ # Artifacts
+ from sibi_flux.artifacts import ParquetArtifact, BaseArtifact as Artifact
+ from sibi_flux.readers.parquet import ParquetReader
+
+ # Utilities (Sub-packages)
+ from sibi_flux import dask_cluster
+ from sibi_flux.utils import boilerplate, parquet_saver, clickhouse_writer
+
+
+ __all__ = [
+     "__version__",
+     "Logger",
+     "ManagedResource",
+     "Datacube",
+     "DatacubeConfig",
+     "Dataset",
+     "BasePipeline",
+     "DfHelper",
+     "DfEnricher",
+     "DfValidator",
+     "ParquetArtifact",
+     "Artifact",
+     "ParquetReader",
+     "dask_cluster",
+     "boilerplate",
+     "parquet_saver",
+     "clickhouse_writer",
+ ]
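
The root `__init__` flattens the most commonly used classes into the package namespace and defines two aliases: `AsyncDfEnricher` is exported as `DfEnricher`, and `BaseArtifact` as `Artifact`. A brief illustration of the equivalent import paths (names as declared above; constructor signatures are not shown in this diff):

from sibi_flux import DfEnricher, Artifact, __version__
from sibi_flux.df_enricher import AsyncDfEnricher
from sibi_flux.artifacts import BaseArtifact

# The root-level names are aliases of the originals, per the imports above.
assert DfEnricher is AsyncDfEnricher
assert Artifact is BaseArtifact
print(__version__)  # from package metadata, or "0.0.0" when not installed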
sibi_flux/artifacts/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .base import BaseArtifact
+ from .parquet import ParquetArtifact
+
+ __all__ = [
+     "BaseArtifact",
+     "ParquetArtifact",
+ ]
sibi_flux/artifacts/base.py ADDED
@@ -0,0 +1,166 @@
+ """
+ Artifact Base module.
+
+ Defines the base classes for system artifacts.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Any, Dict, Mapping, Optional, Type
+ from datetime import date, datetime
+
+ import pandas as pd
+ import dask.dataframe as dd
+
+ # Correct internal import for the base class
+ from .parquet import ParquetArtifact
+
+ DateLike = str | date | datetime | None
+
+
+ def _validate_and_format_date(name: str, value: DateLike) -> Optional[str]:
+     """
+     Normalize date-like input into a canonical string '%Y-%m-%d'.
+
+     - None -> None
+     - str/date/datetime -> parse with pandas.to_datetime, take .date(), return '%Y-%m-%d'
+     - else -> raise TypeError
+     """
+     if value is None:
+         return None
+     if isinstance(value, (str, date, datetime)):
+         try:
+             return pd.to_datetime(value).date().strftime("%Y-%m-%d")
+         except Exception as e:
+             raise ValueError(f"{name} must be a valid date; got {value!r}") from e
+     raise TypeError(f"{name} must be str, date, datetime, or None; got {type(value)}")
+
+
+ class BaseArtifact(ParquetArtifact):
+     """
+     Generic base class for Parquet-based artifacts with enhanced date handling and async support.
+
+     Features:
+     - Automatic date normalization to '%Y-%m-%d' strings.
+     - Async loading support via `aload`.
+     - Lifecycle hooks (before/after load).
+     - Params serialization.
+
+     The `config` class attribute can be used to set default configuration values.
+     """
+
+     config: Mapping[str, Any] = {}
+
+     # Typed attributes for IDE support (populated in __init__)
+     parquet_start_date: Optional[str]
+     parquet_end_date: Optional[str]
+     provider_class: Optional[Type[Any]]
+     class_params: Dict[str, Any]
+     df: Optional[pd.DataFrame | dd.DataFrame] = None
+
+     def __init__(
+         self,
+         **kwargs: Any,
+     ) -> None:
+         # 1. Merge defaults from class config with runtime kwargs
+         merged = {**self.config, **kwargs}
+
+         # 2. Extract and normalize date fields *before* passing to super
+         # This ensures ParquetArtifact receives clean strings if it validates them.
+         p_start = merged.get("parquet_start_date")
+         p_end = merged.get("parquet_end_date")
+
+         self.parquet_start_date = _validate_and_format_date(
+             "parquet_start_date", p_start
+         )
+         self.parquet_end_date = _validate_and_format_date("parquet_end_date", p_end)
+
+         # Update merged dict with normalized values to ensure consistency
+         if self.parquet_start_date:
+             merged["parquet_start_date"] = self.parquet_start_date
+         if self.parquet_end_date:
+             merged["parquet_end_date"] = self.parquet_end_date
+
+         # 3. Initialize parent (ParquetArtifact handles config validation and resource setup)
+         super().__init__(**merged)
+
+         # 4. Store additional wrapper config
+         self.provider_class = merged.get("provider_class", None)
+
+         # Default class_params from instance attributes if not provided
+         self.class_params = merged.get("class_params", None) or {
+             "debug": self.debug,
+             "logger": self.logger,
+             "fs": self.fs,
+             "verbose": getattr(self, "verbose", False),
+         }
+
+         # 5. Validation Logic
+         if self.parquet_start_date and self.parquet_end_date:
+             if self.parquet_start_date > self.parquet_end_date:
+                 raise ValueError(
+                     f"parquet_start_date {self.parquet_start_date} "
+                     f"cannot be after parquet_end_date {self.parquet_end_date}"
+                 )
+
+     # -------- Lifecycle Hooks (No-op defaults) --------
+
+     def before_load(self, **kwargs: Any) -> None:
+         """Hook called synchronously before loading data."""
+         return None
+
+     def after_load(self, **kwargs: Any) -> None:
+         """Hook called synchronously after loading data."""
+         return None
+
+     async def abefore_load(self, **kwargs: Any) -> None:
+         """Hook called asynchronously before loading data."""
+         return None
+
+     async def aafter_load(self, **kwargs: Any) -> None:
+         """Hook called asynchronously after loading data."""
+         return None
+
+     # -------- Public API --------
+
+     def load(self, **kwargs: Any) -> pd.DataFrame | dd.DataFrame:
+         """
+         Synchronous load with hooks.
+         Delegates actual loading to super().load().
+         """
+         self.before_load(**kwargs)
+         self.df = super().load(**kwargs)
+         self.after_load(**kwargs)
+         return self.df
+
+     async def aload(self, **kwargs: Any) -> pd.DataFrame | dd.DataFrame:
+         """
+         Asynchronous load with hooks.
+         Runs the blocking super().load() in a thread.
+         """
+         await self.abefore_load(**kwargs)
+         # Run blocking IO in a separate thread to avoid blocking the event loop
+         df = await asyncio.to_thread(super().load, **kwargs)
+         self.df = df
+         await self.aafter_load(**kwargs)
+         return self.df
+
+     def has_date_window(self) -> bool:
+         """Check if a date window is defined."""
+         return bool(self.parquet_start_date or self.parquet_end_date)
+
+     def date_window(self) -> tuple[Optional[str], Optional[str]]:
+         """Return the (start, end) date window tuple."""
+         return self.parquet_start_date, self.parquet_end_date
+
+     def to_params(self) -> Dict[str, Any]:
+         """Serialize configuration parameters."""
+         return {
+             "parquet_start_date": self.parquet_start_date,
+             "parquet_end_date": self.parquet_end_date,
+             "provider_class": self.provider_class,
+             "class_params": dict(self.class_params),
+             # Add useful metadata from parent
+             "parquet_storage_path": getattr(self, "_storage_path", None),
+         }
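
In practice, BaseArtifact is meant to be subclassed: defaults go in the class-level `config` mapping, start/end dates are normalized to 'YYYY-MM-DD' strings, and the lifecycle hooks wrap the inherited load. A minimal sketch under stated assumptions (the storage path and filename are hypothetical values; `self.logger` comes from ManagedResource, whose signature is outside this section):

import datetime as dt
from sibi_flux.artifacts import BaseArtifact

class SalesArtifact(BaseArtifact):
    # Class-level defaults merged with runtime kwargs in __init__ (hypothetical values).
    config = {
        "parquet_storage_path": "s3://example-bucket/sales/",
        "parquet_filename": "sales.parquet",
    }

    def before_load(self, **kwargs):
        self.logger.info("about to load sales data")

    def after_load(self, **kwargs):
        self.logger.info("sales data loaded")

# Dates may be passed as str, date, or datetime; they are normalized to 'YYYY-MM-DD'.
artifact = SalesArtifact(
    parquet_start_date=dt.date(2025, 1, 1),
    parquet_end_date="2025-06-30",
)
assert artifact.date_window() == ("2025-01-01", "2025-06-30")

# Loading delegates to ParquetArtifact.load() and wraps it with the hooks:
# df = artifact.load()             # synchronous
# df = await artifact.aload()      # asynchronous; runs the blocking load in a thread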
sibi_flux/artifacts/parquet.py ADDED
@@ -0,0 +1,360 @@
+ """
+ Parquet Artifact module.
+
+ Defines the ParquetArtifact class for orchestrating Parquet dataset generation.
+ """
+
+ from __future__ import annotations
+
+ import datetime as dt
+ import threading
+ from functools import cached_property
+ from typing import Any, Dict, Type, TypeVar, Optional, Set, List
+
+ from pydantic import BaseModel, Field, ConfigDict, ValidationError
+
+ from sibi_flux.core import ManagedResource
+ from sibi_flux.utils.date_utils import DateUtils
+
+ from sibi_flux.artifacts.parquet_engine import (
+     Executor as DataWrapper,
+     UpdatePlanner,
+     MissingManifestManager,
+ )
+
+ T = TypeVar("T")
+
+
+ class ArtifactConfig(BaseModel):
+     """
+     Strict configuration schema for ParquetArtifact.
+     Validates arguments immediately upon initialization.
+     """
+
+     model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
+
+     parquet_storage_path: str
+     parquet_filename: str
+
+     provider_class: Optional[Type] = None
+     partition_on: List[str] = Field(default_factory=lambda: ["partition_date"])
+     hive_style: bool = True
+     overwrite: bool = False
+
+     show_progress: bool = False
+     timeout: float = 30.0
+     max_threads: int = 3
+
+     max_retries: int = 3
+     backoff_base: float = 2.0
+
+     load_params: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class ParquetArtifact(ManagedResource):
+     """
+     Orchestrates the generation of a single date-partitioned Parquet dataset.
+
+     Features:
+     - Validated configuration via Pydantic.
+     - Thread-safe concurrency control (process-local).
+     - Lazy initialization of resources (manifests, planners).
+     - Safe resource cleanup.
+     """
+
+     _global_lock = threading.RLock()
+     _active_runs: Set[tuple[str, str]] = set()
+     logger_extra = {"sibi_flux_component": __name__}
+
+     def __init__(self, **kwargs: Any):
+         """
+         Initializes the ParquetArtifact.
+
+         Args:
+             **kwargs: Configuration parameters validated against ArtifactConfig.
+         """
+         try:
+             self.config = ArtifactConfig(**kwargs)
+         except ValidationError as e:
+             raise ValueError(f"Invalid ParquetArtifact configuration: {e}") from e
+
+         self.all_kwargs = self.config.model_dump()
+
+         super().__init__(**self.all_kwargs)
+
+         self._lock = threading.RLock()
+
+         self._storage_path = self.config.parquet_storage_path
+         self._provider_class = self.config.provider_class
+
+         self.logger_extra.update(
+             {
+                 "artifact_storage_path": self._storage_path,
+             }
+         )
+
+     def _invalidate_cached(self, *names: str) -> None:
+         """
+         Invalidate cached properties by name.
+
+         SAFEGUARD: Tries to call .save() on objects (like manifests)
+         before removing them from memory to prevent state loss.
+         """
+         for name in names:
+             if name in self.__dict__:
+                 obj = self.__dict__[name]
+
+                 if hasattr(obj, "save") and callable(obj.save):
+                     try:
+                         self.logger.debug(
+                             f"Saving state for '{name}' before cache invalidation...",
+                             extra=self.logger_extra,
+                         )
+                         obj.save()
+                     except Exception as e:
+                         self.logger.warning(
+                             f"Failed to save '{name}' state during invalidation: {e}",
+                             extra=self.logger_extra,
+                         )
+
+                 del self.__dict__[name]
+
+     def _build_manifest_path(self) -> str:
+         """Constructs the path for the missing data manifest."""
+         base = self._storage_path.rstrip("/") + "/"
+         return f"{base}_manifests/missing.parquet"
+
+     @cached_property
+     def mmanifest(self) -> MissingManifestManager:
+         self.logger.debug(
+             "Initializing MissingManifestManager...", extra=self.logger_extra
+         )
+         manifest_path = self._build_manifest_path()
+
+         manifest_dir = (
+             manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+         )
+         self.ensure_directory_exists(manifest_dir)
+
+         mgr = MissingManifestManager(
+             fs=self.fs,
+             manifest_path=manifest_path,
+             clear_existing=self.config.overwrite,
+             debug=self.debug,
+             logger=self.logger,
+             overwrite=self.config.overwrite,
+         )
+
+         if hasattr(mgr, "_safe_exists") and not mgr._safe_exists(mgr.manifest_path):
+             self.logger.debug(
+                 f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra
+             )
+             mgr.save()
+
+         return mgr
+
+     @cached_property
+     def update_planner(self) -> UpdatePlanner:
+         self.logger.debug("Initializing UpdatePlanner...", extra=self.logger_extra)
+         skipped_files: List[Any] = self.mmanifest.load_existing() or []
+
+         cfg = {
+             **self.all_kwargs,
+             "fs": self.fs,
+             "debug": self.debug,
+             "logger": self.logger,
+             "description": getattr(self._provider_class, "__name__", "DataWrapper"),
+             "skipped": list(skipped_files),
+             "mmanifest": self.mmanifest,
+             "partition_on": self.config.partition_on,
+             "hive_style": self.config.hive_style,
+         }
+         return UpdatePlanner(**cfg)
+
+     @cached_property
+     def data_wrapper(self) -> DataWrapper:
+         self.logger.debug("Initializing DataWrapper...", extra=self.logger_extra)
+
+         _ = self.update_planner.plan
+
+         class_params = {
+             "debug": self.debug,
+             "logger": self.logger,
+             "fs": self.fs,
+             "verbose": self.verbose,
+         }
+
+         cfg = {
+             "data_path": self._storage_path,
+             "fs": self.fs,
+             "debug": self.debug,
+             "logger": self.logger,
+             "verbose": self.verbose,
+             "provider_class": self._provider_class,
+             "class_params": class_params,
+             "load_params": self.config.load_params,
+             "mmanifest": self.mmanifest,
+             "update_planner": self.update_planner,
+             "date_field": self.all_kwargs.get("date_field"),
+             "show_progress": self.config.show_progress,
+             "timeout": self.config.timeout,
+             "max_threads": self.config.max_threads,
+         }
+         return DataWrapper(**cfg)
+
+     def load(self, **kwargs: Any):
+         """
+         Directly load data using the configured provider_class.
+         Bypasses planning/manifest process.
+         """
+         if not self._provider_class:
+             raise ValueError("provider_class is not configured.")
+
+         params = {
+             "backend": "parquet",
+             "fs": self.fs,
+             "logger": self.logger,
+             "debug": self.debug,
+             "parquet_storage_path": self._storage_path,
+             "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+             "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+             "partition_on": self.config.partition_on,
+             **(self.all_kwargs.get("class_params") or {}),
+         }
+
+         with self._provider_class(**params) as instance:
+             return instance.load(**kwargs)
+
+     def generate_parquet(self, **kwargs: Any) -> None:
+         """
+         Generate or update the Parquet dataset according to the plan.
+         - Merges runtime kwargs.
+         - Invalidates dependent cached properties safely.
+         - Guards against duplicate concurrent runs.
+         """
+         self.all_kwargs.update(kwargs)
+
+         self._invalidate_cached("update_planner", "data_wrapper")
+         if "overwrite" in kwargs and kwargs["overwrite"]:
+             self._invalidate_cached("mmanifest")
+
+         key = self._storage_path
+         with ParquetArtifact._global_lock:
+             if key in ParquetArtifact._active_runs:
+                 self.logger.info(
+                     f"Run active for {key}; skipping.", extra=self.logger_extra
+                 )
+                 return
+             ParquetArtifact._active_runs.add(key)
+
+         try:
+             self.ensure_directory_exists(self._storage_path)
+
+             plan = self.update_planner.plan
+             if plan is None or (hasattr(plan, "empty") and plan.empty):
+                 self.logger.info("No updates needed.", extra=self.logger_extra)
+                 return
+
+             if self.config.show_progress and not getattr(
+                 self.update_planner, "_printed_this_run", False
+             ):
+                 try:
+                     self.update_planner.show_update_plan()
+                     setattr(self.update_planner, "_printed_this_run", True)
+                 except Exception:
+                     pass
+
+             dw_retry_kwargs = {
+                 "max_retries": self.config.max_retries,
+                 "backoff_base": self.config.backoff_base,
+             }
+             if "backoff_jitter" in self.all_kwargs:
+                 dw_retry_kwargs["backoff_jitter"] = self.all_kwargs["backoff_jitter"]
+             if "backoff_max" in self.all_kwargs:
+                 dw_retry_kwargs["backoff_max"] = self.all_kwargs["backoff_max"]
+
+             with self._lock:
+                 dw = self.data_wrapper
+                 if hasattr(dw, "process"):
+                     dw.process(**dw_retry_kwargs)
+
+                 if self.config.show_progress and hasattr(
+                     dw, "show_benchmark_summary"
+                 ):
+                     dw.show_benchmark_summary()
+
+         finally:
+             with ParquetArtifact._global_lock:
+                 ParquetArtifact._active_runs.discard(key)
+
+     def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+         """
+         High-level entry point to update Parquet for a standard or custom period.
+         """
+         final_kwargs = {**self.all_kwargs, **kwargs}
+         period_params: Dict[str, dt.date] = {}
+
+         if period == "itd":
+             start_str = final_kwargs.get("history_begins_on")
+             if not start_str:
+                 raise ValueError("Period 'itd' requires 'history_begins_on'.")
+             period_params = {
+                 "parquet_start_date": dt.datetime.strptime(
+                     start_str, "%Y-%m-%d"
+                 ).date(),
+                 "parquet_end_date": dt.date.today(),
+             }
+         elif period == "ytd":
+             period_params = {
+                 "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                 "parquet_end_date": dt.date.today(),
+             }
+         elif period == "custom":
+             p = final_kwargs
+             s = p.get("start_on") or p.get("start_date") or p.get("start")
+             e = p.get("end_on") or p.get("end_date") or p.get("end")
+
+             if not s or not e:
+                 raise ValueError("Period 'custom' requires 'start_on' and 'end_on'.")
+
+             period_params = {
+                 "parquet_start_date": dt.datetime.strptime(str(s), "%Y-%m-%d").date(),
+                 "parquet_end_date": dt.datetime.strptime(str(e), "%Y-%m-%d").date(),
+             }
+         else:
+             try:
+                 s, e = DateUtils.parse_period(period=period)
+                 period_params = {"parquet_start_date": s, "parquet_end_date": e}
+             except Exception as e:
+                 raise ValueError(f"Failed to parse period '{period}': {e}") from e
+
+         final_kwargs.update(period_params)
+         self.generate_parquet(**final_kwargs)
+
+     def ensure_directory_exists(self, path: str) -> None:
+         """Robust directory creation handling various fs backends."""
+         with self._lock:
+             if not self.fs.exists(path):
+                 try:
+                     self.fs.makedirs(path, exist_ok=True)
+                 except TypeError:
+                     # Fallback for backends not supporting exist_ok
+                     try:
+                         self.fs.makedirs(path)
+                     except FileExistsError:
+                         pass
+
+     def _cleanup(self) -> None:
+         """
+         Cleanup resources.
+         Calls _invalidate_cached to ensure manifests are saved before closing.
+         """
+         try:
+             self._invalidate_cached("mmanifest")
+
+             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                 self.data_wrapper.close()
+         except Exception as e:
+             self.logger.warning(
+                 f"Error during ParquetArtifact cleanup: {e}", extra=self.logger_extra
+             )
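
The public entry point is `update_parquet`, which resolves a named period ('itd', 'ytd', 'custom', or anything `DateUtils.parse_period` accepts, such as the default 'today') into a date window and delegates to `generate_parquet`, while `load` reads the already-written Parquet data directly through the provider. A hedged usage sketch, assuming a hypothetical `OrdersProvider` class and a storage path; filesystem and logger wiring live in ManagedResource, which is outside this section:

from sibi_flux import ParquetArtifact

# `OrdersProvider` is a hypothetical provider with a context-manager interface and a
# .load() method, as expected by ParquetArtifact.load(); it is not part of this diff.
artifact = ParquetArtifact(
    parquet_storage_path="s3://example-bucket/orders/",  # required by ArtifactConfig
    parquet_filename="orders.parquet",                   # required by ArtifactConfig
    provider_class=OrdersProvider,
    partition_on=["partition_date"],
    show_progress=True,
)

# Regenerate the current year, then an explicit custom window.
artifact.update_parquet(period="ytd")
artifact.update_parquet(period="custom", start_on="2024-01-01", end_on="2024-03-31")

# Direct read-through of the written dataset, bypassing planning and manifests.
df = artifact.load()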
sibi_flux/artifacts/parquet_engine/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .executor import DataWrapper as Executor
+ from .planner import UpdatePlanner
+ from .manifest import MissingManifestManager
+
+ __all__ = ["Executor", "UpdatePlanner", "MissingManifestManager"]