odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/state/__init__.py
ADDED
@@ -0,0 +1,1203 @@
import json
import logging
import os
import random
import time
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


def _retry_delta_operation(func, max_retries: int = 5, base_delay: float = 1.0):
    """Retry a Delta operation with exponential backoff on concurrency conflicts.

    Only logs debug during retries. Raises after all retries fail.

    Args:
        func: Callable to execute.
        max_retries: Maximum retry attempts (default 5 for high concurrency).
        base_delay: Base delay in seconds (doubles each retry).
    """
    for attempt in range(max_retries + 1):
        try:
            return func()
        except Exception as e:
            error_str = str(e)
            is_concurrent = any(
                msg in error_str
                for msg in [
                    "ConcurrentAppendException",
                    "ConcurrentDeleteReadException",
                    "ConcurrentDeleteDeleteException",
                    "DELTA_CONCURRENT",
                    "concurrent",
                    "conflict",
                ]
            )
            if not is_concurrent or attempt >= max_retries:
                raise
            # Exponential backoff with jitter (1s, 2s, 4s, 8s, 16s = ~31s total)
            delay = base_delay * (2**attempt) + random.uniform(0, 1.0)
            logger.debug(
                f"Delta concurrent write (attempt {attempt + 1}/{max_retries + 1}), "
                f"retrying in {delay:.2f}s..."
            )
            time.sleep(delay)


# Suppress noisy delta-rs transaction conflict warnings (handled by retry)
# Must be set before deltalake is imported
if "RUST_LOG" not in os.environ:
    os.environ["RUST_LOG"] = "deltalake_core::kernel::transaction=error"

# Try to import deltalake, but don't fail yet (it might be a Spark run)
try:
    import pandas as pd
    import pyarrow as pa
    from deltalake import DeltaTable, write_deltalake
except ImportError:
    DeltaTable = None
    write_deltalake = None
    pd = None
    pa = None


class StateBackend(ABC):
    @abstractmethod
    def load_state(self) -> Dict[str, Any]:
        """Return state in the current in-memory format, e.g. {'pipelines': {...}}."""
        ...

    @abstractmethod
    def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
        """Persist the given pipeline_data into backend."""
        ...

    @abstractmethod
    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        """Get status and metadata of a node from last run."""
        ...

    @abstractmethod
    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        """Get success status of a node from last run."""
        ...

    @abstractmethod
    def get_hwm(self, key: str) -> Any:
        """Get High-Water Mark value for a key."""
        ...

    @abstractmethod
    def set_hwm(self, key: str, value: Any) -> None:
        """Set High-Water Mark value for a key."""
        ...

    def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
        """Set multiple High-Water Mark values in a single operation.

        Default implementation calls set_hwm() for each update.
        Subclasses should override for efficient batch writes.

        Args:
            updates: List of dicts with keys: key, value
        """
        for update in updates:
            self.set_hwm(update["key"], update["value"])


class LocalJSONStateBackend(StateBackend):
    """
    Local JSON-based State Backend.
    Used for local development or when System Catalog is not configured.
    """

    def __init__(self, state_path: str):
        self.state_path = state_path
        self.state = self._load_from_disk()

    def _load_from_disk(self) -> Dict[str, Any]:
        if os.path.exists(self.state_path):
            try:
                with open(self.state_path, "r") as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Failed to load state from {self.state_path}: {e}")
        return {"pipelines": {}, "hwm": {}}

    def _save_to_disk(self) -> None:
        os.makedirs(os.path.dirname(self.state_path), exist_ok=True)
        with open(self.state_path, "w") as f:
            json.dump(self.state, f, indent=2, default=str)

    def load_state(self) -> Dict[str, Any]:
        return self.state

    def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
        if "pipelines" not in self.state:
            self.state["pipelines"] = {}
        self.state["pipelines"][pipeline_name] = pipeline_data
        self._save_to_disk()

    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        pipe = self.state.get("pipelines", {}).get(pipeline_name, {})
        nodes = pipe.get("nodes", {})
        return nodes.get(node_name)

    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        info = self.get_last_run_info(pipeline_name, node_name)
        if info:
            return info.get("success")
        return None

    def get_hwm(self, key: str) -> Any:
        return self.state.get("hwm", {}).get(key)

    def set_hwm(self, key: str, value: Any) -> None:
        if "hwm" not in self.state:
            self.state["hwm"] = {}
        self.state["hwm"][key] = value
        self._save_to_disk()


class CatalogStateBackend(StateBackend):
    """
    Unified State Backend using Delta Tables (System Catalog).
    Supports both Spark and Local (via deltalake) execution.
    """

    def __init__(
        self,
        meta_runs_path: str,
        meta_state_path: str,
        spark_session: Any = None,
        storage_options: Optional[Dict[str, str]] = None,
        environment: Optional[str] = None,
    ):
        self.meta_runs_path = meta_runs_path
        self.meta_state_path = meta_state_path
        self.spark = spark_session
        self.storage_options = storage_options or {}
        self.environment = environment

    def load_state(self) -> Dict[str, Any]:
        """
        Load state. For Catalog backend, we generally return empty
        and rely on direct queries for specific info.
        """
        return {"pipelines": {}}

    def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
        # CatalogManager already logs runs (meta_runs) during execution.
        # We do not need to duplicate this here, avoiding schema conflicts.
        pass

    def _save_runs_spark(self, rows):
        pass

    def _save_runs_local(self, rows):
        pass

    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        if self.spark:
            return self._get_last_run_spark(pipeline_name, node_name)
        return self._get_last_run_local(pipeline_name, node_name)

    def _get_last_run_spark(self, pipeline_name, node_name):
        from pyspark.sql import functions as F

        try:
            df = self.spark.read.format("delta").load(self.meta_runs_path)
            row = (
                df.filter(
                    (F.col("pipeline_name") == pipeline_name) & (F.col("node_name") == node_name)
                )
                .select("status", "metadata")
                .orderBy(F.col("timestamp").desc())
                .first()
            )
            if row:
                meta = {}
                if row.metadata:
                    try:
                        meta = json.loads(row.metadata)
                    except Exception as e:
                        logger.debug(f"Failed to parse metadata JSON: {e}")
                return {"success": (row.status == "SUCCESS"), "metadata": meta}
        except Exception as e:
            logger.warning(
                f"Failed to get last run info from {self.meta_runs_path} "
                f"for {pipeline_name}/{node_name}: {e}"
            )
        return None

    def _get_last_run_local(self, pipeline_name, node_name):
        if not DeltaTable:
            return None

        try:
            dt = DeltaTable(self.meta_runs_path, storage_options=self.storage_options)
            ds = dt.to_pyarrow_dataset()
            import pyarrow.compute as pc

            filter_expr = (pc.field("pipeline_name") == pipeline_name) & (
                pc.field("node_name") == node_name
            )
            # Scan with filter
            table = ds.to_table(filter=filter_expr)

            if table.num_rows == 0:
                return None

            # Sort by timestamp desc to get latest
            # PyArrow table sort? Convert to pandas for easier sorting if small history
            # Or use duckdb

            df = table.to_pandas()
            if "timestamp" in df.columns:
                df = df.sort_values("timestamp", ascending=False)

            row = df.iloc[0]

            meta = {}
            if row.get("metadata"):
                try:
                    meta = json.loads(row["metadata"])
                except Exception as e:
                    logger.debug(f"Failed to parse metadata JSON: {e}")

            status = row.get("status")
            return {"success": (status == "SUCCESS"), "metadata": meta}

        except Exception as e:
            logger.warning(
                f"Failed to get last run info from {self.meta_runs_path} "
                f"for {pipeline_name}/{node_name}: {e}"
            )
            return None

    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        info = self.get_last_run_info(pipeline_name, node_name)
        if info:
            return info.get("success")
        return None

    def get_hwm(self, key: str) -> Any:
        if self.spark:
            return self._get_hwm_spark(key)
        return self._get_hwm_local(key)

    def _get_hwm_spark(self, key):
        from pyspark.sql import functions as F

        try:
            df = self.spark.read.format("delta").load(self.meta_state_path)
            row = df.filter(F.col("key") == key).select("value").first()
            if row and row.value:
                try:
                    return json.loads(row.value)
                except Exception as e:
                    logger.debug(f"Failed to parse HWM value as JSON for key '{key}': {e}")
                    return row.value
        except Exception as e:
            error_str = str(e)
            if "PATH_NOT_FOUND" in error_str or "does not exist" in error_str.lower():
                logger.debug(
                    f"HWM state table does not exist yet at {self.meta_state_path}. "
                    "It will be created on first write."
                )
            else:
                logger.warning(
                    f"Failed to get HWM for key '{key}' from {self.meta_state_path}: {e}"
                )
        return None

    def _get_hwm_local(self, key):
        if not DeltaTable:
            return None
        try:
            dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
            ds = dt.to_pyarrow_dataset()
            import pyarrow.compute as pc

            filter_expr = pc.field("key") == key
            table = ds.to_table(filter=filter_expr)

            if table.num_rows == 0:
                return None

            val_str = table.column("value")[0].as_py()
            if val_str:
                try:
                    return json.loads(val_str)
                except Exception as e:
                    logger.debug(f"Failed to parse HWM value as JSON for key '{key}': {e}")
                    return val_str
        except Exception as e:
            logger.warning(f"Failed to get HWM for key '{key}' from {self.meta_state_path}: {e}")
        return None

    def set_hwm(self, key: str, value: Any) -> None:
        val_str = json.dumps(value, default=str)
        row = {
            "key": key,
            "value": val_str,
            "environment": self.environment,
            "updated_at": datetime.now(timezone.utc),
        }

        def _do_set():
            if self.spark:
                self._set_hwm_spark(row)
            else:
                self._set_hwm_local(row)

        _retry_delta_operation(_do_set)

    def _set_hwm_spark(self, row):
        from pyspark.sql.types import StringType, StructField, StructType, TimestampType

        schema = StructType(
            [
                StructField("key", StringType(), False),
                StructField("value", StringType(), True),
                StructField("environment", StringType(), True),
                StructField("updated_at", TimestampType(), True),
            ]
        )

        updates_df = self.spark.createDataFrame([row], schema)

        if not self._spark_table_exists(self.meta_state_path):
            updates_df.write.format("delta").mode("overwrite").save(self.meta_state_path)
            return

        view_name = f"_odibi_hwm_updates_{abs(hash(row['key']))}"
        updates_df.createOrReplaceTempView(view_name)

        merge_sql = f"""
            MERGE INTO delta.`{self.meta_state_path}` AS t
            USING {view_name} AS s
            ON t.key = s.key
            WHEN MATCHED THEN UPDATE SET
                t.value = s.value,
                t.environment = s.environment,
                t.updated_at = s.updated_at
            WHEN NOT MATCHED THEN INSERT *
        """
        self.spark.sql(merge_sql)
        self.spark.catalog.dropTempView(view_name)

    def _set_hwm_local(self, row):
        if not DeltaTable:
            raise ImportError("deltalake library is required for local state backend.")

        df = pd.DataFrame([row])
        df["updated_at"] = pd.to_datetime(df["updated_at"])

        try:
            dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
            (
                dt.merge(
                    source=df,
                    predicate="target.key = source.key",
                    source_alias="source",
                    target_alias="target",
                )
                .when_matched_update_all()
                .when_not_matched_insert_all()
                .execute()
            )
        except (ValueError, Exception):
            write_deltalake(
                self.meta_state_path,
                df,
                mode="append",
                storage_options=self.storage_options,
                schema_mode="merge",
            )

    def _spark_table_exists(self, path: str) -> bool:
        try:
            return self.spark.read.format("delta").load(path).count() >= 0
        except Exception as e:
            logger.debug(f"Table does not exist at {path}: {e}")
            return False

    def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
        """Set multiple High-Water Mark values in a single MERGE operation.

        This is much more efficient than calling set_hwm() for each update
        individually, especially when running parallel pipelines with many nodes.

        Args:
            updates: List of dicts with keys: key, value
        """
        if not updates:
            return

        timestamp = datetime.now(timezone.utc)
        rows = [
            {
                "key": u["key"],
                "value": json.dumps(u["value"], default=str),
                "environment": self.environment,
                "updated_at": timestamp,
            }
            for u in updates
        ]

        def _do_batch_set():
            if self.spark:
                self._set_hwm_batch_spark(rows)
            else:
                self._set_hwm_batch_local(rows)

        _retry_delta_operation(_do_batch_set)

    def _set_hwm_batch_spark(self, rows: List[Dict[str, Any]]) -> None:
        from pyspark.sql.types import StringType, StructField, StructType, TimestampType

        schema = StructType(
            [
                StructField("key", StringType(), False),
                StructField("value", StringType(), True),
                StructField("environment", StringType(), True),
                StructField("updated_at", TimestampType(), True),
            ]
        )

        updates_df = self.spark.createDataFrame(rows, schema)

        if not self._spark_table_exists(self.meta_state_path):
            updates_df.write.format("delta").mode("overwrite").save(self.meta_state_path)
            return

        view_name = "_odibi_hwm_batch_updates"
        updates_df.createOrReplaceTempView(view_name)

        merge_sql = f"""
            MERGE INTO delta.`{self.meta_state_path}` AS t
            USING {view_name} AS s
            ON t.key = s.key
            WHEN MATCHED THEN UPDATE SET
                t.value = s.value,
                t.environment = s.environment,
                t.updated_at = s.updated_at
            WHEN NOT MATCHED THEN INSERT *
        """
        self.spark.sql(merge_sql)
        self.spark.catalog.dropTempView(view_name)
        logger.debug(f"Batch set {len(rows)} HWM value(s) via Spark")

    def _set_hwm_batch_local(self, rows: List[Dict[str, Any]]) -> None:
        if not DeltaTable:
            raise ImportError("deltalake library is required for local state backend.")

        df = pd.DataFrame(rows)
        df["updated_at"] = pd.to_datetime(df["updated_at"])

        try:
            dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
            (
                dt.merge(
                    source=df,
                    predicate="target.key = source.key",
                    source_alias="source",
                    target_alias="target",
                )
                .when_matched_update_all()
                .when_not_matched_insert_all()
                .execute()
            )
        except Exception:
            # Table doesn't exist or merge failed - create/append
            write_deltalake(
                self.meta_state_path,
                df,
                mode="overwrite",
                storage_options=self.storage_options,
            )
        logger.debug(f"Batch set {len(rows)} HWM value(s) locally")


class SqlServerSystemBackend(StateBackend):
    """
    SQL Server State Backend for centralized system tables.

    Stores meta_runs and meta_state in SQL Server tables for cross-environment
    visibility and querying. Useful when you want a single source of truth
    for pipeline observability across dev/qat/prod environments.

    Example config:
    ```yaml
    system:
      connection: sql_server
      schema_name: odibi_system
      environment: prod
    ```
    """

    # SQL Server table DDL
    META_RUNS_DDL = """
    IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'meta_runs' AND schema_id = SCHEMA_ID(:schema))
    BEGIN
        CREATE TABLE [{schema}].[meta_runs] (
            run_id NVARCHAR(100),
            pipeline_name NVARCHAR(255),
            node_name NVARCHAR(255),
            status NVARCHAR(50),
            rows_processed BIGINT,
            duration_ms BIGINT,
            metrics_json NVARCHAR(MAX),
            environment NVARCHAR(50),
            timestamp DATETIME2,
            date DATE
        )
    END
    """

    META_STATE_DDL = """
    IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'meta_state' AND schema_id = SCHEMA_ID(:schema))
    BEGIN
        CREATE TABLE [{schema}].[meta_state] (
            [key] NVARCHAR(500) PRIMARY KEY,
            [value] NVARCHAR(MAX),
            environment NVARCHAR(50),
            updated_at DATETIME2
        )
    END
    """

    def __init__(
        self,
        connection: Any,
        schema_name: str = "odibi_system",
        environment: Optional[str] = None,
    ):
        """
        Initialize SQL Server System Backend.

        Args:
            connection: AzureSQL connection object
            schema_name: Schema for system tables (default: odibi_system)
            environment: Environment tag for records (e.g., 'dev', 'prod')
        """
        self.connection = connection
        self.schema_name = schema_name
        self.environment = environment
        self._tables_created = False

    def _ensure_tables(self) -> None:
        """Create system tables if they don't exist."""
        if self._tables_created:
            return

        try:
            # Create schema if not exists
            schema_ddl = f"""
            IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = '{self.schema_name}')
            BEGIN
                EXEC('CREATE SCHEMA [{self.schema_name}]')
            END
            """
            self.connection.execute(schema_ddl)

            # Create tables
            runs_ddl = self.META_RUNS_DDL.replace("{schema}", self.schema_name).replace(
                ":schema", f"'{self.schema_name}'"
            )
            self.connection.execute(runs_ddl)

            state_ddl = self.META_STATE_DDL.replace("{schema}", self.schema_name).replace(
                ":schema", f"'{self.schema_name}'"
            )
            self.connection.execute(state_ddl)

            self._tables_created = True
            logger.debug(f"SQL Server system tables ensured in schema {self.schema_name}")
        except Exception as e:
            logger.warning(f"Failed to ensure SQL Server system tables: {e}")

    def load_state(self) -> Dict[str, Any]:
        """Load state - returns empty dict for SQL Server backend."""
        return {"pipelines": {}}

    def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
        """Pipeline runs are logged via log_run, not this method."""
        pass

    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        """Get last run info from SQL Server."""
        self._ensure_tables()
        try:
            sql = f"""
                SELECT TOP 1 status, metrics_json
                FROM [{self.schema_name}].[meta_runs]
                WHERE pipeline_name = :pipeline_name AND node_name = :node_name
                ORDER BY timestamp DESC
            """
            result = self.connection.execute(
                sql, {"pipeline_name": pipeline_name, "node_name": node_name}
            )
            if result:
                row = result[0]
                meta = {}
                if row[1]:
                    try:
                        meta = json.loads(row[1])
                    except Exception:
                        pass
                return {"success": row[0] == "SUCCESS", "metadata": meta}
        except Exception as e:
            logger.warning(f"Failed to get last run info: {e}")
        return None

    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        """Get last run status."""
        info = self.get_last_run_info(pipeline_name, node_name)
        return info.get("success") if info else None

    def get_hwm(self, key: str) -> Any:
        """Get HWM value from SQL Server."""
        self._ensure_tables()
        try:
            sql = f"""
                SELECT [value] FROM [{self.schema_name}].[meta_state]
                WHERE [key] = :key
            """
            result = self.connection.execute(sql, {"key": key})
            if result and result[0][0]:
                try:
                    return json.loads(result[0][0])
                except Exception:
                    return result[0][0]
        except Exception as e:
            logger.warning(f"Failed to get HWM: {e}")
        return None

    def set_hwm(self, key: str, value: Any) -> None:
        """Set HWM value in SQL Server using MERGE."""
        self._ensure_tables()
        val_str = json.dumps(value, default=str)
        try:
            sql = f"""
                MERGE [{self.schema_name}].[meta_state] AS target
                USING (SELECT :key AS [key]) AS source
                ON target.[key] = source.[key]
                WHEN MATCHED THEN
                    UPDATE SET [value] = :value, environment = :env, updated_at = GETUTCDATE()
                WHEN NOT MATCHED THEN
                    INSERT ([key], [value], environment, updated_at)
                    VALUES (:key, :value, :env, GETUTCDATE());
            """
            self.connection.execute(sql, {"key": key, "value": val_str, "env": self.environment})
        except Exception as e:
            logger.warning(f"Failed to set HWM: {e}")

    def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
        """Set multiple HWM values."""
        for update in updates:
            self.set_hwm(update["key"], update["value"])

    def log_run(
        self,
        run_id: str,
        pipeline_name: str,
        node_name: str,
        status: str,
        rows_processed: int = 0,
        duration_ms: int = 0,
        metrics_json: str = "{}",
    ) -> None:
        """Log a run to SQL Server meta_runs table."""
        self._ensure_tables()
        try:
            sql = f"""
                INSERT INTO [{self.schema_name}].[meta_runs]
                (run_id, pipeline_name, node_name, status, rows_processed, duration_ms,
                 metrics_json, environment, timestamp, date)
                VALUES (:run_id, :pipeline, :node, :status, :rows, :duration,
                        :metrics, :env, GETUTCDATE(), CAST(GETUTCDATE() AS DATE))
            """
            self.connection.execute(
                sql,
                {
                    "run_id": run_id,
                    "pipeline": pipeline_name,
                    "node": node_name,
                    "status": status,
                    "rows": rows_processed,
                    "duration": duration_ms,
                    "metrics": metrics_json,
                    "env": self.environment,
                },
            )
        except Exception as e:
            logger.warning(f"Failed to log run to SQL Server: {e}")

    def log_runs_batch(self, records: List[Dict[str, Any]]) -> None:
        """Log multiple runs to SQL Server."""
        for record in records:
            self.log_run(
                run_id=record["run_id"],
                pipeline_name=record["pipeline_name"],
                node_name=record["node_name"],
                status=record["status"],
                rows_processed=record.get("rows_processed", 0),
                duration_ms=record.get("duration_ms", 0),
                metrics_json=record.get("metrics_json", "{}"),
            )


class StateManager:
    """Manages execution state for checkpointing."""

    def __init__(self, project_root: str = ".", backend: Optional[StateBackend] = None):
        self.backend = backend
        # Note: If backend is None, it should be injected.
        # But we won't fallback to LocalFileStateBackend here anymore as it's removed.
        if not self.backend:
            raise ValueError("StateBackend must be provided to StateManager")

        self.state: Dict[str, Any] = self.backend.load_state()

    def save_pipeline_run(self, pipeline_name: str, results: Any):
        """Save pipeline run results."""
        if hasattr(results, "to_dict"):
            data = results.to_dict()
        else:
            data = results

        node_status = {}
        if hasattr(results, "node_results"):
            for name, res in results.node_results.items():
                node_status[name] = {
                    "success": res.success,
                    "timestamp": res.metadata.get("timestamp"),
                    "metadata": res.metadata,
                }

        pipeline_data = {
            "last_run": data.get("end_time"),
            "nodes": node_status,
        }

        self.backend.save_pipeline_run(pipeline_name, pipeline_data)
        self.state = self.backend.load_state()

    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        """Get status and metadata of a node from last run."""
        return self.backend.get_last_run_info(pipeline_name, node_name)

    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        """Get success status of a node from last run."""
        return self.backend.get_last_run_status(pipeline_name, node_name)

    def get_hwm(self, key: str) -> Any:
        """Get High-Water Mark value for a key."""
        return self.backend.get_hwm(key)

    def set_hwm(self, key: str, value: Any) -> None:
        """Set High-Water Mark value for a key."""
        self.backend.set_hwm(key, value)

    def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
        """Set multiple High-Water Mark values in a single operation.

        Args:
            updates: List of dicts with keys: key, value
        """
        self.backend.set_hwm_batch(updates)


def create_state_backend(
    config: Any, project_root: str = ".", spark_session: Any = None
) -> StateBackend:
    """
    Factory to create state backend from ProjectConfig.

    Args:
        config: ProjectConfig object
        project_root: Root directory for local files
        spark_session: Optional SparkSession for Delta backend

    Returns:
        Configured StateBackend
    """
    # Fallback to Local JSON if no System Config
    if not config.system:
        import logging

        logger = logging.getLogger(__name__)
        logger.warning(
            "No system catalog configured. Using local JSON state backend (local-only mode)."
        )
        state_path = os.path.join(project_root, ".odibi", "state.json")
        return LocalJSONStateBackend(state_path)

    system_conn_name = config.system.connection
    conn_config = config.connections.get(system_conn_name)

    if not conn_config:
        raise ValueError(f"System connection '{system_conn_name}' not found.")

    # Helper to get attribute from dict or object
    def _get(obj, key, default=None):
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    base_uri = ""
    storage_options = {}

    conn_type = _get(conn_config, "type")
    environment = getattr(config.system, "environment", None)

    # SQL Server backend - centralized system tables
    if conn_type in ("sql_server", "azure_sql"):
        from odibi.connections.factory import create_connection

        # Create the SQL connection
        connection = create_connection(system_conn_name, conn_config)
        schema_name = getattr(config.system, "schema_name", None) or "odibi_system"

        logger.info(f"Using SQL Server system backend: {system_conn_name}, schema: {schema_name}")
        return SqlServerSystemBackend(
            connection=connection,
            schema_name=schema_name,
            environment=environment,
        )

    # Determine Base URI based on connection type
    if conn_type == "local":
        base_path = _get(conn_config, "base_path")
        if not os.path.isabs(base_path):
            base_path = os.path.join(project_root, base_path)

        # Ensure directory exists
        try:
            os.makedirs(base_path, exist_ok=True)
        except Exception:
            pass

        base_uri = os.path.join(base_path, config.system.path)

    elif conn_type == "azure_blob":
        # Construct abfss://
        account = _get(conn_config, "account_name")
        container = _get(conn_config, "container")
        base_uri = f"abfss://{container}@{account}.dfs.core.windows.net/{config.system.path}"

        # Set up storage options
        # Depends on auth mode
        auth = _get(conn_config, "auth", {})
        auth_mode = _get(auth, "mode")
        if auth_mode == "account_key":
            storage_options = {
                "account_name": account,
                "account_key": _get(auth, "account_key"),
            }
        elif auth_mode == "sas":
            storage_options = {
                "account_name": account,
                "sas_token": _get(auth, "sas_token"),
            }
        # For MSI/KeyVault, it's more complex for deltalake-python without extra config
        # But Spark handles it if configured in environment

    else:
        # Fallback for other types or throw error if not supported for system catalog
        # For simplicity, try to treat as local path if it looks like one?
        # Or raise error
        # Assuming local or azure blob for now as they are main supported backends
        # If delta connection?
        if conn_type == "delta":
            # If the connection itself is delta, it might point to a catalog/schema
            # But system catalog needs specific path structure.
            # For now assume system connection is a storage connection.
            pass

    if not base_uri:
        # Default fallback if something went wrong or unsupported
        base_uri = os.path.join(project_root, ".odibi/system")

    meta_state_path = f"{base_uri}/meta_state"
    meta_runs_path = f"{base_uri}/meta_runs"

    return CatalogStateBackend(
        meta_runs_path=meta_runs_path,
        meta_state_path=meta_state_path,
        spark_session=spark_session,
        storage_options=storage_options,
        environment=environment,
    )


def create_sync_source_backend(
    sync_from_config: Any,
    connections: Dict[str, Any],
    project_root: str = ".",
) -> StateBackend:
    """
    Create a source StateBackend for sync operations.

    Args:
        sync_from_config: SyncFromConfig with connection/path/schema_name
        connections: Dictionary of connection configs
        project_root: Root directory for local paths

    Returns:
        Configured StateBackend for reading source data
    """

    def _get(obj, key, default=None):
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    conn_name = _get(sync_from_config, "connection")
    conn_config = connections.get(conn_name)

    if not conn_config:
        raise ValueError(f"Sync source connection '{conn_name}' not found in connections.")

    conn_type = _get(conn_config, "type")

    # SQL Server source
    if conn_type in ("sql_server", "azure_sql"):
        from odibi.connections.factory import create_connection

        connection = create_connection(conn_name, conn_config)
        schema_name = _get(sync_from_config, "schema_name") or "odibi_system"
        return SqlServerSystemBackend(
            connection=connection,
            schema_name=schema_name,
            environment=None,
        )

    # File-based source (local, azure_blob)
    base_uri = ""
    storage_options = {}
    path = _get(sync_from_config, "path") or "_odibi_system"

    if conn_type == "local":
        base_path = _get(conn_config, "base_path")
        if not os.path.isabs(base_path):
            base_path = os.path.join(project_root, base_path)
        base_uri = os.path.join(base_path, path)

    elif conn_type == "azure_blob":
        account = _get(conn_config, "account_name")
        container = _get(conn_config, "container")
        base_uri = f"abfss://{container}@{account}.dfs.core.windows.net/{path}"

        auth = _get(conn_config, "auth", {})
        auth_mode = _get(auth, "mode")
        if auth_mode == "account_key":
            storage_options = {
                "account_name": account,
                "account_key": _get(auth, "account_key"),
            }
        elif auth_mode == "sas":
            storage_options = {
                "account_name": account,
                "sas_token": _get(auth, "sas_token"),
            }

    if not base_uri:
        base_uri = os.path.join(project_root, path)

    meta_state_path = f"{base_uri}/meta_state"
    meta_runs_path = f"{base_uri}/meta_runs"

    return CatalogStateBackend(
        meta_runs_path=meta_runs_path,
        meta_state_path=meta_state_path,
        spark_session=None,
        storage_options=storage_options,
        environment=None,
    )


def sync_system_data(
    source_backend: StateBackend,
    target_backend: StateBackend,
    tables: Optional[List[str]] = None,
) -> Dict[str, int]:
    """
    Sync system data from source backend to target backend.

    Reads meta_runs and meta_state from source and writes to target.

    Args:
        source_backend: Source StateBackend to read from
        target_backend: Target StateBackend to write to
        tables: Optional list of tables to sync ('runs', 'state'). Default: both.

    Returns:
        Dict with counts: {'runs': N, 'state': M}
    """
    if tables is None:
        tables = ["runs", "state"]

    result = {"runs": 0, "state": 0}

    # Sync runs (meta_runs)
    if "runs" in tables:
        runs_count = _sync_runs(source_backend, target_backend)
        result["runs"] = runs_count
        logger.info(f"Synced {runs_count} run records")

    # Sync state (meta_state / HWM)
    if "state" in tables:
        state_count = _sync_state(source_backend, target_backend)
        result["state"] = state_count
        logger.info(f"Synced {state_count} state records")

    return result


def _sync_runs(source: StateBackend, target: StateBackend) -> int:
    """Sync runs from source to target."""
    records = []

    # Read runs from source
    if isinstance(source, CatalogStateBackend):
        if not DeltaTable or not pd:
            logger.warning("Delta/Pandas not available for reading source runs")
            return 0

        try:
            dt = DeltaTable(source.meta_runs_path, storage_options=source.storage_options)
            df = dt.to_pandas()
            if df.empty:
                return 0

            for _, row in df.iterrows():
                records.append(
                    {
                        "run_id": row.get("run_id"),
                        "pipeline_name": row.get("pipeline_name"),
                        "node_name": row.get("node_name"),
                        "status": row.get("status"),
                        "rows_processed": int(row.get("rows_processed", 0) or 0),
                        "duration_ms": int(row.get("duration_ms", 0) or 0),
                        "metrics_json": row.get("metrics_json") or row.get("metadata") or "{}",
                    }
                )
        except Exception as e:
            logger.warning(f"Failed to read runs from source: {e}")
            return 0

    elif isinstance(source, SqlServerSystemBackend):
        source._ensure_tables()
        try:
            sql = f"""SELECT run_id, pipeline_name, node_name, status, rows_processed,
                      duration_ms, metrics_json FROM [{source.schema_name}].[meta_runs]"""
            rows = source.connection.execute(sql)
            if rows:
                for row in rows:
                    records.append(
                        {
                            "run_id": row[0],
                            "pipeline_name": row[1],
                            "node_name": row[2],
                            "status": row[3],
                            "rows_processed": int(row[4] or 0),
                            "duration_ms": int(row[5] or 0),
                            "metrics_json": row[6] or "{}",
                        }
                    )
        except Exception as e:
            logger.warning(f"Failed to read runs from SQL source: {e}")
            return 0

    if not records:
        return 0

    # Write runs to target
    if isinstance(target, SqlServerSystemBackend):
        target.log_runs_batch(records)
    elif isinstance(target, CatalogStateBackend):
        _write_runs_to_catalog(target, records)

    return len(records)


def _write_runs_to_catalog(target: CatalogStateBackend, records: List[Dict]) -> None:
    """Write run records to CatalogStateBackend."""
    if not pd or not write_deltalake:
        logger.warning("Delta/Pandas not available for writing runs")
        return

    df = pd.DataFrame(records)
    df["timestamp"] = datetime.now(timezone.utc)
    df["date"] = datetime.now(timezone.utc).date()
    df["environment"] = target.environment

    def _write():
        write_deltalake(
            target.meta_runs_path,
            df,
            mode="append",
            storage_options=target.storage_options,
        )

    _retry_delta_operation(_write)


def _sync_state(source: StateBackend, target: StateBackend) -> int:
    """Sync HWM state from source to target."""
    hwm_records = []

    # Read state from source
    if isinstance(source, CatalogStateBackend):
        if not DeltaTable or not pd:
            logger.warning("Delta/Pandas not available for reading source state")
            return 0

        try:
            dt = DeltaTable(source.meta_state_path, storage_options=source.storage_options)
            df = dt.to_pandas()
            if df.empty:
                return 0

            for _, row in df.iterrows():
                key = row.get("key")
                value = row.get("value")
                if key:
                    try:
                        hwm_records.append({"key": key, "value": json.loads(value)})
                    except (json.JSONDecodeError, TypeError):
                        hwm_records.append({"key": key, "value": value})
        except Exception as e:
            logger.warning(f"Failed to read state from source: {e}")
            return 0

    elif isinstance(source, SqlServerSystemBackend):
        source._ensure_tables()
        try:
            sql = f"SELECT [key], [value] FROM [{source.schema_name}].[meta_state]"
            rows = source.connection.execute(sql)
            if rows:
                for row in rows:
                    key, value = row[0], row[1]
                    if key:
                        try:
                            hwm_records.append({"key": key, "value": json.loads(value)})
                        except (json.JSONDecodeError, TypeError):
                            hwm_records.append({"key": key, "value": value})
        except Exception as e:
            logger.warning(f"Failed to read state from SQL source: {e}")
            return 0

    if not hwm_records:
        return 0

    # Write state to target
    target.set_hwm_batch(hwm_records)

    return len(hwm_records)
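
Reviewer note: the sketch below is not part of the wheel contents shown above. It is a minimal illustration of the high-water-mark API this module adds, exercised through the local JSON backend and StateManager exactly as they appear in odibi/state/__init__.py; the key names and temporary path are made up for the example.

```python
# Illustrative sketch only -- not part of the odibi 2.5.0 wheel.
# Exercises the state API from odibi/state/__init__.py; keys/paths are hypothetical.
import os
import tempfile

from odibi.state import LocalJSONStateBackend, StateManager

# Local-only mode: state is persisted to a JSON file with "pipelines" and "hwm" sections.
state_path = os.path.join(tempfile.mkdtemp(), ".odibi", "state.json")
manager = StateManager(backend=LocalJSONStateBackend(state_path))

# High-water marks are JSON-serializable values keyed by string.
manager.set_hwm("orders.last_modified", "2024-01-01T00:00:00Z")
print(manager.get_hwm("orders.last_modified"))  # -> "2024-01-01T00:00:00Z"

# The base-class set_hwm_batch() loops over set_hwm();
# CatalogStateBackend overrides it with a single Delta MERGE instead.
manager.set_hwm_batch(
    [
        {"key": "orders.last_modified", "value": "2024-01-02T00:00:00Z"},
        {"key": "customers.max_id", "value": 41250},
    ]
)
```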
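
Similarly illustrative (not in the wheel): a sketch of how the module-level retry helper treats conflict-like errors, using a hypothetical stand-in callable in place of a real Delta commit.

```python
# Illustrative sketch only -- not part of the odibi 2.5.0 wheel.
# Demonstrates _retry_delta_operation() from above; flaky_commit is a stand-in.
from odibi.state import _retry_delta_operation

attempts = {"count": 0}

def flaky_commit():
    attempts["count"] += 1
    if attempts["count"] < 3:
        # The message matches the helper's concurrency markers, so it is retried.
        raise RuntimeError("ConcurrentAppendException: files added by a concurrent update")
    return "committed"

# Sleeps ~1s then ~2s (plus jitter) before the third attempt succeeds;
# a non-concurrent error would be re-raised immediately.
print(_retry_delta_operation(flaky_commit, max_retries=5, base_delay=1.0))
```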