odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/catalog.py
ADDED
|
@@ -0,0 +1,3011 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import random
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from pyspark.sql import SparkSession
|
|
11
|
+
from pyspark.sql.types import (
|
|
12
|
+
ArrayType,
|
|
13
|
+
DateType,
|
|
14
|
+
DoubleType,
|
|
15
|
+
LongType,
|
|
16
|
+
StringType,
|
|
17
|
+
StructField,
|
|
18
|
+
StructType,
|
|
19
|
+
TimestampType,
|
|
20
|
+
)
|
|
21
|
+
except ImportError:
|
|
22
|
+
# Fallback for environments without PySpark (e.g., pure Pandas mode)
|
|
23
|
+
SparkSession = Any
|
|
24
|
+
|
|
25
|
+
class DataType:
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
class StringType(DataType):
|
|
29
|
+
pass
|
|
30
|
+
|
|
31
|
+
class LongType(DataType):
|
|
32
|
+
pass
|
|
33
|
+
|
|
34
|
+
class DoubleType(DataType):
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
class DateType(DataType):
|
|
38
|
+
pass
|
|
39
|
+
|
|
40
|
+
class TimestampType(DataType):
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
class ArrayType(DataType):
|
|
44
|
+
def __init__(self, elementType):
|
|
45
|
+
self.elementType = elementType
|
|
46
|
+
|
|
47
|
+
class StructField:
|
|
48
|
+
def __init__(self, name, dtype, nullable=True):
|
|
49
|
+
self.name = name
|
|
50
|
+
self.dataType = dtype
|
|
51
|
+
|
|
52
|
+
class StructType:
|
|
53
|
+
def __init__(self, fields):
|
|
54
|
+
self.fields = fields
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
from odibi.config import SystemConfig
|
|
58
|
+
|
|
59
|
+
logger = logging.getLogger(__name__)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CatalogManager:
|
|
63
|
+
"""
|
|
64
|
+
Manages the Odibi System Catalog (The Brain).
|
|
65
|
+
Handles bootstrapping and interaction with meta-tables.
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
spark: Optional[SparkSession],
|
|
71
|
+
config: SystemConfig,
|
|
72
|
+
base_path: str,
|
|
73
|
+
engine: Optional[Any] = None,
|
|
74
|
+
connection: Optional[Any] = None,
|
|
75
|
+
):
|
|
76
|
+
"""
|
|
77
|
+
Initialize the Catalog Manager.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
spark: Active SparkSession (optional if engine is provided)
|
|
81
|
+
config: SystemConfig object
|
|
82
|
+
base_path: Absolute path to the system catalog directory (resolved from connection).
|
|
83
|
+
Example: "abfss://container@account.dfs.core.windows.net/_odibi_system"
|
|
84
|
+
engine: Execution engine (optional, for Pandas mode)
|
|
85
|
+
connection: Connection object for storage credentials (optional, for Pandas mode)
|
|
86
|
+
"""
|
|
87
|
+
self.spark = spark
|
|
88
|
+
self.config = config
|
|
89
|
+
self.base_path = base_path.rstrip("/")
|
|
90
|
+
self.engine = engine
|
|
91
|
+
self.connection = connection
|
|
92
|
+
|
|
93
|
+
# Table Paths
|
|
94
|
+
self.tables = {
|
|
95
|
+
"meta_tables": f"{self.base_path}/meta_tables",
|
|
96
|
+
"meta_runs": f"{self.base_path}/meta_runs",
|
|
97
|
+
"meta_patterns": f"{self.base_path}/meta_patterns",
|
|
98
|
+
"meta_metrics": f"{self.base_path}/meta_metrics",
|
|
99
|
+
"meta_state": f"{self.base_path}/meta_state",
|
|
100
|
+
"meta_pipelines": f"{self.base_path}/meta_pipelines",
|
|
101
|
+
"meta_nodes": f"{self.base_path}/meta_nodes",
|
|
102
|
+
"meta_schemas": f"{self.base_path}/meta_schemas",
|
|
103
|
+
"meta_lineage": f"{self.base_path}/meta_lineage",
|
|
104
|
+
"meta_outputs": f"{self.base_path}/meta_outputs",
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
# Cache for meta table reads (invalidated on write operations)
|
|
108
|
+
self._pipelines_cache: Optional[Dict[str, Dict[str, Any]]] = None
|
|
109
|
+
self._nodes_cache: Optional[Dict[str, Dict[str, str]]] = None
|
|
110
|
+
self._outputs_cache: Optional[Dict[str, Dict[str, Any]]] = None
|
|
111
|
+
|
|
112
|
+
@property
|
|
113
|
+
def is_spark_mode(self) -> bool:
|
|
114
|
+
"""Check if running in Spark mode."""
|
|
115
|
+
return self.spark is not None
|
|
116
|
+
|
|
117
|
+
@property
|
|
118
|
+
def is_pandas_mode(self) -> bool:
|
|
119
|
+
"""Check if running in Pandas mode."""
|
|
120
|
+
return self.engine is not None and self.engine.name == "pandas"
|
|
121
|
+
|
|
122
|
+
@property
|
|
123
|
+
def is_sql_server_mode(self) -> bool:
|
|
124
|
+
"""Check if running with SQL Server system backend."""
|
|
125
|
+
if self.connection is None:
|
|
126
|
+
return False
|
|
127
|
+
# Check if connection is AzureSQL type
|
|
128
|
+
conn_type = getattr(self.connection, "__class__", None)
|
|
129
|
+
if conn_type is None:
|
|
130
|
+
return False
|
|
131
|
+
return conn_type.__name__ in ("AzureSQL", "SqlServerConnection")
|
|
132
|
+
|
|
133
|
+
def _get_storage_options(self) -> Dict[str, Any]:
|
|
134
|
+
"""Get storage options for pandas/delta-rs operations.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Dict with storage credentials if connection supports it, else empty dict.
|
|
138
|
+
"""
|
|
139
|
+
if self.connection and hasattr(self.connection, "pandas_storage_options"):
|
|
140
|
+
return self.connection.pandas_storage_options()
|
|
141
|
+
return {}
|
|
142
|
+
|
|
143
|
+
@property
|
|
144
|
+
def has_backend(self) -> bool:
|
|
145
|
+
"""Check if any backend (Spark or engine) is available."""
|
|
146
|
+
return self.spark is not None or self.engine is not None
|
|
147
|
+
|
|
148
|
+
def invalidate_cache(self) -> None:
|
|
149
|
+
"""Invalidate all cached meta table data."""
|
|
150
|
+
self._pipelines_cache = None
|
|
151
|
+
self._nodes_cache = None
|
|
152
|
+
self._outputs_cache = None
|
|
153
|
+
|
|
154
|
+
def _retry_with_backoff(self, func, max_retries: int = 5, base_delay: float = 1.0):
|
|
155
|
+
"""Retry a function with exponential backoff and jitter for concurrent writes.
|
|
156
|
+
|
|
157
|
+
Only retries on Delta Lake concurrency exceptions. Other exceptions are
|
|
158
|
+
raised immediately. Warnings are only logged after all retries fail.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
func: Callable to execute.
|
|
162
|
+
max_retries: Maximum retry attempts (default 5 for high concurrency).
|
|
163
|
+
base_delay: Base delay in seconds (doubles each retry).
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
Result of the function.
|
|
167
|
+
|
|
168
|
+
Raises:
|
|
169
|
+
Exception: If all retries fail or non-retryable error occurs.
|
|
170
|
+
"""
|
|
171
|
+
for attempt in range(max_retries + 1):
|
|
172
|
+
try:
|
|
173
|
+
return func()
|
|
174
|
+
except Exception as e:
|
|
175
|
+
error_str = str(e)
|
|
176
|
+
# Check for Delta concurrency exceptions
|
|
177
|
+
is_concurrent_error = any(
|
|
178
|
+
msg in error_str
|
|
179
|
+
for msg in [
|
|
180
|
+
"ConcurrentAppendException",
|
|
181
|
+
"ConcurrentDeleteReadException",
|
|
182
|
+
"ConcurrentDeleteDeleteException",
|
|
183
|
+
"DELTA_CONCURRENT",
|
|
184
|
+
"concurrent",
|
|
185
|
+
"conflict",
|
|
186
|
+
]
|
|
187
|
+
)
|
|
188
|
+
if not is_concurrent_error or attempt >= max_retries:
|
|
189
|
+
raise
|
|
190
|
+
# Exponential backoff with jitter (1s, 2s, 4s, 8s, 16s = ~31s total)
|
|
191
|
+
delay = base_delay * (2**attempt) + random.uniform(0, 1.0)
|
|
192
|
+
logger.debug(
|
|
193
|
+
f"Delta concurrent write (attempt {attempt + 1}/{max_retries + 1}), "
|
|
194
|
+
f"retrying in {delay:.2f}s..."
|
|
195
|
+
)
|
|
196
|
+
time.sleep(delay)
|
|
197
|
+
|
|
198
|
+
def _get_all_pipelines_cached(self) -> Dict[str, Dict[str, Any]]:
|
|
199
|
+
"""Get all pipelines with caching."""
|
|
200
|
+
if self._pipelines_cache is not None:
|
|
201
|
+
return self._pipelines_cache
|
|
202
|
+
|
|
203
|
+
self._pipelines_cache = {}
|
|
204
|
+
if not self.spark and not self.engine:
|
|
205
|
+
return self._pipelines_cache
|
|
206
|
+
|
|
207
|
+
try:
|
|
208
|
+
if self.spark:
|
|
209
|
+
df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
|
|
210
|
+
rows = df.collect()
|
|
211
|
+
for row in rows:
|
|
212
|
+
row_dict = row.asDict()
|
|
213
|
+
self._pipelines_cache[row_dict["pipeline_name"]] = row_dict
|
|
214
|
+
elif self.engine:
|
|
215
|
+
df = self._read_local_table(self.tables["meta_pipelines"])
|
|
216
|
+
if not df.empty and "pipeline_name" in df.columns:
|
|
217
|
+
for _, row in df.iterrows():
|
|
218
|
+
self._pipelines_cache[row["pipeline_name"]] = row.to_dict()
|
|
219
|
+
except Exception as e:
|
|
220
|
+
logger.debug(f"Could not cache pipelines: {e}")
|
|
221
|
+
self._pipelines_cache = {}
|
|
222
|
+
|
|
223
|
+
return self._pipelines_cache
|
|
224
|
+
|
|
225
|
+
def _get_all_nodes_cached(self) -> Dict[str, Dict[str, str]]:
|
|
226
|
+
"""Get all nodes grouped by pipeline with caching."""
|
|
227
|
+
if self._nodes_cache is not None:
|
|
228
|
+
return self._nodes_cache
|
|
229
|
+
|
|
230
|
+
self._nodes_cache = {}
|
|
231
|
+
if not self.spark and not self.engine:
|
|
232
|
+
return self._nodes_cache
|
|
233
|
+
|
|
234
|
+
try:
|
|
235
|
+
if self.spark:
|
|
236
|
+
df = self.spark.read.format("delta").load(self.tables["meta_nodes"])
|
|
237
|
+
rows = df.select("pipeline_name", "node_name", "version_hash").collect()
|
|
238
|
+
for row in rows:
|
|
239
|
+
p_name = row["pipeline_name"]
|
|
240
|
+
if p_name not in self._nodes_cache:
|
|
241
|
+
self._nodes_cache[p_name] = {}
|
|
242
|
+
self._nodes_cache[p_name][row["node_name"]] = row["version_hash"]
|
|
243
|
+
elif self.engine:
|
|
244
|
+
df = self._read_local_table(self.tables["meta_nodes"])
|
|
245
|
+
if not df.empty and "pipeline_name" in df.columns:
|
|
246
|
+
for _, row in df.iterrows():
|
|
247
|
+
p_name = row["pipeline_name"]
|
|
248
|
+
if p_name not in self._nodes_cache:
|
|
249
|
+
self._nodes_cache[p_name] = {}
|
|
250
|
+
self._nodes_cache[p_name][row["node_name"]] = row["version_hash"]
|
|
251
|
+
except Exception as e:
|
|
252
|
+
logger.debug(f"Could not cache nodes: {e}")
|
|
253
|
+
self._nodes_cache = {}
|
|
254
|
+
|
|
255
|
+
return self._nodes_cache
|
|
256
|
+
|
|
257
|
+
def bootstrap(self) -> None:
|
|
258
|
+
"""
|
|
259
|
+
Ensures all system tables exist. Creates them if missing.
|
|
260
|
+
"""
|
|
261
|
+
if not self.spark and not self.engine:
|
|
262
|
+
logger.warning(
|
|
263
|
+
"Neither SparkSession nor Engine available. Skipping System Catalog bootstrap."
|
|
264
|
+
)
|
|
265
|
+
return
|
|
266
|
+
|
|
267
|
+
logger.info(f"Bootstrapping System Catalog at {self.base_path}...")
|
|
268
|
+
|
|
269
|
+
self._ensure_table("meta_tables", self._get_schema_meta_tables())
|
|
270
|
+
self._ensure_table(
|
|
271
|
+
"meta_runs",
|
|
272
|
+
self._get_schema_meta_runs(),
|
|
273
|
+
partition_cols=["pipeline_name", "date"],
|
|
274
|
+
schema_evolution=True,
|
|
275
|
+
)
|
|
276
|
+
self._ensure_table("meta_patterns", self._get_schema_meta_patterns())
|
|
277
|
+
self._ensure_table("meta_metrics", self._get_schema_meta_metrics())
|
|
278
|
+
self._ensure_table("meta_state", self._get_schema_meta_state())
|
|
279
|
+
self._ensure_table("meta_pipelines", self._get_schema_meta_pipelines())
|
|
280
|
+
self._ensure_table("meta_nodes", self._get_schema_meta_nodes())
|
|
281
|
+
self._ensure_table("meta_schemas", self._get_schema_meta_schemas())
|
|
282
|
+
self._ensure_table("meta_lineage", self._get_schema_meta_lineage())
|
|
283
|
+
self._ensure_table("meta_outputs", self._get_schema_meta_outputs())
|
|
284
|
+
|
|
285
|
+
def _ensure_table(
|
|
286
|
+
self,
|
|
287
|
+
name: str,
|
|
288
|
+
schema: StructType,
|
|
289
|
+
partition_cols: Optional[list] = None,
|
|
290
|
+
schema_evolution: bool = False,
|
|
291
|
+
) -> None:
|
|
292
|
+
path = self.tables[name]
|
|
293
|
+
if not self._table_exists(path):
|
|
294
|
+
logger.info(f"Creating system table: {name} at {path}")
|
|
295
|
+
|
|
296
|
+
if self.spark:
|
|
297
|
+
# Create empty DataFrame with schema
|
|
298
|
+
writer = self.spark.createDataFrame([], schema).write.format("delta")
|
|
299
|
+
if partition_cols:
|
|
300
|
+
writer = writer.partitionBy(*partition_cols)
|
|
301
|
+
writer.save(path)
|
|
302
|
+
elif self.engine and self.engine.name == "pandas":
|
|
303
|
+
# Pandas/Local Mode
|
|
304
|
+
import os
|
|
305
|
+
|
|
306
|
+
import pandas as pd
|
|
307
|
+
|
|
308
|
+
os.makedirs(path, exist_ok=True)
|
|
309
|
+
|
|
310
|
+
# Attempt to create Delta Table if library exists (using Arrow for strict typing)
|
|
311
|
+
try:
|
|
312
|
+
import pyarrow as pa
|
|
313
|
+
from deltalake import write_deltalake
|
|
314
|
+
|
|
315
|
+
def map_to_arrow_type(dtype):
|
|
316
|
+
s_type = str(dtype)
|
|
317
|
+
if isinstance(dtype, StringType) or "StringType" in s_type:
|
|
318
|
+
return pa.string()
|
|
319
|
+
if isinstance(dtype, LongType) or "LongType" in s_type:
|
|
320
|
+
return pa.int64()
|
|
321
|
+
if isinstance(dtype, DoubleType) or "DoubleType" in s_type:
|
|
322
|
+
return pa.float64()
|
|
323
|
+
if isinstance(dtype, TimestampType) or "TimestampType" in s_type:
|
|
324
|
+
return pa.timestamp("us", tz="UTC")
|
|
325
|
+
if isinstance(dtype, DateType) or "DateType" in s_type:
|
|
326
|
+
return pa.date32()
|
|
327
|
+
if isinstance(dtype, ArrayType) or "ArrayType" in s_type:
|
|
328
|
+
# Access element type safely
|
|
329
|
+
elem_type = getattr(dtype, "elementType", StringType())
|
|
330
|
+
return pa.list_(map_to_arrow_type(elem_type))
|
|
331
|
+
return pa.string()
|
|
332
|
+
|
|
333
|
+
# Define Arrow Schema
|
|
334
|
+
arrow_fields = []
|
|
335
|
+
for field in schema.fields:
|
|
336
|
+
arrow_fields.append(pa.field(field.name, map_to_arrow_type(field.dataType)))
|
|
337
|
+
|
|
338
|
+
arrow_schema = pa.schema(arrow_fields)
|
|
339
|
+
|
|
340
|
+
# Create Empty Table
|
|
341
|
+
# Note: We pass a dict of empty lists. PyArrow handles the rest using schema.
|
|
342
|
+
data = {f.name: [] for f in schema.fields}
|
|
343
|
+
table = pa.Table.from_pydict(data, schema=arrow_schema)
|
|
344
|
+
|
|
345
|
+
storage_opts = self._get_storage_options()
|
|
346
|
+
write_deltalake(
|
|
347
|
+
path,
|
|
348
|
+
table,
|
|
349
|
+
mode="overwrite",
|
|
350
|
+
partition_by=partition_cols,
|
|
351
|
+
storage_options=storage_opts if storage_opts else None,
|
|
352
|
+
)
|
|
353
|
+
logger.info(f"Initialized Delta table: {name}")
|
|
354
|
+
|
|
355
|
+
except ImportError:
|
|
356
|
+
# Fallback to Pandas/Parquet if Delta/Arrow not available
|
|
357
|
+
# Prepare empty DataFrame with correct columns and types
|
|
358
|
+
data = {}
|
|
359
|
+
|
|
360
|
+
def get_pd_type(dtype):
|
|
361
|
+
if isinstance(dtype, StringType) or "StringType" in str(type(dtype)):
|
|
362
|
+
return "string"
|
|
363
|
+
if isinstance(dtype, LongType) or "LongType" in str(type(dtype)):
|
|
364
|
+
return "int64"
|
|
365
|
+
if isinstance(dtype, DoubleType) or "DoubleType" in str(type(dtype)):
|
|
366
|
+
return "float64"
|
|
367
|
+
if isinstance(dtype, TimestampType) or "TimestampType" in str(type(dtype)):
|
|
368
|
+
return "datetime64[ns, UTC]"
|
|
369
|
+
if isinstance(dtype, DateType) or "DateType" in str(type(dtype)):
|
|
370
|
+
return "datetime64[ns]"
|
|
371
|
+
return "object"
|
|
372
|
+
|
|
373
|
+
for field in schema.fields:
|
|
374
|
+
pd_type = get_pd_type(field.dataType)
|
|
375
|
+
data[field.name] = pd.Series([], dtype=pd_type)
|
|
376
|
+
|
|
377
|
+
df = pd.DataFrame(data)
|
|
378
|
+
|
|
379
|
+
# Fallback to Parquet
|
|
380
|
+
# Pandas to_parquet with partition_cols
|
|
381
|
+
df.to_parquet(path, partition_cols=partition_cols)
|
|
382
|
+
logger.info(f"Initialized Parquet table: {name} (Delta library not found)")
|
|
383
|
+
except Exception as e:
|
|
384
|
+
logger.error(f"Failed to create local system table {name}: {e}")
|
|
385
|
+
raise e
|
|
386
|
+
else:
|
|
387
|
+
# If table exists and schema evolution is requested (only for Pandas/Delta mode currently)
|
|
388
|
+
if schema_evolution and self.engine and self.engine.name == "pandas":
|
|
389
|
+
try:
|
|
390
|
+
from deltalake import DeltaTable, write_deltalake
|
|
391
|
+
|
|
392
|
+
storage_opts = self._get_storage_options()
|
|
393
|
+
_ = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
|
|
394
|
+
# Basic schema evolution: overwrite schema if we are appending?
|
|
395
|
+
# For now, let's just log. True evolution is complex.
|
|
396
|
+
# A simple fix for "fields mismatch" is to allow schema merge.
|
|
397
|
+
pass
|
|
398
|
+
except ImportError:
|
|
399
|
+
pass
|
|
400
|
+
logger.debug(f"System table exists: {name}")
|
|
401
|
+
self._migrate_schema_if_needed(name, path, schema)
|
|
402
|
+
|
|
403
|
+
def _migrate_schema_if_needed(self, name: str, path: str, expected_schema: StructType) -> None:
|
|
404
|
+
"""
|
|
405
|
+
Migrate table schema if there are incompatible type changes.
|
|
406
|
+
This handles cases like ArrayType -> StringType migrations.
|
|
407
|
+
"""
|
|
408
|
+
try:
|
|
409
|
+
if self.spark:
|
|
410
|
+
existing_df = self.spark.read.format("delta").load(path)
|
|
411
|
+
existing_fields = {f.name: f.dataType for f in existing_df.schema.fields}
|
|
412
|
+
expected_fields = {f.name: f.dataType for f in expected_schema.fields}
|
|
413
|
+
|
|
414
|
+
needs_migration = False
|
|
415
|
+
for field_name, expected_type in expected_fields.items():
|
|
416
|
+
if field_name in existing_fields:
|
|
417
|
+
existing_type = existing_fields[field_name]
|
|
418
|
+
if type(existing_type) is not type(expected_type):
|
|
419
|
+
logger.info(
|
|
420
|
+
f"Schema migration needed for {name}.{field_name}: "
|
|
421
|
+
f"{existing_type} -> {expected_type}"
|
|
422
|
+
)
|
|
423
|
+
needs_migration = True
|
|
424
|
+
break
|
|
425
|
+
|
|
426
|
+
if needs_migration:
|
|
427
|
+
logger.info(f"Migrating schema for {name}...")
|
|
428
|
+
migrated_df = existing_df
|
|
429
|
+
for field in expected_schema.fields:
|
|
430
|
+
if field.name in existing_fields:
|
|
431
|
+
existing_type = existing_fields[field.name]
|
|
432
|
+
if not isinstance(existing_type, type(field.dataType)):
|
|
433
|
+
from pyspark.sql import functions as F
|
|
434
|
+
|
|
435
|
+
if isinstance(existing_type, ArrayType) and isinstance(
|
|
436
|
+
field.dataType, StringType
|
|
437
|
+
):
|
|
438
|
+
migrated_df = migrated_df.withColumn(
|
|
439
|
+
field.name, F.to_json(F.col(field.name))
|
|
440
|
+
)
|
|
441
|
+
|
|
442
|
+
migrated_df.write.format("delta").mode("overwrite").option(
|
|
443
|
+
"overwriteSchema", "true"
|
|
444
|
+
).save(path)
|
|
445
|
+
logger.info(f"Schema migration completed for {name}")
|
|
446
|
+
|
|
447
|
+
elif self.engine and self.engine.name == "pandas":
|
|
448
|
+
from deltalake import DeltaTable
|
|
449
|
+
|
|
450
|
+
storage_opts = self._get_storage_options()
|
|
451
|
+
dt = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
|
|
452
|
+
existing_schema = dt.schema()
|
|
453
|
+
existing_fields = {f.name: f.type for f in existing_schema.fields}
|
|
454
|
+
|
|
455
|
+
needs_migration = False
|
|
456
|
+
for field in expected_schema.fields:
|
|
457
|
+
if field.name in existing_fields:
|
|
458
|
+
existing_type_str = str(existing_fields[field.name])
|
|
459
|
+
expected_type_str = field.dataType.simpleString()
|
|
460
|
+
if "array" in existing_type_str.lower() and expected_type_str == "string":
|
|
461
|
+
needs_migration = True
|
|
462
|
+
break
|
|
463
|
+
|
|
464
|
+
if needs_migration:
|
|
465
|
+
logger.info(f"Migrating schema for {name}...")
|
|
466
|
+
import json
|
|
467
|
+
|
|
468
|
+
df = dt.to_pandas()
|
|
469
|
+
for field in expected_schema.fields:
|
|
470
|
+
if field.name in df.columns and field.name in existing_fields:
|
|
471
|
+
existing_type_str = str(existing_fields[field.name])
|
|
472
|
+
if "array" in existing_type_str.lower():
|
|
473
|
+
df[field.name] = df[field.name].apply(
|
|
474
|
+
lambda x: json.dumps(x) if isinstance(x, list) else x
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
from deltalake import write_deltalake
|
|
478
|
+
|
|
479
|
+
storage_opts = self._get_storage_options()
|
|
480
|
+
write_deltalake(
|
|
481
|
+
path,
|
|
482
|
+
df,
|
|
483
|
+
mode="overwrite",
|
|
484
|
+
overwrite_schema=True,
|
|
485
|
+
storage_options=storage_opts if storage_opts else None,
|
|
486
|
+
)
|
|
487
|
+
logger.info(f"Schema migration completed for {name}")
|
|
488
|
+
|
|
489
|
+
except Exception as e:
|
|
490
|
+
logger.warning(f"Schema migration check failed for {name}: {e}")
|
|
491
|
+
|
|
492
|
+
def _table_exists(self, path: str) -> bool:
|
|
493
|
+
if self.spark:
|
|
494
|
+
try:
|
|
495
|
+
self.spark.read.format("delta").load(path).limit(0).collect()
|
|
496
|
+
return True
|
|
497
|
+
except Exception as e:
|
|
498
|
+
# If AnalysisException or "Path does not exist", return False
|
|
499
|
+
# Otherwise, if it's an auth error, we might want to warn.
|
|
500
|
+
msg = str(e).lower()
|
|
501
|
+
if (
|
|
502
|
+
"path does not exist" in msg
|
|
503
|
+
or "filenotfound" in msg
|
|
504
|
+
or "analysisexception" in type(e).__name__.lower()
|
|
505
|
+
):
|
|
506
|
+
return False
|
|
507
|
+
|
|
508
|
+
logger.warning(f"Error checking if table exists at {path}: {e}")
|
|
509
|
+
return False
|
|
510
|
+
elif self.engine:
|
|
511
|
+
import os
|
|
512
|
+
|
|
513
|
+
# For cloud paths, try to load with delta-rs
|
|
514
|
+
if path.startswith(("abfss://", "az://", "s3://", "gs://", "https://")):
|
|
515
|
+
try:
|
|
516
|
+
from deltalake import DeltaTable
|
|
517
|
+
|
|
518
|
+
storage_opts = self._get_storage_options()
|
|
519
|
+
DeltaTable(path, storage_options=storage_opts if storage_opts else None)
|
|
520
|
+
return True
|
|
521
|
+
except Exception:
|
|
522
|
+
return False
|
|
523
|
+
|
|
524
|
+
# For local paths, check if directory exists and has content
|
|
525
|
+
if not os.path.exists(path):
|
|
526
|
+
return False
|
|
527
|
+
if os.path.isdir(path):
|
|
528
|
+
# Check if empty or contains relevant files
|
|
529
|
+
if not os.listdir(path):
|
|
530
|
+
return False
|
|
531
|
+
return True
|
|
532
|
+
return False
|
|
533
|
+
return False
|
|
534
|
+
|
|
535
|
+
def _get_schema_meta_tables(self) -> StructType:
|
|
536
|
+
"""
|
|
537
|
+
meta_tables (Inventory): Tracks physical assets.
|
|
538
|
+
"""
|
|
539
|
+
return StructType(
|
|
540
|
+
[
|
|
541
|
+
StructField("project_name", StringType(), True),
|
|
542
|
+
StructField("table_name", StringType(), True),
|
|
543
|
+
StructField("path", StringType(), True),
|
|
544
|
+
StructField("format", StringType(), True),
|
|
545
|
+
StructField("pattern_type", StringType(), True),
|
|
546
|
+
StructField("schema_hash", StringType(), True),
|
|
547
|
+
StructField("updated_at", TimestampType(), True),
|
|
548
|
+
]
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
def _get_schema_meta_runs(self) -> StructType:
|
|
552
|
+
"""
|
|
553
|
+
meta_runs (Observability): Tracks execution history.
|
|
554
|
+
"""
|
|
555
|
+
return StructType(
|
|
556
|
+
[
|
|
557
|
+
StructField("run_id", StringType(), True),
|
|
558
|
+
StructField("pipeline_name", StringType(), True),
|
|
559
|
+
StructField("node_name", StringType(), True),
|
|
560
|
+
StructField("status", StringType(), True),
|
|
561
|
+
StructField("rows_processed", LongType(), True),
|
|
562
|
+
StructField("duration_ms", LongType(), True),
|
|
563
|
+
StructField("metrics_json", StringType(), True),
|
|
564
|
+
StructField("environment", StringType(), True),
|
|
565
|
+
StructField("timestamp", TimestampType(), True),
|
|
566
|
+
StructField("date", DateType(), True),
|
|
567
|
+
]
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
def _get_schema_meta_patterns(self) -> StructType:
|
|
571
|
+
"""
|
|
572
|
+
meta_patterns (Governance): Tracks pattern compliance.
|
|
573
|
+
"""
|
|
574
|
+
return StructType(
|
|
575
|
+
[
|
|
576
|
+
StructField("table_name", StringType(), True),
|
|
577
|
+
StructField("pattern_type", StringType(), True),
|
|
578
|
+
StructField("configuration", StringType(), True),
|
|
579
|
+
StructField("compliance_score", DoubleType(), True),
|
|
580
|
+
]
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
def _get_schema_meta_metrics(self) -> StructType:
|
|
584
|
+
"""
|
|
585
|
+
meta_metrics (Semantics): Tracks business logic.
|
|
586
|
+
Note: dimensions is stored as JSON string for cross-engine portability.
|
|
587
|
+
"""
|
|
588
|
+
return StructType(
|
|
589
|
+
[
|
|
590
|
+
StructField("metric_name", StringType(), True),
|
|
591
|
+
StructField("definition_sql", StringType(), True),
|
|
592
|
+
StructField("dimensions", StringType(), True),
|
|
593
|
+
StructField("source_table", StringType(), True),
|
|
594
|
+
]
|
|
595
|
+
)
|
|
596
|
+
|
|
597
|
+
def _get_schema_meta_state(self) -> StructType:
|
|
598
|
+
"""
|
|
599
|
+
meta_state (HWM Key-Value Store): Tracks high-water marks for incremental loads.
|
|
600
|
+
Uses a generic key/value pattern for flexibility.
|
|
601
|
+
"""
|
|
602
|
+
return StructType(
|
|
603
|
+
[
|
|
604
|
+
StructField("key", StringType(), False),
|
|
605
|
+
StructField("value", StringType(), True),
|
|
606
|
+
StructField("environment", StringType(), True),
|
|
607
|
+
StructField("updated_at", TimestampType(), True),
|
|
608
|
+
]
|
|
609
|
+
)
|
|
610
|
+
|
|
611
|
+
def _get_schema_meta_pipelines(self) -> StructType:
|
|
612
|
+
"""
|
|
613
|
+
meta_pipelines (Definitions): Tracks pipeline configurations.
|
|
614
|
+
"""
|
|
615
|
+
return StructType(
|
|
616
|
+
[
|
|
617
|
+
StructField("pipeline_name", StringType(), True),
|
|
618
|
+
StructField("version_hash", StringType(), True),
|
|
619
|
+
StructField("description", StringType(), True),
|
|
620
|
+
StructField("layer", StringType(), True),
|
|
621
|
+
StructField("schedule", StringType(), True),
|
|
622
|
+
StructField("tags_json", StringType(), True),
|
|
623
|
+
StructField("updated_at", TimestampType(), True),
|
|
624
|
+
]
|
|
625
|
+
)
|
|
626
|
+
|
|
627
|
+
def _get_schema_meta_nodes(self) -> StructType:
|
|
628
|
+
"""
|
|
629
|
+
meta_nodes (Definitions): Tracks node configurations within pipelines.
|
|
630
|
+
"""
|
|
631
|
+
return StructType(
|
|
632
|
+
[
|
|
633
|
+
StructField("pipeline_name", StringType(), True),
|
|
634
|
+
StructField("node_name", StringType(), True),
|
|
635
|
+
StructField("version_hash", StringType(), True),
|
|
636
|
+
StructField("type", StringType(), True), # read/transform/write
|
|
637
|
+
StructField("config_json", StringType(), True),
|
|
638
|
+
StructField("updated_at", TimestampType(), True),
|
|
639
|
+
]
|
|
640
|
+
)
|
|
641
|
+
|
|
642
|
+
def _get_schema_meta_schemas(self) -> StructType:
|
|
643
|
+
"""
|
|
644
|
+
meta_schemas (Schema Version Tracking): Tracks schema changes over time.
|
|
645
|
+
"""
|
|
646
|
+
return StructType(
|
|
647
|
+
[
|
|
648
|
+
StructField("table_path", StringType(), False),
|
|
649
|
+
StructField("schema_version", LongType(), False),
|
|
650
|
+
StructField("schema_hash", StringType(), False),
|
|
651
|
+
StructField("columns", StringType(), False), # JSON: {"col": "type", ...}
|
|
652
|
+
StructField("captured_at", TimestampType(), False),
|
|
653
|
+
StructField("pipeline", StringType(), True),
|
|
654
|
+
StructField("node", StringType(), True),
|
|
655
|
+
StructField("run_id", StringType(), True),
|
|
656
|
+
StructField("columns_added", StringType(), True), # JSON array as string
|
|
657
|
+
StructField("columns_removed", StringType(), True), # JSON array as string
|
|
658
|
+
StructField("columns_type_changed", StringType(), True), # JSON array as string
|
|
659
|
+
]
|
|
660
|
+
)
|
|
661
|
+
|
|
662
|
+
def _get_schema_meta_lineage(self) -> StructType:
|
|
663
|
+
"""
|
|
664
|
+
meta_lineage (Cross-Pipeline Lineage): Tracks table-level lineage relationships.
|
|
665
|
+
"""
|
|
666
|
+
return StructType(
|
|
667
|
+
[
|
|
668
|
+
StructField("source_table", StringType(), False),
|
|
669
|
+
StructField("target_table", StringType(), False),
|
|
670
|
+
StructField("source_pipeline", StringType(), True),
|
|
671
|
+
StructField("source_node", StringType(), True),
|
|
672
|
+
StructField("target_pipeline", StringType(), True),
|
|
673
|
+
StructField("target_node", StringType(), True),
|
|
674
|
+
StructField("relationship", StringType(), False), # "feeds" | "derived_from"
|
|
675
|
+
StructField("last_observed", TimestampType(), False),
|
|
676
|
+
StructField("run_id", StringType(), True),
|
|
677
|
+
]
|
|
678
|
+
)
|
|
679
|
+
|
|
680
|
+
def _get_schema_meta_outputs(self) -> StructType:
|
|
681
|
+
"""
|
|
682
|
+
meta_outputs (Node Outputs Registry): Tracks output metadata for cross-pipeline dependencies.
|
|
683
|
+
|
|
684
|
+
Stores output metadata for every node that has a `write` block.
|
|
685
|
+
Primary key: (pipeline_name, node_name)
|
|
686
|
+
"""
|
|
687
|
+
return StructType(
|
|
688
|
+
[
|
|
689
|
+
StructField("pipeline_name", StringType(), False),
|
|
690
|
+
StructField("node_name", StringType(), False),
|
|
691
|
+
StructField(
|
|
692
|
+
"output_type", StringType(), False
|
|
693
|
+
), # "external_table" | "managed_table"
|
|
694
|
+
StructField("connection_name", StringType(), True),
|
|
695
|
+
StructField("path", StringType(), True),
|
|
696
|
+
StructField("format", StringType(), True),
|
|
697
|
+
StructField("table_name", StringType(), True),
|
|
698
|
+
StructField("last_run", TimestampType(), False),
|
|
699
|
+
StructField("row_count", LongType(), True),
|
|
700
|
+
StructField("updated_at", TimestampType(), False),
|
|
701
|
+
]
|
|
702
|
+
)
|
|
703
|
+
|
|
704
|
+
def get_registered_pipeline(self, pipeline_name: str) -> Optional[Dict[str, Any]]:
|
|
705
|
+
"""
|
|
706
|
+
Get existing registered pipeline record with version_hash.
|
|
707
|
+
|
|
708
|
+
Args:
|
|
709
|
+
pipeline_name: Name of the pipeline to look up
|
|
710
|
+
|
|
711
|
+
Returns:
|
|
712
|
+
Dict with pipeline record including version_hash, or None if not found
|
|
713
|
+
"""
|
|
714
|
+
pipelines_cache = self._get_all_pipelines_cached()
|
|
715
|
+
return pipelines_cache.get(pipeline_name)
|
|
716
|
+
|
|
717
|
+
def get_registered_nodes(self, pipeline_name: str) -> Dict[str, str]:
|
|
718
|
+
"""
|
|
719
|
+
Get existing registered nodes for a pipeline with their version hashes.
|
|
720
|
+
|
|
721
|
+
Args:
|
|
722
|
+
pipeline_name: Name of the pipeline to look up nodes for
|
|
723
|
+
|
|
724
|
+
Returns:
|
|
725
|
+
Dict mapping node_name -> version_hash for all registered nodes
|
|
726
|
+
"""
|
|
727
|
+
nodes_cache = self._get_all_nodes_cached()
|
|
728
|
+
return nodes_cache.get(pipeline_name, {})
|
|
729
|
+
|
|
730
|
+
def get_all_registered_pipelines(self) -> Dict[str, str]:
|
|
731
|
+
"""
|
|
732
|
+
Get all registered pipelines with their version hashes.
|
|
733
|
+
|
|
734
|
+
Returns:
|
|
735
|
+
Dict mapping pipeline_name -> version_hash
|
|
736
|
+
"""
|
|
737
|
+
pipelines_cache = self._get_all_pipelines_cached()
|
|
738
|
+
return {name: data.get("version_hash", "") for name, data in pipelines_cache.items()}
|
|
739
|
+
|
|
740
|
+
def get_all_registered_nodes(self, pipeline_names: List[str]) -> Dict[str, Dict[str, str]]:
|
|
741
|
+
"""
|
|
742
|
+
Get all registered nodes for multiple pipelines with their version hashes.
|
|
743
|
+
|
|
744
|
+
Args:
|
|
745
|
+
pipeline_names: List of pipeline names to look up nodes for
|
|
746
|
+
|
|
747
|
+
Returns:
|
|
748
|
+
Dict mapping pipeline_name -> {node_name -> version_hash}
|
|
749
|
+
"""
|
|
750
|
+
nodes_cache = self._get_all_nodes_cached()
|
|
751
|
+
return {name: nodes_cache.get(name, {}) for name in pipeline_names}
|
|
752
|
+
|
|
753
|
+
def register_pipelines_batch(
|
|
754
|
+
self,
|
|
755
|
+
records: List[Dict[str, Any]],
|
|
756
|
+
) -> None:
|
|
757
|
+
"""
|
|
758
|
+
Batch registers/upserts multiple pipeline definitions to meta_pipelines.
|
|
759
|
+
|
|
760
|
+
Args:
|
|
761
|
+
records: List of dicts with keys: pipeline_name, version_hash, description,
|
|
762
|
+
layer, schedule, tags_json
|
|
763
|
+
"""
|
|
764
|
+
if not self.spark and not self.engine:
|
|
765
|
+
return
|
|
766
|
+
|
|
767
|
+
if not records:
|
|
768
|
+
return
|
|
769
|
+
|
|
770
|
+
try:
|
|
771
|
+
from datetime import datetime, timezone
|
|
772
|
+
|
|
773
|
+
if self.spark:
|
|
774
|
+
from pyspark.sql import functions as F
|
|
775
|
+
|
|
776
|
+
schema = self._get_schema_meta_pipelines()
|
|
777
|
+
input_schema = StructType(schema.fields[:-1]) # Exclude updated_at
|
|
778
|
+
|
|
779
|
+
rows = [
|
|
780
|
+
(
|
|
781
|
+
r["pipeline_name"],
|
|
782
|
+
r["version_hash"],
|
|
783
|
+
r["description"],
|
|
784
|
+
r["layer"],
|
|
785
|
+
r["schedule"],
|
|
786
|
+
r["tags_json"],
|
|
787
|
+
)
|
|
788
|
+
for r in records
|
|
789
|
+
]
|
|
790
|
+
df = self.spark.createDataFrame(rows, input_schema)
|
|
791
|
+
df = df.withColumn("updated_at", F.current_timestamp())
|
|
792
|
+
|
|
793
|
+
view_name = "_odibi_meta_pipelines_batch_upsert"
|
|
794
|
+
df.createOrReplaceTempView(view_name)
|
|
795
|
+
|
|
796
|
+
target_path = self.tables["meta_pipelines"]
|
|
797
|
+
|
|
798
|
+
merge_sql = f"""
|
|
799
|
+
MERGE INTO delta.`{target_path}` AS target
|
|
800
|
+
USING {view_name} AS source
|
|
801
|
+
ON target.pipeline_name = source.pipeline_name
|
|
802
|
+
WHEN MATCHED THEN UPDATE SET
|
|
803
|
+
target.version_hash = source.version_hash,
|
|
804
|
+
target.description = source.description,
|
|
805
|
+
target.layer = source.layer,
|
|
806
|
+
target.schedule = source.schedule,
|
|
807
|
+
target.tags_json = source.tags_json,
|
|
808
|
+
target.updated_at = source.updated_at
|
|
809
|
+
WHEN NOT MATCHED THEN INSERT *
|
|
810
|
+
"""
|
|
811
|
+
self.spark.sql(merge_sql)
|
|
812
|
+
self.spark.catalog.dropTempView(view_name)
|
|
813
|
+
|
|
814
|
+
elif self.engine:
|
|
815
|
+
import pandas as pd
|
|
816
|
+
|
|
817
|
+
data = {
|
|
818
|
+
"pipeline_name": [r["pipeline_name"] for r in records],
|
|
819
|
+
"version_hash": [r["version_hash"] for r in records],
|
|
820
|
+
"description": [r["description"] for r in records],
|
|
821
|
+
"layer": [r["layer"] for r in records],
|
|
822
|
+
"schedule": [r["schedule"] for r in records],
|
|
823
|
+
"tags_json": [r["tags_json"] for r in records],
|
|
824
|
+
"updated_at": [datetime.now(timezone.utc) for _ in records],
|
|
825
|
+
}
|
|
826
|
+
df = pd.DataFrame(data)
|
|
827
|
+
|
|
828
|
+
def do_write():
|
|
829
|
+
self.engine.write(
|
|
830
|
+
df,
|
|
831
|
+
connection=self.connection,
|
|
832
|
+
format="delta",
|
|
833
|
+
path=self.tables["meta_pipelines"],
|
|
834
|
+
mode="upsert",
|
|
835
|
+
options={"keys": ["pipeline_name"]},
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
self._retry_with_backoff(do_write)
|
|
839
|
+
|
|
840
|
+
self._pipelines_cache = None
|
|
841
|
+
logger.debug(f"Batch registered {len(records)} pipeline(s)")
|
|
842
|
+
|
|
843
|
+
except Exception as e:
|
|
844
|
+
logger.warning(f"Failed to batch register pipelines: {e}")
|
|
845
|
+
|
|
846
|
+
def register_nodes_batch(
|
|
847
|
+
self,
|
|
848
|
+
records: List[Dict[str, Any]],
|
|
849
|
+
) -> None:
|
|
850
|
+
"""
|
|
851
|
+
Batch registers/upserts multiple node definitions to meta_nodes.
|
|
852
|
+
|
|
853
|
+
Args:
|
|
854
|
+
records: List of dicts with keys: pipeline_name, node_name, version_hash,
|
|
855
|
+
type, config_json
|
|
856
|
+
"""
|
|
857
|
+
if not self.spark and not self.engine:
|
|
858
|
+
return
|
|
859
|
+
|
|
860
|
+
if not records:
|
|
861
|
+
return
|
|
862
|
+
|
|
863
|
+
try:
|
|
864
|
+
from datetime import datetime, timezone
|
|
865
|
+
|
|
866
|
+
if self.spark:
|
|
867
|
+
from pyspark.sql import functions as F
|
|
868
|
+
|
|
869
|
+
schema = self._get_schema_meta_nodes()
|
|
870
|
+
input_schema = StructType(schema.fields[:-1]) # Exclude updated_at
|
|
871
|
+
|
|
872
|
+
rows = [
|
|
873
|
+
(
|
|
874
|
+
r["pipeline_name"],
|
|
875
|
+
r["node_name"],
|
|
876
|
+
r["version_hash"],
|
|
877
|
+
r["type"],
|
|
878
|
+
r["config_json"],
|
|
879
|
+
)
|
|
880
|
+
for r in records
|
|
881
|
+
]
|
|
882
|
+
df = self.spark.createDataFrame(rows, input_schema)
|
|
883
|
+
df = df.withColumn("updated_at", F.current_timestamp())
|
|
884
|
+
|
|
885
|
+
view_name = "_odibi_meta_nodes_batch_upsert"
|
|
886
|
+
df.createOrReplaceTempView(view_name)
|
|
887
|
+
|
|
888
|
+
target_path = self.tables["meta_nodes"]
|
|
889
|
+
|
|
890
|
+
merge_sql = f"""
|
|
891
|
+
MERGE INTO delta.`{target_path}` AS target
|
|
892
|
+
USING {view_name} AS source
|
|
893
|
+
ON target.pipeline_name = source.pipeline_name
|
|
894
|
+
AND target.node_name = source.node_name
|
|
895
|
+
WHEN MATCHED THEN UPDATE SET
|
|
896
|
+
target.version_hash = source.version_hash,
|
|
897
|
+
target.type = source.type,
|
|
898
|
+
target.config_json = source.config_json,
|
|
899
|
+
target.updated_at = source.updated_at
|
|
900
|
+
WHEN NOT MATCHED THEN INSERT *
|
|
901
|
+
"""
|
|
902
|
+
self.spark.sql(merge_sql)
|
|
903
|
+
self.spark.catalog.dropTempView(view_name)
|
|
904
|
+
|
|
905
|
+
elif self.engine:
|
|
906
|
+
import pandas as pd
|
|
907
|
+
|
|
908
|
+
data = {
|
|
909
|
+
"pipeline_name": [r["pipeline_name"] for r in records],
|
|
910
|
+
"node_name": [r["node_name"] for r in records],
|
|
911
|
+
"version_hash": [r["version_hash"] for r in records],
|
|
912
|
+
"type": [r["type"] for r in records],
|
|
913
|
+
"config_json": [r["config_json"] for r in records],
|
|
914
|
+
"updated_at": [datetime.now(timezone.utc) for _ in records],
|
|
915
|
+
}
|
|
916
|
+
df = pd.DataFrame(data)
|
|
917
|
+
|
|
918
|
+
def do_write():
|
|
919
|
+
self.engine.write(
|
|
920
|
+
df,
|
|
921
|
+
connection=self.connection,
|
|
922
|
+
format="delta",
|
|
923
|
+
path=self.tables["meta_nodes"],
|
|
924
|
+
mode="upsert",
|
|
925
|
+
options={"keys": ["pipeline_name", "node_name"]},
|
|
926
|
+
)
|
|
927
|
+
|
|
928
|
+
self._retry_with_backoff(do_write)
|
|
929
|
+
|
|
930
|
+
self._nodes_cache = None
|
|
931
|
+
logger.debug(f"Batch registered {len(records)} node(s)")
|
|
932
|
+
|
|
933
|
+
except Exception as e:
|
|
934
|
+
logger.warning(f"Failed to batch register nodes: {e}")
|
|
935
|
+
|
|
936
|
+
def register_outputs_batch(
|
|
937
|
+
self,
|
|
938
|
+
records: List[Dict[str, Any]],
|
|
939
|
+
) -> None:
|
|
940
|
+
"""
|
|
941
|
+
Batch registers/upserts multiple node outputs to meta_outputs.
|
|
942
|
+
|
|
943
|
+
Uses MERGE INTO for efficient upsert. This is performance critical -
|
|
944
|
+
all outputs are collected during pipeline execution and written in a
|
|
945
|
+
single batch at the end.
|
|
946
|
+
|
|
947
|
+
Args:
|
|
948
|
+
records: List of dicts with keys:
|
|
949
|
+
- pipeline_name: str (pipeline identifier)
|
|
950
|
+
- node_name: str (node identifier)
|
|
951
|
+
- output_type: str ("external_table" | "managed_table")
|
|
952
|
+
- connection_name: str (nullable, for external tables)
|
|
953
|
+
- path: str (nullable, storage path)
|
|
954
|
+
- format: str (delta, parquet, etc.)
|
|
955
|
+
- table_name: str (nullable, registered table name)
|
|
956
|
+
- last_run: datetime (execution timestamp)
|
|
957
|
+
- row_count: int (nullable)
|
|
958
|
+
"""
|
|
959
|
+
if not self.spark and not self.engine:
|
|
960
|
+
return
|
|
961
|
+
|
|
962
|
+
if not records:
|
|
963
|
+
return
|
|
964
|
+
|
|
965
|
+
try:
|
|
966
|
+
if self.spark:
|
|
967
|
+
from pyspark.sql import functions as F
|
|
968
|
+
|
|
969
|
+
schema = self._get_schema_meta_outputs()
|
|
970
|
+
input_schema = StructType(schema.fields[:-1]) # Exclude updated_at
|
|
971
|
+
|
|
972
|
+
rows = [
|
|
973
|
+
(
|
|
974
|
+
r["pipeline_name"],
|
|
975
|
+
r["node_name"],
|
|
976
|
+
r["output_type"],
|
|
977
|
+
r.get("connection_name"),
|
|
978
|
+
r.get("path"),
|
|
979
|
+
r.get("format"),
|
|
980
|
+
r.get("table_name"),
|
|
981
|
+
r["last_run"],
|
|
982
|
+
r.get("row_count"),
|
|
983
|
+
)
|
|
984
|
+
for r in records
|
|
985
|
+
]
|
|
986
|
+
df = self.spark.createDataFrame(rows, input_schema)
|
|
987
|
+
df = df.withColumn("updated_at", F.current_timestamp())
|
|
988
|
+
|
|
989
|
+
view_name = "_odibi_meta_outputs_batch_upsert"
|
|
990
|
+
df.createOrReplaceTempView(view_name)
|
|
991
|
+
|
|
992
|
+
target_path = self.tables["meta_outputs"]
|
|
993
|
+
|
|
994
|
+
merge_sql = f"""
|
|
995
|
+
MERGE INTO delta.`{target_path}` AS target
|
|
996
|
+
USING {view_name} AS source
|
|
997
|
+
ON target.pipeline_name = source.pipeline_name
|
|
998
|
+
AND target.node_name = source.node_name
|
|
999
|
+
WHEN MATCHED THEN UPDATE SET
|
|
1000
|
+
target.output_type = source.output_type,
|
|
1001
|
+
target.connection_name = source.connection_name,
|
|
1002
|
+
target.path = source.path,
|
|
1003
|
+
target.format = source.format,
|
|
1004
|
+
target.table_name = source.table_name,
|
|
1005
|
+
target.last_run = source.last_run,
|
|
1006
|
+
target.row_count = source.row_count,
|
|
1007
|
+
target.updated_at = source.updated_at
|
|
1008
|
+
WHEN NOT MATCHED THEN INSERT *
|
|
1009
|
+
"""
|
|
1010
|
+
self.spark.sql(merge_sql)
|
|
1011
|
+
self.spark.catalog.dropTempView(view_name)
|
|
1012
|
+
|
|
1013
|
+
elif self.engine:
|
|
1014
|
+
import pandas as pd
|
|
1015
|
+
|
|
1016
|
+
data = {
|
|
1017
|
+
"pipeline_name": [r["pipeline_name"] for r in records],
|
|
1018
|
+
"node_name": [r["node_name"] for r in records],
|
|
1019
|
+
"output_type": [r["output_type"] for r in records],
|
|
1020
|
+
"connection_name": [r.get("connection_name") for r in records],
|
|
1021
|
+
"path": [r.get("path") for r in records],
|
|
1022
|
+
"format": [r.get("format") for r in records],
|
|
1023
|
+
"table_name": [r.get("table_name") for r in records],
|
|
1024
|
+
"last_run": [r["last_run"] for r in records],
|
|
1025
|
+
"row_count": [r.get("row_count") for r in records],
|
|
1026
|
+
"updated_at": [datetime.now(timezone.utc) for _ in records],
|
|
1027
|
+
}
|
|
1028
|
+
df = pd.DataFrame(data)
|
|
1029
|
+
|
|
1030
|
+
def do_write():
|
|
1031
|
+
self.engine.write(
|
|
1032
|
+
df,
|
|
1033
|
+
connection=self.connection,
|
|
1034
|
+
format="delta",
|
|
1035
|
+
path=self.tables["meta_outputs"],
|
|
1036
|
+
mode="upsert",
|
|
1037
|
+
options={"keys": ["pipeline_name", "node_name"]},
|
|
1038
|
+
)
|
|
1039
|
+
|
|
1040
|
+
self._retry_with_backoff(do_write)
|
|
1041
|
+
|
|
1042
|
+
self._outputs_cache = None
|
|
1043
|
+
logger.debug(f"Batch registered {len(records)} output(s)")
|
|
1044
|
+
|
|
1045
|
+
except Exception as e:
|
|
1046
|
+
logger.warning(f"Failed to batch register outputs: {e}")
|
|
1047
|
+
|
|
1048
|
+
def _get_all_outputs_cached(self) -> Dict[str, Dict[str, Any]]:
|
|
1049
|
+
"""
|
|
1050
|
+
Get all outputs with caching.
|
|
1051
|
+
|
|
1052
|
+
Returns:
|
|
1053
|
+
Dict mapping "{pipeline_name}.{node_name}" -> output record
|
|
1054
|
+
"""
|
|
1055
|
+
# Thread-safe check: if cache exists and is populated, return it
|
|
1056
|
+
if self._outputs_cache is not None:
|
|
1057
|
+
return self._outputs_cache
|
|
1058
|
+
|
|
1059
|
+
# Build cache in a local variable first to avoid race conditions
|
|
1060
|
+
cache: Dict[str, Dict[str, Any]] = {}
|
|
1061
|
+
if not self.spark and not self.engine:
|
|
1062
|
+
self._outputs_cache = cache
|
|
1063
|
+
return self._outputs_cache
|
|
1064
|
+
|
|
1065
|
+
try:
|
|
1066
|
+
if self.spark:
|
|
1067
|
+
df = self.spark.read.format("delta").load(self.tables["meta_outputs"])
|
|
1068
|
+
rows = df.collect()
|
|
1069
|
+
for row in rows:
|
|
1070
|
+
row_dict = row.asDict()
|
|
1071
|
+
key = f"{row_dict['pipeline_name']}.{row_dict['node_name']}"
|
|
1072
|
+
cache[key] = row_dict
|
|
1073
|
+
elif self.engine:
|
|
1074
|
+
df = self._read_local_table(self.tables["meta_outputs"])
|
|
1075
|
+
if not df.empty and "pipeline_name" in df.columns:
|
|
1076
|
+
for _, row in df.iterrows():
|
|
1077
|
+
key = f"{row['pipeline_name']}.{row['node_name']}"
|
|
1078
|
+
cache[key] = row.to_dict()
|
|
1079
|
+
except Exception as e:
|
|
1080
|
+
logger.warning(f"Could not cache outputs from {self.tables.get('meta_outputs')}: {e}")
|
|
1081
|
+
|
|
1082
|
+
# Atomic assignment after building complete cache
|
|
1083
|
+
self._outputs_cache = cache
|
|
1084
|
+
return self._outputs_cache
|
|
1085
|
+
|
|
1086
|
+
def get_node_output(
|
|
1087
|
+
self,
|
|
1088
|
+
pipeline_name: str,
|
|
1089
|
+
node_name: str,
|
|
1090
|
+
) -> Optional[Dict[str, Any]]:
|
|
1091
|
+
"""
|
|
1092
|
+
Retrieves output metadata for a specific node.
|
|
1093
|
+
|
|
1094
|
+
Used for cross-pipeline dependency resolution ($pipeline.node references).
|
|
1095
|
+
|
|
1096
|
+
Args:
|
|
1097
|
+
pipeline_name: Name of the pipeline
|
|
1098
|
+
node_name: Name of the node
|
|
1099
|
+
|
|
1100
|
+
Returns:
|
|
1101
|
+
Dict with output metadata or None if not found.
|
|
1102
|
+
Keys: pipeline_name, node_name, output_type, connection_name,
|
|
1103
|
+
path, format, table_name, last_run, row_count
|
|
1104
|
+
"""
|
|
1105
|
+
outputs_cache = self._get_all_outputs_cached()
|
|
1106
|
+
key = f"{pipeline_name}.{node_name}"
|
|
1107
|
+
return outputs_cache.get(key)
|
|
1108
|
+
|
|
1109
|
+
def register_outputs_from_config(
|
|
1110
|
+
self,
|
|
1111
|
+
pipeline_config: Any,
|
|
1112
|
+
) -> int:
|
|
1113
|
+
"""
|
|
1114
|
+
Pre-register node outputs from pipeline config without running the pipeline.
|
|
1115
|
+
|
|
1116
|
+
Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
|
|
1117
|
+
and registers them to meta_outputs. This enables cross-pipeline references
|
|
1118
|
+
without requiring the source pipeline to have run first.
|
|
1119
|
+
|
|
1120
|
+
Args:
|
|
1121
|
+
pipeline_config: Pipeline configuration object with nodes
|
|
1122
|
+
|
|
1123
|
+
Returns:
|
|
1124
|
+
Number of outputs registered
|
|
1125
|
+
"""
|
|
1126
|
+
from datetime import datetime
|
|
1127
|
+
|
|
1128
|
+
records = []
|
|
1129
|
+
pipeline_name = pipeline_config.pipeline
|
|
1130
|
+
|
|
1131
|
+
for node in pipeline_config.nodes:
|
|
1132
|
+
output_info = self._extract_node_output_info(node)
|
|
1133
|
+
if output_info:
|
|
1134
|
+
records.append(
|
|
1135
|
+
{
|
|
1136
|
+
"pipeline_name": pipeline_name,
|
|
1137
|
+
"node_name": node.name,
|
|
1138
|
+
"output_type": output_info.get("output_type", "external_table"),
|
|
1139
|
+
"connection_name": output_info.get("connection"),
|
|
1140
|
+
"path": output_info.get("path"),
|
|
1141
|
+
"format": output_info.get("format", "delta"),
|
|
1142
|
+
"table_name": output_info.get("register_table"),
|
|
1143
|
+
"last_run": datetime.now(),
|
|
1144
|
+
"row_count": None,
|
|
1145
|
+
}
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
if records:
|
|
1149
|
+
self.register_outputs_batch(records)
|
|
1150
|
+
self._outputs_cache = None
|
|
1151
|
+
|
|
1152
|
+
return len(records)
|
|
1153
|
+
|
|
1154
|
+
def _extract_node_output_info(self, node_config: Any) -> Optional[Dict[str, Any]]:
|
|
1155
|
+
"""
|
|
1156
|
+
Extract output location from a node config.
|
|
1157
|
+
|
|
1158
|
+
Checks in order of precedence:
|
|
1159
|
+
1. Explicit write block
|
|
1160
|
+
2. merge/scd2 in transform steps
|
|
1161
|
+
3. Top-level merge/scd2 transformer
|
|
1162
|
+
|
|
1163
|
+
Args:
|
|
1164
|
+
node_config: Node configuration object
|
|
1165
|
+
|
|
1166
|
+
Returns:
|
|
1167
|
+
Dict with connection, path, format, register_table or None
|
|
1168
|
+
"""
|
|
1169
|
+
if node_config.write:
|
|
1170
|
+
write_cfg = node_config.write
|
|
1171
|
+
output_type = (
|
|
1172
|
+
"managed_table" if write_cfg.table and not write_cfg.path else "external_table"
|
|
1173
|
+
)
|
|
1174
|
+
return {
|
|
1175
|
+
"connection": write_cfg.connection,
|
|
1176
|
+
"path": write_cfg.path,
|
|
1177
|
+
"format": write_cfg.format or "delta",
|
|
1178
|
+
"register_table": write_cfg.register_table or write_cfg.table,
|
|
1179
|
+
"output_type": output_type,
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
output_functions = {"merge", "scd2"}
|
|
1183
|
+
|
|
1184
|
+
if node_config.transform and node_config.transform.steps:
|
|
1185
|
+
for step in reversed(node_config.transform.steps):
|
|
1186
|
+
if isinstance(step, str):
|
|
1187
|
+
continue
|
|
1188
|
+
|
|
1189
|
+
if hasattr(step, "function") and step.function in output_functions:
|
|
1190
|
+
params = step.params or {}
|
|
1191
|
+
connection = params.get("connection")
|
|
1192
|
+
path = params.get("path") or params.get("target")
|
|
1193
|
+
register_table = params.get("register_table")
|
|
1194
|
+
|
|
1195
|
+
if connection and path:
|
|
1196
|
+
return {
|
|
1197
|
+
"connection": connection,
|
|
1198
|
+
"path": path,
|
|
1199
|
+
"format": "delta",
|
|
1200
|
+
"register_table": register_table,
|
|
1201
|
+
"output_type": "managed_table" if register_table else "external_table",
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
if node_config.transformer in output_functions and node_config.params:
|
|
1205
|
+
params = node_config.params
|
|
1206
|
+
connection = params.get("connection")
|
|
1207
|
+
path = params.get("path") or params.get("target")
|
|
1208
|
+
register_table = params.get("register_table")
|
|
1209
|
+
|
|
1210
|
+
if connection and path:
|
|
1211
|
+
return {
|
|
1212
|
+
"connection": connection,
|
|
1213
|
+
"path": path,
|
|
1214
|
+
"format": "delta",
|
|
1215
|
+
"register_table": register_table,
|
|
1216
|
+
"output_type": "managed_table" if register_table else "external_table",
|
|
1217
|
+
}
|
|
1218
|
+
|
|
1219
|
+
return None
|
|
1220
|
+
|
|
1221
|
+
def _prepare_pipeline_record(self, pipeline_config: Any) -> Dict[str, Any]:
|
|
1222
|
+
"""Prepare a pipeline record for batch registration."""
|
|
1223
|
+
from odibi.utils.hashing import calculate_pipeline_hash
|
|
1224
|
+
|
|
1225
|
+
version_hash = calculate_pipeline_hash(pipeline_config)
|
|
1226
|
+
|
|
1227
|
+
all_tags = set()
|
|
1228
|
+
for node in pipeline_config.nodes:
|
|
1229
|
+
if node.tags:
|
|
1230
|
+
all_tags.update(node.tags)
|
|
1231
|
+
|
|
1232
|
+
return {
|
|
1233
|
+
"pipeline_name": pipeline_config.pipeline,
|
|
1234
|
+
"version_hash": version_hash,
|
|
1235
|
+
"description": pipeline_config.description or "",
|
|
1236
|
+
"layer": pipeline_config.layer or "",
|
|
1237
|
+
"schedule": "",
|
|
1238
|
+
"tags_json": json.dumps(list(all_tags)),
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
def register_pipeline(
|
|
1242
|
+
self,
|
|
1243
|
+
pipeline_config: Any,
|
|
1244
|
+
project_config: Optional[Any] = None,
|
|
1245
|
+
skip_if_unchanged: bool = False,
|
|
1246
|
+
) -> bool:
|
|
1247
|
+
"""
|
|
1248
|
+
Registers/Upserts a pipeline definition to meta_pipelines.
|
|
1249
|
+
|
|
1250
|
+
.. deprecated::
|
|
1251
|
+
Use :meth:`register_pipelines_batch` for better performance.
|
|
1252
|
+
|
|
1253
|
+
Args:
|
|
1254
|
+
pipeline_config: The pipeline configuration object
|
|
1255
|
+
project_config: Optional project configuration
|
|
1256
|
+
skip_if_unchanged: If True, skip write if version_hash matches existing
|
|
1257
|
+
|
|
1258
|
+
Returns:
|
|
1259
|
+
True if write was performed, False if skipped
|
|
1260
|
+
"""
|
|
1261
|
+
import warnings
|
|
1262
|
+
|
|
1263
|
+
warnings.warn(
|
|
1264
|
+
"register_pipeline is deprecated, use register_pipelines_batch for better performance",
|
|
1265
|
+
DeprecationWarning,
|
|
1266
|
+
stacklevel=2,
|
|
1267
|
+
)
|
|
1268
|
+
|
|
1269
|
+
if not self.spark and not self.engine:
|
|
1270
|
+
return False
|
|
1271
|
+
|
|
1272
|
+
try:
|
|
1273
|
+
record = self._prepare_pipeline_record(pipeline_config)
|
|
1274
|
+
|
|
1275
|
+
if skip_if_unchanged:
|
|
1276
|
+
existing = self.get_registered_pipeline(pipeline_config.pipeline)
|
|
1277
|
+
if existing and existing.get("version_hash") == record["version_hash"]:
|
|
1278
|
+
logger.debug(f"Skipping pipeline '{pipeline_config.pipeline}' - unchanged")
|
|
1279
|
+
return False
|
|
1280
|
+
|
|
1281
|
+
self.register_pipelines_batch([record])
|
|
1282
|
+
return True
|
|
1283
|
+
|
|
1284
|
+
except Exception as e:
|
|
1285
|
+
logger.warning(f"Failed to register pipeline '{pipeline_config.pipeline}': {e}")
|
|
1286
|
+
return False
|
|
1287
|
+
|
|
1288
|
+
    def _prepare_node_record(self, pipeline_name: str, node_config: Any) -> Dict[str, Any]:
        """Prepare a node record for batch registration."""
        from odibi.utils.hashing import calculate_node_hash

        version_hash = calculate_node_hash(node_config)

        node_type = "transform"
        if node_config.read:
            node_type = "read"
        if node_config.write:
            node_type = "write"

        if hasattr(node_config, "model_dump"):
            dump = node_config.model_dump(mode="json", exclude={"description", "tags", "log_level"})
        else:
            dump = node_config.model_dump(exclude={"description", "tags", "log_level"})

        return {
            "pipeline_name": pipeline_name,
            "node_name": node_config.name,
            "version_hash": version_hash,
            "type": node_type,
            "config_json": json.dumps(dump),
        }

    def register_node(
        self,
        pipeline_name: str,
        node_config: Any,
        skip_if_unchanged: bool = False,
        existing_hash: Optional[str] = None,
    ) -> bool:
        """
        Registers/Upserts a node definition to meta_nodes.

        .. deprecated::
            Use :meth:`register_nodes_batch` for better performance.

        Args:
            pipeline_name: Name of the parent pipeline
            node_config: The node configuration object
            skip_if_unchanged: If True, skip write if version_hash matches existing
            existing_hash: Pre-fetched existing hash (to avoid re-reading)

        Returns:
            True if write was performed, False if skipped
        """
        import warnings

        warnings.warn(
            "register_node is deprecated, use register_nodes_batch for better performance",
            DeprecationWarning,
            stacklevel=2,
        )

        if not self.spark and not self.engine:
            return False

        try:
            record = self._prepare_node_record(pipeline_name, node_config)

            if skip_if_unchanged:
                current_hash = existing_hash
                if current_hash is None:
                    nodes = self.get_registered_nodes(pipeline_name)
                    current_hash = nodes.get(node_config.name)

                if current_hash == record["version_hash"]:
                    logger.debug(f"Skipping node '{node_config.name}' - unchanged")
                    return False

            self.register_nodes_batch([record])
            return True

        except Exception as e:
            logger.warning(f"Failed to register node '{node_config.name}': {e}")
            return False

    def log_run(
        self,
        run_id: str,
        pipeline_name: str,
        node_name: str,
        status: str,
        rows_processed: Optional[int] = 0,
        duration_ms: Optional[int] = 0,
        metrics_json: Optional[str] = "{}",
    ) -> None:
        """
        Logs execution telemetry to meta_runs.

        Note: For better performance with multiple nodes, use log_runs_batch() instead.
        """
        environment = getattr(self.config, "environment", None)

        # SQL Server mode - direct insert
        if self.is_sql_server_mode:
            self._log_run_sql_server(
                run_id,
                pipeline_name,
                node_name,
                status,
                rows_processed,
                duration_ms,
                metrics_json,
                environment,
            )
            return

        if not self.spark and not self.engine:
            return

        def _do_log_run():
            if self.spark:
                from pyspark.sql import functions as F

                rows = [
                    (
                        run_id,
                        pipeline_name,
                        node_name,
                        status,
                        rows_processed,
                        duration_ms,
                        metrics_json,
                        environment,
                    )
                ]
                schema = self._get_schema_meta_runs()
                input_schema = StructType(schema.fields[:-2])

                df = self.spark.createDataFrame(rows, input_schema)
                df = df.withColumn("timestamp", F.current_timestamp()).withColumn(
                    "date", F.to_date(F.col("timestamp"))
                )

                df.write.format("delta").mode("append").save(self.tables["meta_runs"])
            elif self.engine:
                from datetime import datetime, timezone

                import pandas as pd

                timestamp = datetime.now(timezone.utc)

                data = {
                    "run_id": [run_id],
                    "pipeline_name": [pipeline_name],
                    "node_name": [node_name],
                    "status": [status],
                    "rows_processed": [rows_processed],
                    "duration_ms": [duration_ms],
                    "metrics_json": [metrics_json],
                    "environment": [environment],
                    "timestamp": [timestamp],
                    "date": [timestamp.date()],
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_runs"],
                    mode="append",
                )

        try:
            self._retry_with_backoff(_do_log_run)
        except Exception as e:
            logger.warning(f"Failed to log run to system catalog: {e}")

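    # Illustrative usage (not part of the package source): a minimal sketch of
    # logging one node execution, assuming `catalog` is an instance of this class
    # and the identifiers below are placeholders.
    #
    #   catalog.log_run(
    #       run_id="2024-06-01T120000-abc",
    #       pipeline_name="sales",
    #       node_name="load_orders",
    #       status="SUCCESS",
    #       rows_processed=1200,
    #       duration_ms=4500,
    #   )
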
    def _log_run_sql_server(
        self,
        run_id: str,
        pipeline_name: str,
        node_name: str,
        status: str,
        rows_processed: int,
        duration_ms: int,
        metrics_json: str,
        environment: Optional[str],
    ) -> None:
        """Log a run to SQL Server meta_runs table."""
        schema_name = getattr(self.config, "schema_name", None) or "odibi_system"
        try:
            sql = f"""
                INSERT INTO [{schema_name}].[meta_runs]
                (run_id, pipeline_name, node_name, status, rows_processed, duration_ms,
                 metrics_json, environment, timestamp, date)
                VALUES (:run_id, :pipeline, :node, :status, :rows, :duration,
                        :metrics, :env, GETUTCDATE(), CAST(GETUTCDATE() AS DATE))
            """
            self.connection.execute(
                sql,
                {
                    "run_id": run_id,
                    "pipeline": pipeline_name,
                    "node": node_name,
                    "status": status,
                    "rows": rows_processed or 0,
                    "duration": duration_ms or 0,
                    "metrics": metrics_json or "{}",
                    "env": environment,
                },
            )
        except Exception as e:
            logger.warning(f"Failed to log run to SQL Server: {e}")

    def log_runs_batch(
        self,
        records: List[Dict[str, Any]],
    ) -> None:
        """
        Batch logs multiple execution records to meta_runs in a single write.

        This is much more efficient than calling log_run() for each node individually.

        Args:
            records: List of dicts with keys: run_id, pipeline_name, node_name,
                status, rows_processed, duration_ms, metrics_json
        """
        if not records:
            return

        environment = getattr(self.config, "environment", None)

        # SQL Server mode - batch insert
        if self.is_sql_server_mode:
            for r in records:
                self._log_run_sql_server(
                    r["run_id"],
                    r["pipeline_name"],
                    r["node_name"],
                    r["status"],
                    r.get("rows_processed", 0),
                    r.get("duration_ms", 0),
                    r.get("metrics_json", "{}"),
                    environment,
                )
            logger.debug(f"Batch logged {len(records)} run records to SQL Server")
            return

        if not self.spark and not self.engine:
            return

        def _do_batch_log():
            if self.spark:
                from pyspark.sql import functions as F

                rows = [
                    (
                        r["run_id"],
                        r["pipeline_name"],
                        r["node_name"],
                        r["status"],
                        r.get("rows_processed", 0),
                        r.get("duration_ms", 0),
                        r.get("metrics_json", "{}"),
                        environment,
                    )
                    for r in records
                ]
                schema = self._get_schema_meta_runs()
                input_schema = StructType(schema.fields[:-2])

                df = self.spark.createDataFrame(rows, input_schema)
                df = df.withColumn("timestamp", F.current_timestamp()).withColumn(
                    "date", F.to_date(F.col("timestamp"))
                )

                df.write.format("delta").mode("append").save(self.tables["meta_runs"])
                logger.debug(f"Batch logged {len(records)} run records to meta_runs")

            elif self.engine:
                from datetime import datetime, timezone

                import pandas as pd

                timestamp = datetime.now(timezone.utc)

                data = {
                    "run_id": [r["run_id"] for r in records],
                    "pipeline_name": [r["pipeline_name"] for r in records],
                    "node_name": [r["node_name"] for r in records],
                    "status": [r["status"] for r in records],
                    "rows_processed": [r.get("rows_processed", 0) for r in records],
                    "duration_ms": [r.get("duration_ms", 0) for r in records],
                    "metrics_json": [r.get("metrics_json", "{}") for r in records],
                    "environment": [environment] * len(records),
                    "timestamp": [timestamp] * len(records),
                    "date": [timestamp.date()] * len(records),
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_runs"],
                    mode="append",
                )
                logger.debug(f"Batch logged {len(records)} run records to meta_runs")

        try:
            self._retry_with_backoff(_do_batch_log)
        except Exception as e:
            logger.warning(f"Failed to batch log runs to system catalog: {e}")

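    # Illustrative usage (not part of the package source): batching several node
    # results into one write; the record keys follow the docstring above and the
    # values are placeholders.
    #
    #   catalog.log_runs_batch([
    #       {"run_id": rid, "pipeline_name": "sales", "node_name": "load_orders",
    #        "status": "SUCCESS", "rows_processed": 1200, "duration_ms": 4500},
    #       {"run_id": rid, "pipeline_name": "sales", "node_name": "publish_orders",
    #        "status": "FAILED", "metrics_json": "{\"error\": \"timeout\"}"},
    #   ])
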
    def log_pattern(
        self,
        table_name: str,
        pattern_type: str,
        configuration: str,
        compliance_score: float,
    ) -> None:
        """
        Logs pattern usage to meta_patterns.
        """
        if not self.spark and not self.engine:
            return

        def _do_log_pattern():
            if self.spark:
                rows = [
                    (
                        table_name,
                        pattern_type,
                        configuration,
                        compliance_score,
                    )
                ]
                schema = self._get_schema_meta_patterns()

                df = self.spark.createDataFrame(rows, schema)

                # Append to meta_patterns
                df.write.format("delta").mode("append").save(self.tables["meta_patterns"])

            elif self.engine:
                import pandas as pd

                data = {
                    "table_name": [table_name],
                    "pattern_type": [pattern_type],
                    "configuration": [configuration],
                    "compliance_score": [compliance_score],
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_patterns"],
                    mode="append",
                )

        try:
            self._retry_with_backoff(_do_log_pattern)
        except Exception as e:
            logger.warning(f"Failed to log pattern to system catalog: {e}")

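    # Illustrative usage (not part of the package source): recording that a table
    # was built with a known pattern; the values below are assumptions for the sketch.
    #
    #   catalog.log_pattern(
    #       table_name="gold.orders",
    #       pattern_type="scd2",
    #       configuration='{"keys": ["order_id"]}',
    #       compliance_score=0.95,
    #   )
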
    def register_asset(
        self,
        project_name: str,
        table_name: str,
        path: str,
        format: str,
        pattern_type: str,
        schema_hash: str = "",
    ) -> None:
        """
        Registers/Upserts a physical asset to meta_tables.
        """
        if not self.spark and not self.engine:
            return

        def _do_register():
            if self.spark:
                from pyspark.sql import functions as F

                # Prepare data
                rows = [
                    (
                        project_name,
                        table_name,
                        path,
                        format,
                        pattern_type,
                        schema_hash,
                    )
                ]
                schema = self._get_schema_meta_tables()
                input_schema = StructType(schema.fields[:-1])  # Exclude updated_at

                df = self.spark.createDataFrame(rows, input_schema)
                df = df.withColumn("updated_at", F.current_timestamp())

                # Merge Logic
                # We need a temp view
                view_name = f"_odibi_meta_tables_upsert_{abs(hash(table_name))}"
                df.createOrReplaceTempView(view_name)

                target_path = self.tables["meta_tables"]

                merge_sql = f"""
                    MERGE INTO delta.`{target_path}` AS target
                    USING {view_name} AS source
                    ON target.project_name = source.project_name
                       AND target.table_name = source.table_name
                    WHEN MATCHED THEN UPDATE SET
                        target.path = source.path,
                        target.format = source.format,
                        target.pattern_type = source.pattern_type,
                        target.schema_hash = source.schema_hash,
                        target.updated_at = source.updated_at
                    WHEN NOT MATCHED THEN INSERT *
                """
                self.spark.sql(merge_sql)
                self.spark.catalog.dropTempView(view_name)
            elif self.engine:
                from datetime import datetime, timezone

                import pandas as pd

                # Construct DataFrame
                data = {
                    "project_name": [project_name],
                    "table_name": [table_name],
                    "path": [path],
                    "format": [format],
                    "pattern_type": [pattern_type],
                    "schema_hash": [schema_hash],
                    "updated_at": [datetime.now(timezone.utc)],
                }
                df = pd.DataFrame(data)

                target_path = self.tables["meta_tables"]

                # Use Merge transformer if available, or manual engine merge?
                # Since we are inside catalog, using transformer might be circular.
                # Let's use engine.write with mode='upsert' if engine supports it?
                # PandasEngine.write(..., mode='upsert') delegates to _handle_generic_upsert
                # or _write_delta which calls dt.merge.

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=target_path,
                    mode="upsert",
                    options={"keys": ["project_name", "table_name"]},
                )

        try:
            self._retry_with_backoff(_do_register)
        except Exception as e:
            logger.warning(f"Failed to register asset in system catalog: {e}")

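    # Illustrative usage (not part of the package source): upserting one physical
    # asset into meta_tables; the project, table, and path are placeholders.
    #
    #   catalog.register_asset(
    #       project_name="demo",
    #       table_name="gold.orders",
    #       path="/lake/gold/orders",
    #       format="delta",
    #       pattern_type="fact",
    #   )
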
    def resolve_table_path(self, table_name: str) -> Optional[str]:
        """
        Resolves logical table name (e.g. 'gold.orders') to physical path.
        """
        if self.spark:
            try:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_tables"])
                # Filter
                row = df.filter(F.col("table_name") == table_name).select("path").first()

                return row.path if row else None
            except Exception:
                return None
        elif self.engine:
            df = self._read_local_table(self.tables["meta_tables"])
            if df.empty:
                return None

            # Pandas filtering
            if "table_name" not in df.columns:
                return None

            row = df[df["table_name"] == table_name]
            if not row.empty:
                return row.iloc[0]["path"]
            return None

        return None

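    # Illustrative usage (not part of the package source):
    #
    #   physical_path = catalog.resolve_table_path("gold.orders")
    #   # returns the registered path string, or None if the table is unknown
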
    def get_pipeline_hash(self, pipeline_name: str) -> Optional[str]:
        """
        Retrieves the version hash of a pipeline from the catalog.
        """
        if self.spark:
            try:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
                row = (
                    df.filter(F.col("pipeline_name") == pipeline_name)
                    .select("version_hash")
                    .first()
                )
                return row.version_hash if row else None
            except Exception:
                return None
        elif self.engine:
            df = self._read_local_table(self.tables["meta_pipelines"])
            if df.empty:
                return None
            if "pipeline_name" not in df.columns or "version_hash" not in df.columns:
                return None

            # Ensure we get the latest one if duplicates exist (though upsert should prevent)
            # But reading parquet fallback might have duplicates.
            # Sorting by updated_at desc
            if "updated_at" in df.columns:
                df = df.sort_values("updated_at", ascending=False)

            row = df[df["pipeline_name"] == pipeline_name]
            if not row.empty:
                return row.iloc[0]["version_hash"]
            return None
        return None

    def get_average_volume(self, node_name: str, days: int = 7) -> Optional[float]:
        """
        Calculates average rows processed for a node over last N days.
        """
        if self.spark:
            try:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_runs"])

                # Filter by node and success status
                stats = (
                    df.filter(
                        (F.col("node_name") == node_name)
                        & (F.col("status") == "SUCCESS")
                        & (F.col("timestamp") >= F.date_sub(F.current_date(), days))
                    )
                    .agg(F.avg("rows_processed"))
                    .first()
                )

                return stats[0] if stats else None
            except Exception:
                return None
        elif self.engine:
            df = self._read_local_table(self.tables["meta_runs"])
            if df.empty:
                return None

            # Need status, node_name, rows_processed, timestamp
            required = ["status", "node_name", "rows_processed", "timestamp"]
            if not all(col in df.columns for col in required):
                return None

            from datetime import datetime, timedelta, timezone

            import pandas as pd

            cutoff = datetime.now(timezone.utc) - timedelta(days=days)

            # Ensure timestamp is datetime
            if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
                try:
                    df["timestamp"] = pd.to_datetime(df["timestamp"])
                except Exception:
                    return None

            filtered = df[
                (df["node_name"] == node_name)
                & (df["status"] == "SUCCESS")
                & (df["timestamp"] >= cutoff)
            ]

            if filtered.empty:
                return None

            return float(filtered["rows_processed"].mean())

        return None

    def get_average_duration(self, node_name: str, days: int = 7) -> Optional[float]:
        """
        Calculates average duration (seconds) for a node over last N days.
        """
        if self.spark:
            try:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_runs"])

                stats = (
                    df.filter(
                        (F.col("node_name") == node_name)
                        & (F.col("status") == "SUCCESS")
                        & (F.col("timestamp") >= F.date_sub(F.current_date(), days))
                    )
                    .agg(F.avg("duration_ms"))
                    .first()
                )

                return stats[0] / 1000.0 if stats and stats[0] is not None else None
            except Exception:
                return None
        elif self.engine:
            df = self._read_local_table(self.tables["meta_runs"])
            if df.empty:
                return None

            from datetime import datetime, timedelta, timezone

            import pandas as pd

            cutoff = datetime.now(timezone.utc) - timedelta(days=days)

            if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
                try:
                    df["timestamp"] = pd.to_datetime(df["timestamp"])
                except Exception:
                    return None

            filtered = df[
                (df["node_name"] == node_name)
                & (df["status"] == "SUCCESS")
                & (df["timestamp"] >= cutoff)
            ]

            if filtered.empty:
                return None

            avg_ms = float(filtered["duration_ms"].mean())
            return avg_ms / 1000.0

        return None

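    # Illustrative usage (not part of the package source): both helpers look at
    # SUCCESS runs in meta_runs over the last `days` days and return None when
    # there is no history; the node name is a placeholder.
    #
    #   avg_rows = catalog.get_average_volume("load_orders", days=7)
    #   avg_seconds = catalog.get_average_duration("load_orders", days=7)
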
    def _read_table(self, path: str):
        """
        Read system table using Spark (for remote paths) or local methods.
        Returns pandas DataFrame. Empty DataFrame on failure.
        """
        import pandas as pd

        # Use Spark for remote paths (ADLS, S3, etc.) or when Spark is available
        if self.spark:
            try:
                spark_df = self.spark.read.format("delta").load(path)
                return spark_df.toPandas()
            except Exception as e:
                logger.debug(f"Could not read table via Spark at {path}: {e}")
                return pd.DataFrame()

        # Fallback to local reading for non-Spark environments
        return self._read_local_table(path)

    def _read_local_table(self, path: str):
        """
        Helper to read local system tables (Delta or Parquet).
        Returns empty DataFrame on failure.
        """
        import pandas as pd

        storage_opts = self._get_storage_options()

        try:
            # Try Delta first if library available
            try:
                from deltalake import DeltaTable

                dt = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
                return dt.to_pandas()
            except ImportError:
                # Delta library not installed, proceed to parquet fallback
                pass
            except Exception:
                # Not a valid delta table? Fallback to parquet
                pass

            # Fallback: Read as Parquet (directory or file)
            return pd.read_parquet(path, storage_options=storage_opts if storage_opts else None)

        except Exception as e:
            # Only log debug to avoid noise if table just doesn't exist or is empty yet
            logger.debug(f"Could not read local table at {path}: {e}")
            return pd.DataFrame()

    def _hash_schema(self, schema: Dict[str, str]) -> str:
        """Generate MD5 hash of column definitions for change detection."""
        sorted_schema = json.dumps(schema, sort_keys=True)
        return hashlib.md5(sorted_schema.encode("utf-8")).hexdigest()

    def _get_latest_schema(self, table_path: str) -> Optional[Dict[str, Any]]:
        """Get the most recent schema record for a table."""
        if self.spark:
            try:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_schemas"])
                row = (
                    df.filter(F.col("table_path") == table_path)
                    .orderBy(F.col("schema_version").desc())
                    .first()
                )
                if row:
                    return row.asDict()
                return None
            except Exception:
                return None
        elif self.engine:
            df = self._read_local_table(self.tables["meta_schemas"])
            if df.empty or "table_path" not in df.columns:
                return None

            filtered = df[df["table_path"] == table_path]
            if filtered.empty:
                return None

            if "schema_version" in filtered.columns:
                filtered = filtered.sort_values("schema_version", ascending=False)
            return filtered.iloc[0].to_dict()

        return None

    def track_schema(
        self,
        table_path: str,
        schema: Dict[str, str],
        pipeline: str,
        node: str,
        run_id: str,
    ) -> Dict[str, Any]:
        """
        Track schema version for a table.

        Args:
            table_path: Full path to the table (e.g., "silver/customers")
            schema: Dictionary of column names to types
            pipeline: Pipeline name
            node: Node name
            run_id: Execution run ID

        Returns:
            Dict with version info and detected changes:
            - changed: bool indicating if schema changed
            - version: current schema version number
            - previous_version: previous version (if exists)
            - columns_added: list of new columns
            - columns_removed: list of removed columns
            - columns_type_changed: list of columns with type changes
        """
        if not self.spark and not self.engine:
            return {"changed": False, "version": 0}

        try:
            schema_hash = self._hash_schema(schema)
            previous = self._get_latest_schema(table_path)

            if previous and previous.get("schema_hash") == schema_hash:
                return {"changed": False, "version": previous.get("schema_version", 1)}

            changes: Dict[str, Any] = {
                "columns_added": [],
                "columns_removed": [],
                "columns_type_changed": [],
            }

            if previous:
                prev_cols_str = previous.get("columns", "{}")
                prev_cols = json.loads(prev_cols_str) if isinstance(prev_cols_str, str) else {}

                changes["columns_added"] = list(set(schema.keys()) - set(prev_cols.keys()))
                changes["columns_removed"] = list(set(prev_cols.keys()) - set(schema.keys()))
                changes["columns_type_changed"] = [
                    col for col in schema if col in prev_cols and schema[col] != prev_cols[col]
                ]
                new_version = previous.get("schema_version", 0) + 1
            else:
                new_version = 1

            record = {
                "table_path": table_path,
                "schema_version": new_version,
                "schema_hash": schema_hash,
                "columns": json.dumps(schema),
                "captured_at": datetime.now(timezone.utc),
                "pipeline": pipeline,
                "node": node,
                "run_id": run_id,
                "columns_added": (
                    json.dumps(changes["columns_added"]) if changes["columns_added"] else None
                ),
                "columns_removed": (
                    json.dumps(changes["columns_removed"]) if changes["columns_removed"] else None
                ),
                "columns_type_changed": (
                    json.dumps(changes["columns_type_changed"])
                    if changes["columns_type_changed"]
                    else None
                ),
            }

            if self.spark:
                df = self.spark.createDataFrame([record], schema=self._get_schema_meta_schemas())
                df.write.format("delta").mode("append").save(self.tables["meta_schemas"])

            elif self.engine:
                import pandas as pd

                df = pd.DataFrame([record])
                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_schemas"],
                    mode="append",
                )

            result = {
                "changed": True,
                "version": new_version,
                "previous_version": previous.get("schema_version") if previous else None,
                **changes,
            }

            logger.info(
                f"Schema tracked for {table_path}: v{new_version} "
                f"(+{len(changes['columns_added'])}/-{len(changes['columns_removed'])}/"
                f"~{len(changes['columns_type_changed'])})"
            )

            return result

        except Exception as e:
            logger.warning(f"Failed to track schema for {table_path}: {e}")
            return {"changed": False, "version": 0, "error": str(e)}

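    # Illustrative usage (not part of the package source): the schema dict maps
    # column names to type strings; the return value reports what changed. The
    # table path, column names, and `rid` are placeholders.
    #
    #   result = catalog.track_schema(
    #       table_path="silver/customers",
    #       schema={"id": "bigint", "name": "string", "email": "string"},
    #       pipeline="crm",
    #       node="load_customers",
    #       run_id=rid,
    #   )
    #   if result["changed"]:
    #       print(result["version"], result["columns_added"])
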
    def get_schema_history(
        self,
        table_path: str,
        limit: int = 10,
    ) -> List[Dict[str, Any]]:
        """
        Get schema version history for a table.

        Args:
            table_path: Full path to the table (e.g., "silver/customers")
            limit: Maximum number of versions to return (default: 10)

        Returns:
            List of schema version records, most recent first
        """
        if not self.spark and not self.engine:
            return []

        try:
            if self.spark:
                from pyspark.sql import functions as F

                df = self.spark.read.format("delta").load(self.tables["meta_schemas"])
                rows = (
                    df.filter(F.col("table_path") == table_path)
                    .orderBy(F.col("schema_version").desc())
                    .limit(limit)
                    .collect()
                )
                return [row.asDict() for row in rows]

            elif self.engine:
                df = self._read_local_table(self.tables["meta_schemas"])
                if df.empty or "table_path" not in df.columns:
                    return []

                filtered = df[df["table_path"] == table_path]
                if filtered.empty:
                    return []

                if "schema_version" in filtered.columns:
                    filtered = filtered.sort_values("schema_version", ascending=False)

                return filtered.head(limit).to_dict("records")

        except Exception as e:
            logger.warning(f"Failed to get schema history for {table_path}: {e}")
            return []

        return []

    def record_lineage(
        self,
        source_table: str,
        target_table: str,
        target_pipeline: str,
        target_node: str,
        run_id: str,
        source_pipeline: Optional[str] = None,
        source_node: Optional[str] = None,
        relationship: str = "feeds",
    ) -> None:
        """
        Record a lineage relationship between tables.

        Args:
            source_table: Source table path
            target_table: Target table path
            target_pipeline: Pipeline name writing to target
            target_node: Node name writing to target
            run_id: Execution run ID
            source_pipeline: Source pipeline name (if known)
            source_node: Source node name (if known)
            relationship: Type of relationship ("feeds" or "derived_from")
        """
        if not self.spark and not self.engine:
            return

        def _do_record():
            record = {
                "source_table": source_table,
                "target_table": target_table,
                "source_pipeline": source_pipeline,
                "source_node": source_node,
                "target_pipeline": target_pipeline,
                "target_node": target_node,
                "relationship": relationship,
                "last_observed": datetime.now(timezone.utc),
                "run_id": run_id,
            }

            if self.spark:
                view_name = f"_odibi_lineage_upsert_{abs(hash(f'{source_table}_{target_table}'))}"
                df = self.spark.createDataFrame([record], schema=self._get_schema_meta_lineage())
                df.createOrReplaceTempView(view_name)

                target_path = self.tables["meta_lineage"]

                merge_sql = f"""
                    MERGE INTO delta.`{target_path}` AS target
                    USING {view_name} AS source
                    ON target.source_table = source.source_table
                       AND target.target_table = source.target_table
                    WHEN MATCHED THEN UPDATE SET
                        target.source_pipeline = source.source_pipeline,
                        target.source_node = source.source_node,
                        target.target_pipeline = source.target_pipeline,
                        target.target_node = source.target_node,
                        target.relationship = source.relationship,
                        target.last_observed = source.last_observed,
                        target.run_id = source.run_id
                    WHEN NOT MATCHED THEN INSERT *
                """
                self.spark.sql(merge_sql)
                self.spark.catalog.dropTempView(view_name)

            elif self.engine:
                import pandas as pd

                df = pd.DataFrame([record])
                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_lineage"],
                    mode="upsert",
                    options={"keys": ["source_table", "target_table"]},
                )

            logger.debug(f"Recorded lineage: {source_table} -> {target_table}")

        try:
            self._retry_with_backoff(_do_record)
        except Exception as e:
            logger.warning(f"Failed to record lineage: {e}")

    def record_lineage_batch(
        self,
        records: List[Dict[str, Any]],
    ) -> None:
        """
        Batch records multiple lineage relationships to meta_lineage in a single MERGE.

        This is much more efficient than calling record_lineage() for each relationship
        individually, especially when running parallel pipelines with many nodes.

        Args:
            records: List of dicts with keys: source_table, target_table, target_pipeline,
                target_node, run_id, source_pipeline (optional), source_node (optional),
                relationship (optional, defaults to "feeds")
        """
        if not self.spark and not self.engine:
            return

        if not records:
            return

        def _do_batch_record():
            timestamp = datetime.now(timezone.utc)

            if self.spark:
                rows = [
                    (
                        r["source_table"],
                        r["target_table"],
                        r.get("source_pipeline"),
                        r.get("source_node"),
                        r["target_pipeline"],
                        r["target_node"],
                        r.get("relationship", "feeds"),
                        timestamp,
                        r["run_id"],
                    )
                    for r in records
                ]
                schema = self._get_schema_meta_lineage()
                df = self.spark.createDataFrame(rows, schema)

                view_name = "_odibi_meta_lineage_batch_upsert"
                df.createOrReplaceTempView(view_name)

                target_path = self.tables["meta_lineage"]

                merge_sql = f"""
                    MERGE INTO delta.`{target_path}` AS target
                    USING {view_name} AS source
                    ON target.source_table = source.source_table
                       AND target.target_table = source.target_table
                    WHEN MATCHED THEN UPDATE SET
                        target.source_pipeline = source.source_pipeline,
                        target.source_node = source.source_node,
                        target.target_pipeline = source.target_pipeline,
                        target.target_node = source.target_node,
                        target.relationship = source.relationship,
                        target.last_observed = source.last_observed,
                        target.run_id = source.run_id
                    WHEN NOT MATCHED THEN INSERT *
                """
                self.spark.sql(merge_sql)
                self.spark.catalog.dropTempView(view_name)

            elif self.engine:
                import pandas as pd

                data = {
                    "source_table": [r["source_table"] for r in records],
                    "target_table": [r["target_table"] for r in records],
                    "source_pipeline": [r.get("source_pipeline") for r in records],
                    "source_node": [r.get("source_node") for r in records],
                    "target_pipeline": [r["target_pipeline"] for r in records],
                    "target_node": [r["target_node"] for r in records],
                    "relationship": [r.get("relationship", "feeds") for r in records],
                    "last_observed": [timestamp] * len(records),
                    "run_id": [r["run_id"] for r in records],
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_lineage"],
                    mode="upsert",
                    options={"keys": ["source_table", "target_table"]},
                )

            logger.debug(f"Batch recorded {len(records)} lineage relationship(s)")

        try:
            self._retry_with_backoff(_do_batch_record)
        except Exception as e:
            logger.warning(f"Failed to batch record lineage: {e}")

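    # Illustrative usage (not part of the package source): one MERGE for several
    # lineage edges; keys follow the docstring above, values are placeholders.
    #
    #   catalog.record_lineage_batch([
    #       {"source_table": "silver/orders", "target_table": "gold/orders",
    #        "target_pipeline": "sales", "target_node": "publish_orders", "run_id": rid},
    #   ])
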
    def register_assets_batch(
        self,
        records: List[Dict[str, Any]],
    ) -> None:
        """
        Batch registers/upserts multiple physical assets to meta_tables in a single MERGE.

        This is much more efficient than calling register_asset() for each asset
        individually, especially when running parallel pipelines with many nodes.

        Args:
            records: List of dicts with keys: project_name, table_name, path, format,
                pattern_type, schema_hash (optional, defaults to "")
        """
        if not self.spark and not self.engine:
            return

        if not records:
            return

        def _do_batch_register():
            timestamp = datetime.now(timezone.utc)

            if self.spark:
                from pyspark.sql import functions as F

                schema = self._get_schema_meta_tables()
                input_schema = StructType(schema.fields[:-1])  # Exclude updated_at

                rows = [
                    (
                        r["project_name"],
                        r["table_name"],
                        r["path"],
                        r["format"],
                        r["pattern_type"],
                        r.get("schema_hash", ""),
                    )
                    for r in records
                ]
                df = self.spark.createDataFrame(rows, input_schema)
                df = df.withColumn("updated_at", F.current_timestamp())

                view_name = "_odibi_meta_tables_batch_upsert"
                df.createOrReplaceTempView(view_name)

                target_path = self.tables["meta_tables"]

                merge_sql = f"""
                    MERGE INTO delta.`{target_path}` AS target
                    USING {view_name} AS source
                    ON target.project_name = source.project_name
                       AND target.table_name = source.table_name
                    WHEN MATCHED THEN UPDATE SET
                        target.path = source.path,
                        target.format = source.format,
                        target.pattern_type = source.pattern_type,
                        target.schema_hash = source.schema_hash,
                        target.updated_at = source.updated_at
                    WHEN NOT MATCHED THEN INSERT *
                """
                self.spark.sql(merge_sql)
                self.spark.catalog.dropTempView(view_name)

            elif self.engine:
                import pandas as pd

                data = {
                    "project_name": [r["project_name"] for r in records],
                    "table_name": [r["table_name"] for r in records],
                    "path": [r["path"] for r in records],
                    "format": [r["format"] for r in records],
                    "pattern_type": [r["pattern_type"] for r in records],
                    "schema_hash": [r.get("schema_hash", "") for r in records],
                    "updated_at": [timestamp] * len(records),
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_tables"],
                    mode="upsert",
                    options={"keys": ["project_name", "table_name"]},
                )

            logger.debug(f"Batch registered {len(records)} asset(s)")

        try:
            self._retry_with_backoff(_do_batch_register)
        except Exception as e:
            logger.warning(f"Failed to batch register assets: {e}")

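    # Illustrative usage (not part of the package source): registering the assets
    # produced by a run in one upsert; names and paths are placeholders.
    #
    #   catalog.register_assets_batch([
    #       {"project_name": "demo", "table_name": "gold.orders",
    #        "path": "/lake/gold/orders", "format": "delta", "pattern_type": "fact"},
    #   ])
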
    def get_upstream(
        self,
        table_path: str,
        depth: int = 3,
    ) -> List[Dict[str, Any]]:
        """
        Get all upstream sources for a table.

        Args:
            table_path: Table to trace upstream from
            depth: Maximum depth to traverse

        Returns:
            List of upstream lineage records with depth information
        """
        if not self.spark and not self.engine:
            return []

        upstream = []
        visited = set()
        queue = [(table_path, 0)]

        try:
            while queue:
                current, level = queue.pop(0)
                if current in visited or level > depth:
                    continue
                visited.add(current)

                if self.spark:
                    from pyspark.sql import functions as F

                    df = self.spark.read.format("delta").load(self.tables["meta_lineage"])
                    sources = df.filter(F.col("target_table") == current).collect()
                    for row in sources:
                        record = row.asDict()
                        record["depth"] = level
                        upstream.append(record)
                        queue.append((record["source_table"], level + 1))

                elif self.engine:
                    df = self._read_local_table(self.tables["meta_lineage"])
                    if df.empty or "target_table" not in df.columns:
                        break

                    sources = df[df["target_table"] == current]
                    for _, row in sources.iterrows():
                        record = row.to_dict()
                        record["depth"] = level
                        upstream.append(record)
                        queue.append((record["source_table"], level + 1))

        except Exception as e:
            logger.warning(f"Failed to get upstream lineage for {table_path}: {e}")

        return upstream

    def get_downstream(
        self,
        table_path: str,
        depth: int = 3,
    ) -> List[Dict[str, Any]]:
        """
        Get all downstream consumers of a table.

        Args:
            table_path: Table to trace downstream from
            depth: Maximum depth to traverse

        Returns:
            List of downstream lineage records with depth information
        """
        if not self.spark and not self.engine:
            return []

        downstream = []
        visited = set()
        queue = [(table_path, 0)]

        try:
            while queue:
                current, level = queue.pop(0)
                if current in visited or level > depth:
                    continue
                visited.add(current)

                if self.spark:
                    from pyspark.sql import functions as F

                    df = self.spark.read.format("delta").load(self.tables["meta_lineage"])
                    targets = df.filter(F.col("source_table") == current).collect()
                    for row in targets:
                        record = row.asDict()
                        record["depth"] = level
                        downstream.append(record)
                        queue.append((record["target_table"], level + 1))

                elif self.engine:
                    df = self._read_local_table(self.tables["meta_lineage"])
                    if df.empty or "source_table" not in df.columns:
                        break

                    targets = df[df["source_table"] == current]
                    for _, row in targets.iterrows():
                        record = row.to_dict()
                        record["depth"] = level
                        downstream.append(record)
                        queue.append((record["target_table"], level + 1))

        except Exception as e:
            logger.warning(f"Failed to get downstream lineage for {table_path}: {e}")

        return downstream

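    # Illustrative usage (not part of the package source): breadth-first traversal
    # of meta_lineage in both directions, up to `depth` hops; table paths are
    # placeholders.
    #
    #   fed_by = catalog.get_upstream("gold/orders", depth=2)
    #   feeds_into = catalog.get_downstream("silver/orders", depth=2)
    #   # each record is a lineage row plus a "depth" key
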
    def optimize(self) -> None:
        """
        Runs VACUUM and OPTIMIZE (Z-Order) on meta_runs.
        Spark-only feature.
        """
        if not self.spark:
            return

        try:
            logger.info("Starting Catalog Optimization...")

            # 1. meta_runs
            # VACUUM: Remove files older than 7 days (Spark requires check disable or careful setting)
            # Note: default retention check might block < 168 hours.
            # We'll use RETAIN 168 HOURS (7 days) to be safe.
            self.spark.sql(f"VACUUM delta.`{self.tables['meta_runs']}` RETAIN 168 HOURS")

            # OPTIMIZE: Z-ORDER BY timestamp (for range queries)
            # We also have 'pipeline_name' and 'date' as partitions.
            # Z-Ordering by timestamp helps within the partitions.
            self.spark.sql(f"OPTIMIZE delta.`{self.tables['meta_runs']}` ZORDER BY (timestamp)")

            logger.info("Catalog Optimization completed successfully.")

        except Exception as e:
            logger.warning(f"Catalog Optimization failed: {e}")

    # -------------------------------------------------------------------------
    # Phase 3.6: Metrics Logging
    # -------------------------------------------------------------------------

    def log_metrics(
        self,
        metric_name: str,
        definition_sql: str,
        dimensions: List[str],
        source_table: str,
    ) -> None:
        """Log a business metric definition to meta_metrics.

        Args:
            metric_name: Name of the metric
            definition_sql: SQL definition of the metric
            dimensions: List of dimension columns
            source_table: Source table for the metric
        """
        if not self.spark and not self.engine:
            return

        def _do_log_metrics():
            import json

            if self.spark:
                dimensions_json = json.dumps(dimensions)
                rows = [(metric_name, definition_sql, dimensions_json, source_table)]
                schema = self._get_schema_meta_metrics()

                df = self.spark.createDataFrame(rows, schema)
                df.write.format("delta").mode("append").save(self.tables["meta_metrics"])

            elif self.engine:
                import pandas as pd

                data = {
                    "metric_name": [metric_name],
                    "definition_sql": [definition_sql],
                    "dimensions": [json.dumps(dimensions)],
                    "source_table": [source_table],
                }
                df = pd.DataFrame(data)

                self.engine.write(
                    df,
                    connection=self.connection,
                    format="delta",
                    path=self.tables["meta_metrics"],
                    mode="append",
                )

            logger.debug(f"Logged metric: {metric_name}")

        try:
            self._retry_with_backoff(_do_log_metrics)
        except Exception as e:
            logger.warning(f"Failed to log metric to system catalog: {e}")

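    # Illustrative usage (not part of the package source): storing a metric
    # definition; the SQL text, dimension names, and table are placeholders.
    #
    #   catalog.log_metrics(
    #       metric_name="total_revenue",
    #       definition_sql="SUM(amount)",
    #       dimensions=["region", "order_date"],
    #       source_table="gold.orders",
    #   )
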
    # -------------------------------------------------------------------------
    # Phase 4: Cleanup/Removal Methods
    # -------------------------------------------------------------------------

    def remove_pipeline(self, pipeline_name: str) -> int:
        """Remove pipeline and cascade to nodes, state entries.

        Args:
            pipeline_name: Name of the pipeline to remove

        Returns:
            Count of deleted entries
        """
        if not self.spark and not self.engine:
            return 0

        deleted_count = 0

        try:
            if self.spark:
                from pyspark.sql import functions as F

                # Delete from meta_pipelines
                df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
                df.cache()
                initial_count = df.count()
                df_filtered = df.filter(F.col("pipeline_name") != pipeline_name)
                df_filtered.write.format("delta").mode("overwrite").save(
                    self.tables["meta_pipelines"]
                )
                deleted_count += initial_count - df_filtered.count()
                df.unpersist()

                # Delete associated nodes from meta_nodes
                df_nodes = self.spark.read.format("delta").load(self.tables["meta_nodes"])
                df_nodes.cache()
                nodes_initial = df_nodes.count()
                df_nodes_filtered = df_nodes.filter(F.col("pipeline_name") != pipeline_name)
                df_nodes_filtered.write.format("delta").mode("overwrite").save(
                    self.tables["meta_nodes"]
                )
                deleted_count += nodes_initial - df_nodes_filtered.count()
                df_nodes.unpersist()

            elif self.engine:
                # Delete from meta_pipelines
                df = self._read_local_table(self.tables["meta_pipelines"])
                if not df.empty and "pipeline_name" in df.columns:
                    initial_count = len(df)
                    df = df[df["pipeline_name"] != pipeline_name]
                    self.engine.write(
                        df,
                        connection=self.connection,
                        format="delta",
                        path=self.tables["meta_pipelines"],
                        mode="overwrite",
                    )
                    deleted_count += initial_count - len(df)

                # Delete associated nodes from meta_nodes
                df_nodes = self._read_local_table(self.tables["meta_nodes"])
                if not df_nodes.empty and "pipeline_name" in df_nodes.columns:
                    nodes_initial = len(df_nodes)
                    df_nodes = df_nodes[df_nodes["pipeline_name"] != pipeline_name]
                    self.engine.write(
                        df_nodes,
                        connection=self.connection,
                        format="delta",
                        path=self.tables["meta_nodes"],
                        mode="overwrite",
                    )
                    deleted_count += nodes_initial - len(df_nodes)

            self.invalidate_cache()
            logger.info(f"Removed pipeline '{pipeline_name}': {deleted_count} entries deleted")

        except Exception as e:
            logger.warning(f"Failed to remove pipeline: {e}")

        return deleted_count

    def remove_node(self, pipeline_name: str, node_name: str) -> int:
        """Remove node and associated state entries.

        Args:
            pipeline_name: Pipeline name
            node_name: Node name to remove

        Returns:
            Count of deleted entries
        """
        if not self.spark and not self.engine:
            return 0

        deleted_count = 0

        try:
            if self.spark:
                from pyspark.sql import functions as F

                # Delete from meta_nodes
                df = self.spark.read.format("delta").load(self.tables["meta_nodes"])
                df.cache()
                initial_count = df.count()
                df_filtered = df.filter(
                    ~((F.col("pipeline_name") == pipeline_name) & (F.col("node_name") == node_name))
                )
                df_filtered.write.format("delta").mode("overwrite").save(self.tables["meta_nodes"])
                deleted_count = initial_count - df_filtered.count()
                df.unpersist()

            elif self.engine:
                df = self._read_local_table(self.tables["meta_nodes"])
                if not df.empty and "pipeline_name" in df.columns and "node_name" in df.columns:
                    initial_count = len(df)
                    df = df[
                        ~((df["pipeline_name"] == pipeline_name) & (df["node_name"] == node_name))
                    ]
                    self.engine.write(
                        df,
                        connection=self.connection,
                        format="delta",
                        path=self.tables["meta_nodes"],
                        mode="overwrite",
                    )
                    deleted_count = initial_count - len(df)

            self._nodes_cache = None
            logger.info(
                f"Removed node '{node_name}' from pipeline '{pipeline_name}': "
                f"{deleted_count} entries deleted"
            )

        except Exception as e:
            logger.warning(f"Failed to remove node: {e}")

        return deleted_count

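    # Illustrative usage (not part of the package source): targeted removal of
    # stale catalog entries; both calls return the number of rows deleted. The
    # pipeline and node names are placeholders.
    #
    #   catalog.remove_pipeline("legacy_sales")
    #   catalog.remove_node("sales", "old_publish_step")
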
    def cleanup_orphans(self, current_config: Any) -> Dict[str, int]:
        """Compare catalog against current config, remove stale entries.

        Args:
            current_config: ProjectConfig with current pipeline definitions

        Returns:
            Dict of {table: deleted_count}
        """
        if not self.spark and not self.engine:
            return {}

        results = {"meta_pipelines": 0, "meta_nodes": 0}

        try:
            # Get current pipeline and node names from config
            current_pipelines = set()
            current_nodes = {}  # {pipeline_name: set(node_names)}

            for pipeline in current_config.pipelines:
                current_pipelines.add(pipeline.pipeline)
                current_nodes[pipeline.pipeline] = {node.name for node in pipeline.nodes}

            if self.spark:
                from pyspark.sql import functions as F

                # Cleanup orphan pipelines
                df_pipelines = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
                df_pipelines.cache()
                initial_pipelines = df_pipelines.count()
                df_pipelines_filtered = df_pipelines.filter(
                    F.col("pipeline_name").isin(list(current_pipelines))
                )
                df_pipelines_filtered.write.format("delta").mode("overwrite").save(
                    self.tables["meta_pipelines"]
                )
                results["meta_pipelines"] = initial_pipelines - df_pipelines_filtered.count()
                df_pipelines.unpersist()

                # Cleanup orphan nodes
                df_nodes = self.spark.read.format("delta").load(self.tables["meta_nodes"])
                df_nodes.cache()
                initial_nodes = df_nodes.count()

                # Filter: keep only nodes that belong to current pipelines and exist in config
                valid_nodes = []
                for p_name, nodes in current_nodes.items():
                    for n_name in nodes:
                        valid_nodes.append((p_name, n_name))

                if valid_nodes:
                    valid_df = self.spark.createDataFrame(
                        valid_nodes, ["pipeline_name", "node_name"]
                    )
                    df_nodes_filtered = df_nodes.join(
                        valid_df, ["pipeline_name", "node_name"], "inner"
                    )
                else:
                    df_nodes_filtered = df_nodes.limit(0)

                df_nodes_filtered.write.format("delta").mode("overwrite").save(
                    self.tables["meta_nodes"]
                )
                results["meta_nodes"] = initial_nodes - df_nodes_filtered.count()
                df_nodes.unpersist()

            elif self.engine:
                # Cleanup orphan pipelines
                df_pipelines = self._read_local_table(self.tables["meta_pipelines"])
                if not df_pipelines.empty and "pipeline_name" in df_pipelines.columns:
                    initial_pipelines = len(df_pipelines)
                    df_pipelines = df_pipelines[
                        df_pipelines["pipeline_name"].isin(current_pipelines)
                    ]
                    self.engine.write(
                        df_pipelines,
                        connection=self.connection,
                        format="delta",
                        path=self.tables["meta_pipelines"],
                        mode="overwrite",
                    )
                    results["meta_pipelines"] = initial_pipelines - len(df_pipelines)

                # Cleanup orphan nodes
                df_nodes = self._read_local_table(self.tables["meta_nodes"])
                if not df_nodes.empty and "pipeline_name" in df_nodes.columns:
                    initial_nodes = len(df_nodes)

                    valid_node_tuples = set()
                    for p_name, nodes in current_nodes.items():
                        for n_name in nodes:
                            valid_node_tuples.add((p_name, n_name))

                    df_nodes["_valid"] = df_nodes.apply(
                        lambda row: (row["pipeline_name"], row["node_name"]) in valid_node_tuples,
                        axis=1,
                    )
                    df_nodes = df_nodes[df_nodes["_valid"]].drop(columns=["_valid"])

                    self.engine.write(
                        df_nodes,
                        connection=self.connection,
                        format="delta",
                        path=self.tables["meta_nodes"],
                        mode="overwrite",
                    )
                    results["meta_nodes"] = initial_nodes - len(df_nodes)
|
|
2900
|
+
|
|
2901
|
+
self.invalidate_cache()
|
|
2902
|
+
logger.info(
|
|
2903
|
+
f"Cleanup orphans completed: {results['meta_pipelines']} pipelines, "
|
|
2904
|
+
f"{results['meta_nodes']} nodes removed"
|
|
2905
|
+
)
|
|
2906
|
+
|
|
2907
|
+
except Exception as e:
|
|
2908
|
+
logger.warning(f"Failed to cleanup orphans: {e}")
|
|
2909
|
+
|
|
2910
|
+
return results
|
|
2911
|
+
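cleanup_orphans reduces to set membership: any (pipeline_name, node_name) pair present in the catalog but missing from the current config is dropped. A standalone sketch of that core comparison, with the config shape simplified to plain dicts rather than the odibi ProjectConfig object:

# Standalone sketch of the orphan-detection logic in cleanup_orphans.
# `config_nodes` stands in for ProjectConfig: {pipeline_name: {node_name, ...}}.
def find_orphans(catalog_rows, config_nodes):
    valid = {(p, n) for p, nodes in config_nodes.items() for n in nodes}
    return [
        row for row in catalog_rows
        if (row["pipeline_name"], row["node_name"]) not in valid
    ]

rows = [
    {"pipeline_name": "sales", "node_name": "load_orders"},
    {"pipeline_name": "sales", "node_name": "old_node"},   # no longer in config
]
print(find_orphans(rows, {"sales": {"load_orders"}}))       # -> the old_node row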
2912 +     def clear_state_key(self, key: str) -> bool:
2913 +         """Remove a single state entry by key.
2914 +
2915 +         Args:
2916 +             key: State key to remove
2917 +
2918 +         Returns:
2919 +             True if deleted, False otherwise
2920 +         """
2921 +         if not self.spark and not self.engine:
2922 +             return False
2923 +
2924 +         try:
2925 +             if self.spark:
2926 +                 from pyspark.sql import functions as F
2927 +
2928 +                 df = self.spark.read.format("delta").load(self.tables["meta_state"])
2929 +                 initial_count = df.count()
2930 +                 df = df.filter(F.col("key") != key)
2931 +                 df.write.format("delta").mode("overwrite").save(self.tables["meta_state"])
2932 +                 return df.count() < initial_count
2933 +
2934 +             elif self.engine:
2935 +                 df = self._read_local_table(self.tables["meta_state"])
2936 +                 if df.empty or "key" not in df.columns:
2937 +                     return False
2938 +
2939 +                 initial_count = len(df)
2940 +                 df = df[df["key"] != key]
2941 +
2942 +                 if len(df) < initial_count:
2943 +                     self.engine.write(
2944 +                         df,
2945 +                         connection=self.connection,
2946 +                         format="delta",
2947 +                         path=self.tables["meta_state"],
2948 +                         mode="overwrite",
2949 +                     )
2950 +                     return True
2951 +
2952 +             return False
2953 +
2954 +         except Exception as e:
2955 +             logger.warning(f"Failed to clear state key '{key}': {e}")
2956 +             return False
2957 +
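clear_state_key returns True only when the rewrite actually removed a row, so callers can tell "key deleted" apart from "key never existed". A hedged usage sketch; the wrapper name reset_state_entry is hypothetical and assumes an existing catalog object:

# Hypothetical wrapper around the clear_state_key method shown above.
def reset_state_entry(catalog, key: str) -> None:
    if catalog.clear_state_key(key):
        print(f"Cleared state entry '{key}'")
    else:
        print(f"No state entry named '{key}' (nothing to clear)")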
2958 +     def clear_state_pattern(self, key_pattern: str) -> int:
2959 +         """Remove state entries matching pattern (supports wildcards).
2960 +
2961 +         Args:
2962 +             key_pattern: Pattern with optional * wildcards
2963 +
2964 +         Returns:
2965 +             Count of deleted entries
2966 +         """
2967 +         if not self.spark and not self.engine:
2968 +             return 0
2969 +
2970 +         try:
2971 +             if self.spark:
2972 +                 from pyspark.sql import functions as F
2973 +
2974 +                 df = self.spark.read.format("delta").load(self.tables["meta_state"])
2975 +                 initial_count = df.count()
2976 +
2977 +                 # Convert wildcard pattern to SQL LIKE pattern
2978 +                 like_pattern = key_pattern.replace("*", "%")
2979 +                 df = df.filter(~F.col("key").like(like_pattern))
2980 +                 df.write.format("delta").mode("overwrite").save(self.tables["meta_state"])
2981 +
2982 +                 return initial_count - df.count()
2983 +
2984 +             elif self.engine:
2985 +                 import re
2986 +
2987 +                 df = self._read_local_table(self.tables["meta_state"])
2988 +                 if df.empty or "key" not in df.columns:
2989 +                     return 0
2990 +
2991 +                 initial_count = len(df)
2992 +
2993 +                 # Convert wildcard pattern to regex
2994 +                 regex_pattern = "^" + key_pattern.replace("*", ".*") + "$"
2995 +                 pattern = re.compile(regex_pattern)
2996 +                 df = df[~df["key"].apply(lambda x: bool(pattern.match(str(x))))]
2997 +
2998 +                 if len(df) < initial_count:
2999 +                     self.engine.write(
3000 +                         df,
3001 +                         connection=self.connection,
3002 +                         format="delta",
3003 +                         path=self.tables["meta_state"],
3004 +                         mode="overwrite",
3005 +                     )
3006 +
3007 +                 return initial_count - len(df)
3008 +
3009 +         except Exception as e:
3010 +             logger.warning(f"Failed to clear state pattern '{key_pattern}': {e}")
3011 +             return 0
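The two branches of clear_state_pattern translate the same * wildcard differently: the Spark path builds a SQL LIKE pattern with %, while the pandas path builds an anchored regex. A small standalone check of the regex translation (example keys are made up):

import re

def matches_wildcard(key: str, key_pattern: str) -> bool:
    # Same translation as the pandas branch above: * -> .* with ^...$ anchors.
    return bool(re.match("^" + key_pattern.replace("*", ".*") + "$", key))

print(matches_wildcard("sales.load_orders.watermark", "sales.*"))   # True
print(matches_wildcard("hr.load_people.watermark", "sales.*"))      # False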