odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
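Usage note (editorial, not part of the package): based on the CatalogManager constructor and bootstrap() method shown in the odibi/catalog.py diff below, the sketch that follows shows how the system catalog might be initialized in Spark mode. The SystemConfig arguments and the ADLS path are illustrative assumptions, not values taken from the package:

    # Hedged sketch: the SparkSession setup, SystemConfig construction, and the
    # ADLS path are assumptions for illustration, not confirmed package defaults.
    from pyspark.sql import SparkSession
    from odibi.config import SystemConfig
    from odibi.catalog import CatalogManager

    spark = SparkSession.builder.getOrCreate()
    config = SystemConfig()  # assumed to be constructible with defaults
    catalog = CatalogManager(
        spark=spark,
        config=config,
        base_path="abfss://container@account.dfs.core.windows.net/_odibi_system",
    )
    catalog.bootstrap()  # creates the meta_* Delta tables if they are missing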
odibi/catalog.py ADDED
@@ -0,0 +1,3011 @@
+ import hashlib
+ import json
+ import logging
+ import random
+ import time
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List, Optional
+
+ try:
+     from pyspark.sql import SparkSession
+     from pyspark.sql.types import (
+         ArrayType,
+         DateType,
+         DoubleType,
+         LongType,
+         StringType,
+         StructField,
+         StructType,
+         TimestampType,
+     )
+ except ImportError:
+     # Fallback for environments without PySpark (e.g., pure Pandas mode)
+     SparkSession = Any
+
+     class DataType:
+         pass
+
+     class StringType(DataType):
+         pass
+
+     class LongType(DataType):
+         pass
+
+     class DoubleType(DataType):
+         pass
+
+     class DateType(DataType):
+         pass
+
+     class TimestampType(DataType):
+         pass
+
+     class ArrayType(DataType):
+         def __init__(self, elementType):
+             self.elementType = elementType
+
+     class StructField:
+         def __init__(self, name, dtype, nullable=True):
+             self.name = name
+             self.dataType = dtype
+             self.nullable = nullable
+
+     class StructType:
+         def __init__(self, fields):
+             self.fields = fields
+
+
+ from odibi.config import SystemConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class CatalogManager:
+     """
+     Manages the Odibi System Catalog (The Brain).
+     Handles bootstrapping and interaction with meta-tables.
+     """
+
+     def __init__(
+         self,
+         spark: Optional[SparkSession],
+         config: SystemConfig,
+         base_path: str,
+         engine: Optional[Any] = None,
+         connection: Optional[Any] = None,
+     ):
+         """
+         Initialize the Catalog Manager.
+
+         Args:
+             spark: Active SparkSession (optional if engine is provided)
+             config: SystemConfig object
+             base_path: Absolute path to the system catalog directory (resolved from connection).
+                 Example: "abfss://container@account.dfs.core.windows.net/_odibi_system"
+             engine: Execution engine (optional, for Pandas mode)
+             connection: Connection object for storage credentials (optional, for Pandas mode)
+         """
+         self.spark = spark
+         self.config = config
+         self.base_path = base_path.rstrip("/")
+         self.engine = engine
+         self.connection = connection
+
+         # Table Paths
+         self.tables = {
+             "meta_tables": f"{self.base_path}/meta_tables",
+             "meta_runs": f"{self.base_path}/meta_runs",
+             "meta_patterns": f"{self.base_path}/meta_patterns",
+             "meta_metrics": f"{self.base_path}/meta_metrics",
+             "meta_state": f"{self.base_path}/meta_state",
+             "meta_pipelines": f"{self.base_path}/meta_pipelines",
+             "meta_nodes": f"{self.base_path}/meta_nodes",
+             "meta_schemas": f"{self.base_path}/meta_schemas",
+             "meta_lineage": f"{self.base_path}/meta_lineage",
+             "meta_outputs": f"{self.base_path}/meta_outputs",
+         }
+
+         # Cache for meta table reads (invalidated on write operations)
+         self._pipelines_cache: Optional[Dict[str, Dict[str, Any]]] = None
+         self._nodes_cache: Optional[Dict[str, Dict[str, str]]] = None
+         self._outputs_cache: Optional[Dict[str, Dict[str, Any]]] = None
+
+     @property
+     def is_spark_mode(self) -> bool:
+         """Check if running in Spark mode."""
+         return self.spark is not None
+
+     @property
+     def is_pandas_mode(self) -> bool:
+         """Check if running in Pandas mode."""
+         return self.engine is not None and self.engine.name == "pandas"
+
+     @property
+     def is_sql_server_mode(self) -> bool:
+         """Check if running with SQL Server system backend."""
+         if self.connection is None:
+             return False
+         # Check if connection is AzureSQL type
+         conn_type = getattr(self.connection, "__class__", None)
+         if conn_type is None:
+             return False
+         return conn_type.__name__ in ("AzureSQL", "SqlServerConnection")
+
+     def _get_storage_options(self) -> Dict[str, Any]:
+         """Get storage options for pandas/delta-rs operations.
+
+         Returns:
+             Dict with storage credentials if connection supports it, else empty dict.
+         """
+         if self.connection and hasattr(self.connection, "pandas_storage_options"):
+             return self.connection.pandas_storage_options()
+         return {}
+
+     @property
+     def has_backend(self) -> bool:
+         """Check if any backend (Spark or engine) is available."""
+         return self.spark is not None or self.engine is not None
+
+     def invalidate_cache(self) -> None:
+         """Invalidate all cached meta table data."""
+         self._pipelines_cache = None
+         self._nodes_cache = None
+         self._outputs_cache = None
+
+     def _retry_with_backoff(self, func, max_retries: int = 5, base_delay: float = 1.0):
+         """Retry a function with exponential backoff and jitter for concurrent writes.
+
+         Only retries on Delta Lake concurrency exceptions. Other exceptions are
+         raised immediately. Warnings are only logged after all retries fail.
+
+         Args:
+             func: Callable to execute.
+             max_retries: Maximum retry attempts (default 5 for high concurrency).
+             base_delay: Base delay in seconds (doubles each retry).
+
+         Returns:
+             Result of the function.
+
+         Raises:
+             Exception: If all retries fail or non-retryable error occurs.
+         """
+         for attempt in range(max_retries + 1):
+             try:
+                 return func()
+             except Exception as e:
+                 error_str = str(e)
+                 # Check for Delta concurrency exceptions
+                 is_concurrent_error = any(
+                     msg in error_str
+                     for msg in [
+                         "ConcurrentAppendException",
+                         "ConcurrentDeleteReadException",
+                         "ConcurrentDeleteDeleteException",
+                         "DELTA_CONCURRENT",
+                         "concurrent",
+                         "conflict",
+                     ]
+                 )
+                 if not is_concurrent_error or attempt >= max_retries:
+                     raise
+                 # Exponential backoff with jitter (1s, 2s, 4s, 8s, 16s = ~31s total)
+                 delay = base_delay * (2**attempt) + random.uniform(0, 1.0)
+                 logger.debug(
+                     f"Delta concurrent write (attempt {attempt + 1}/{max_retries + 1}), "
+                     f"retrying in {delay:.2f}s..."
+                 )
+                 time.sleep(delay)
+
+     def _get_all_pipelines_cached(self) -> Dict[str, Dict[str, Any]]:
+         """Get all pipelines with caching."""
+         if self._pipelines_cache is not None:
+             return self._pipelines_cache
+
+         self._pipelines_cache = {}
+         if not self.spark and not self.engine:
+             return self._pipelines_cache
+
+         try:
+             if self.spark:
+                 df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
+                 rows = df.collect()
+                 for row in rows:
+                     row_dict = row.asDict()
+                     self._pipelines_cache[row_dict["pipeline_name"]] = row_dict
+             elif self.engine:
+                 df = self._read_local_table(self.tables["meta_pipelines"])
+                 if not df.empty and "pipeline_name" in df.columns:
+                     for _, row in df.iterrows():
+                         self._pipelines_cache[row["pipeline_name"]] = row.to_dict()
+         except Exception as e:
+             logger.debug(f"Could not cache pipelines: {e}")
+             self._pipelines_cache = {}
+
+         return self._pipelines_cache
+
+     def _get_all_nodes_cached(self) -> Dict[str, Dict[str, str]]:
+         """Get all nodes grouped by pipeline with caching."""
+         if self._nodes_cache is not None:
+             return self._nodes_cache
+
+         self._nodes_cache = {}
+         if not self.spark and not self.engine:
+             return self._nodes_cache
+
+         try:
+             if self.spark:
+                 df = self.spark.read.format("delta").load(self.tables["meta_nodes"])
+                 rows = df.select("pipeline_name", "node_name", "version_hash").collect()
+                 for row in rows:
+                     p_name = row["pipeline_name"]
+                     if p_name not in self._nodes_cache:
+                         self._nodes_cache[p_name] = {}
+                     self._nodes_cache[p_name][row["node_name"]] = row["version_hash"]
+             elif self.engine:
+                 df = self._read_local_table(self.tables["meta_nodes"])
+                 if not df.empty and "pipeline_name" in df.columns:
+                     for _, row in df.iterrows():
+                         p_name = row["pipeline_name"]
+                         if p_name not in self._nodes_cache:
+                             self._nodes_cache[p_name] = {}
+                         self._nodes_cache[p_name][row["node_name"]] = row["version_hash"]
+         except Exception as e:
+             logger.debug(f"Could not cache nodes: {e}")
+             self._nodes_cache = {}
+
+         return self._nodes_cache
+
+     def bootstrap(self) -> None:
+         """
+         Ensures all system tables exist. Creates them if missing.
+         """
+         if not self.spark and not self.engine:
+             logger.warning(
+                 "Neither SparkSession nor Engine available. Skipping System Catalog bootstrap."
+             )
+             return
+
+         logger.info(f"Bootstrapping System Catalog at {self.base_path}...")
+
+         self._ensure_table("meta_tables", self._get_schema_meta_tables())
+         self._ensure_table(
+             "meta_runs",
+             self._get_schema_meta_runs(),
+             partition_cols=["pipeline_name", "date"],
+             schema_evolution=True,
+         )
+         self._ensure_table("meta_patterns", self._get_schema_meta_patterns())
+         self._ensure_table("meta_metrics", self._get_schema_meta_metrics())
+         self._ensure_table("meta_state", self._get_schema_meta_state())
+         self._ensure_table("meta_pipelines", self._get_schema_meta_pipelines())
+         self._ensure_table("meta_nodes", self._get_schema_meta_nodes())
+         self._ensure_table("meta_schemas", self._get_schema_meta_schemas())
+         self._ensure_table("meta_lineage", self._get_schema_meta_lineage())
+         self._ensure_table("meta_outputs", self._get_schema_meta_outputs())
+
+     def _ensure_table(
+         self,
+         name: str,
+         schema: StructType,
+         partition_cols: Optional[list] = None,
+         schema_evolution: bool = False,
+     ) -> None:
+         path = self.tables[name]
+         if not self._table_exists(path):
+             logger.info(f"Creating system table: {name} at {path}")
+
+             if self.spark:
+                 # Create empty DataFrame with schema
+                 writer = self.spark.createDataFrame([], schema).write.format("delta")
+                 if partition_cols:
+                     writer = writer.partitionBy(*partition_cols)
+                 writer.save(path)
+             elif self.engine and self.engine.name == "pandas":
+                 # Pandas/Local Mode
+                 import os
+
+                 import pandas as pd
+
+                 os.makedirs(path, exist_ok=True)
+
+                 # Attempt to create Delta Table if library exists (using Arrow for strict typing)
+                 try:
+                     import pyarrow as pa
+                     from deltalake import write_deltalake
+
+                     def map_to_arrow_type(dtype):
+                         s_type = str(dtype)
+                         if isinstance(dtype, StringType) or "StringType" in s_type:
+                             return pa.string()
+                         if isinstance(dtype, LongType) or "LongType" in s_type:
+                             return pa.int64()
+                         if isinstance(dtype, DoubleType) or "DoubleType" in s_type:
+                             return pa.float64()
+                         if isinstance(dtype, TimestampType) or "TimestampType" in s_type:
+                             return pa.timestamp("us", tz="UTC")
+                         if isinstance(dtype, DateType) or "DateType" in s_type:
+                             return pa.date32()
+                         if isinstance(dtype, ArrayType) or "ArrayType" in s_type:
+                             # Access element type safely
+                             elem_type = getattr(dtype, "elementType", StringType())
+                             return pa.list_(map_to_arrow_type(elem_type))
+                         return pa.string()
+
+                     # Define Arrow Schema
+                     arrow_fields = []
+                     for field in schema.fields:
+                         arrow_fields.append(pa.field(field.name, map_to_arrow_type(field.dataType)))
+
+                     arrow_schema = pa.schema(arrow_fields)
+
+                     # Create Empty Table
+                     # Note: We pass a dict of empty lists. PyArrow handles the rest using schema.
+                     data = {f.name: [] for f in schema.fields}
+                     table = pa.Table.from_pydict(data, schema=arrow_schema)
+
+                     storage_opts = self._get_storage_options()
+                     write_deltalake(
+                         path,
+                         table,
+                         mode="overwrite",
+                         partition_by=partition_cols,
+                         storage_options=storage_opts if storage_opts else None,
+                     )
+                     logger.info(f"Initialized Delta table: {name}")
+
+                 except ImportError:
+                     # Fallback to Pandas/Parquet if Delta/Arrow not available
+                     # Prepare empty DataFrame with correct columns and types
+                     data = {}
+
+                     def get_pd_type(dtype):
+                         if isinstance(dtype, StringType) or "StringType" in str(type(dtype)):
+                             return "string"
+                         if isinstance(dtype, LongType) or "LongType" in str(type(dtype)):
+                             return "int64"
+                         if isinstance(dtype, DoubleType) or "DoubleType" in str(type(dtype)):
+                             return "float64"
+                         if isinstance(dtype, TimestampType) or "TimestampType" in str(type(dtype)):
+                             return "datetime64[ns, UTC]"
+                         if isinstance(dtype, DateType) or "DateType" in str(type(dtype)):
+                             return "datetime64[ns]"
+                         return "object"
+
+                     for field in schema.fields:
+                         pd_type = get_pd_type(field.dataType)
+                         data[field.name] = pd.Series([], dtype=pd_type)
+
+                     df = pd.DataFrame(data)
+
+                     # Fallback to Parquet
+                     # Pandas to_parquet with partition_cols
+                     df.to_parquet(path, partition_cols=partition_cols)
+                     logger.info(f"Initialized Parquet table: {name} (Delta library not found)")
+                 except Exception as e:
+                     logger.error(f"Failed to create local system table {name}: {e}")
+                     raise e
+         else:
+             # If table exists and schema evolution is requested (only for Pandas/Delta mode currently)
+             if schema_evolution and self.engine and self.engine.name == "pandas":
+                 try:
+                     from deltalake import DeltaTable, write_deltalake
+
+                     storage_opts = self._get_storage_options()
+                     _ = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
+                     # Basic schema evolution: overwrite schema if we are appending?
+                     # For now, let's just log. True evolution is complex.
+                     # A simple fix for "fields mismatch" is to allow schema merge.
+                     pass
+                 except ImportError:
+                     pass
+             logger.debug(f"System table exists: {name}")
+             self._migrate_schema_if_needed(name, path, schema)
+
+     def _migrate_schema_if_needed(self, name: str, path: str, expected_schema: StructType) -> None:
+         """
+         Migrate table schema if there are incompatible type changes.
+         This handles cases like ArrayType -> StringType migrations.
+         """
+         try:
+             if self.spark:
+                 existing_df = self.spark.read.format("delta").load(path)
+                 existing_fields = {f.name: f.dataType for f in existing_df.schema.fields}
+                 expected_fields = {f.name: f.dataType for f in expected_schema.fields}
+
+                 needs_migration = False
+                 for field_name, expected_type in expected_fields.items():
+                     if field_name in existing_fields:
+                         existing_type = existing_fields[field_name]
+                         if type(existing_type) is not type(expected_type):
+                             logger.info(
+                                 f"Schema migration needed for {name}.{field_name}: "
+                                 f"{existing_type} -> {expected_type}"
+                             )
+                             needs_migration = True
+                             break
+
+                 if needs_migration:
+                     logger.info(f"Migrating schema for {name}...")
+                     migrated_df = existing_df
+                     for field in expected_schema.fields:
+                         if field.name in existing_fields:
+                             existing_type = existing_fields[field.name]
+                             if not isinstance(existing_type, type(field.dataType)):
+                                 from pyspark.sql import functions as F
+
+                                 if isinstance(existing_type, ArrayType) and isinstance(
+                                     field.dataType, StringType
+                                 ):
+                                     migrated_df = migrated_df.withColumn(
+                                         field.name, F.to_json(F.col(field.name))
+                                     )
+
+                     migrated_df.write.format("delta").mode("overwrite").option(
+                         "overwriteSchema", "true"
+                     ).save(path)
+                     logger.info(f"Schema migration completed for {name}")
+
+             elif self.engine and self.engine.name == "pandas":
+                 from deltalake import DeltaTable
+
+                 storage_opts = self._get_storage_options()
+                 dt = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
+                 existing_schema = dt.schema()
+                 existing_fields = {f.name: f.type for f in existing_schema.fields}
+
+                 needs_migration = False
+                 for field in expected_schema.fields:
+                     if field.name in existing_fields:
+                         existing_type_str = str(existing_fields[field.name])
+                         expected_type_str = field.dataType.simpleString()
+                         if "array" in existing_type_str.lower() and expected_type_str == "string":
+                             needs_migration = True
+                             break
+
+                 if needs_migration:
+                     logger.info(f"Migrating schema for {name}...")
+                     import json
+
+                     df = dt.to_pandas()
+                     for field in expected_schema.fields:
+                         if field.name in df.columns and field.name in existing_fields:
+                             existing_type_str = str(existing_fields[field.name])
+                             if "array" in existing_type_str.lower():
+                                 df[field.name] = df[field.name].apply(
+                                     lambda x: json.dumps(x) if isinstance(x, list) else x
+                                 )
+
+                     from deltalake import write_deltalake
+
+                     storage_opts = self._get_storage_options()
+                     write_deltalake(
+                         path,
+                         df,
+                         mode="overwrite",
+                         overwrite_schema=True,
+                         storage_options=storage_opts if storage_opts else None,
+                     )
+                     logger.info(f"Schema migration completed for {name}")
+
+         except Exception as e:
+             logger.warning(f"Schema migration check failed for {name}: {e}")
+
+     def _table_exists(self, path: str) -> bool:
+         if self.spark:
+             try:
+                 self.spark.read.format("delta").load(path).limit(0).collect()
+                 return True
+             except Exception as e:
+                 # If AnalysisException or "Path does not exist", return False
+                 # Otherwise, if it's an auth error, we might want to warn.
+                 msg = str(e).lower()
+                 if (
+                     "path does not exist" in msg
+                     or "filenotfound" in msg
+                     or "analysisexception" in type(e).__name__.lower()
+                 ):
+                     return False
+
+                 logger.warning(f"Error checking if table exists at {path}: {e}")
+                 return False
+         elif self.engine:
+             import os
+
+             # For cloud paths, try to load with delta-rs
+             if path.startswith(("abfss://", "az://", "s3://", "gs://", "https://")):
+                 try:
+                     from deltalake import DeltaTable
+
+                     storage_opts = self._get_storage_options()
+                     DeltaTable(path, storage_options=storage_opts if storage_opts else None)
+                     return True
+                 except Exception:
+                     return False
+
+             # For local paths, check if directory exists and has content
+             if not os.path.exists(path):
+                 return False
+             if os.path.isdir(path):
+                 # Check if empty or contains relevant files
+                 if not os.listdir(path):
+                     return False
+                 return True
+             return False
+         return False
+
+     def _get_schema_meta_tables(self) -> StructType:
+         """
+         meta_tables (Inventory): Tracks physical assets.
+         """
+         return StructType(
+             [
+                 StructField("project_name", StringType(), True),
+                 StructField("table_name", StringType(), True),
+                 StructField("path", StringType(), True),
+                 StructField("format", StringType(), True),
+                 StructField("pattern_type", StringType(), True),
+                 StructField("schema_hash", StringType(), True),
+                 StructField("updated_at", TimestampType(), True),
+             ]
+         )
+
+     def _get_schema_meta_runs(self) -> StructType:
+         """
+         meta_runs (Observability): Tracks execution history.
+         """
+         return StructType(
+             [
+                 StructField("run_id", StringType(), True),
+                 StructField("pipeline_name", StringType(), True),
+                 StructField("node_name", StringType(), True),
+                 StructField("status", StringType(), True),
+                 StructField("rows_processed", LongType(), True),
+                 StructField("duration_ms", LongType(), True),
+                 StructField("metrics_json", StringType(), True),
+                 StructField("environment", StringType(), True),
+                 StructField("timestamp", TimestampType(), True),
+                 StructField("date", DateType(), True),
+             ]
+         )
+
+     def _get_schema_meta_patterns(self) -> StructType:
+         """
+         meta_patterns (Governance): Tracks pattern compliance.
+         """
+         return StructType(
+             [
+                 StructField("table_name", StringType(), True),
+                 StructField("pattern_type", StringType(), True),
+                 StructField("configuration", StringType(), True),
+                 StructField("compliance_score", DoubleType(), True),
+             ]
+         )
+
+     def _get_schema_meta_metrics(self) -> StructType:
+         """
+         meta_metrics (Semantics): Tracks business logic.
+         Note: dimensions is stored as JSON string for cross-engine portability.
+         """
+         return StructType(
+             [
+                 StructField("metric_name", StringType(), True),
+                 StructField("definition_sql", StringType(), True),
+                 StructField("dimensions", StringType(), True),
+                 StructField("source_table", StringType(), True),
+             ]
+         )
+
+     def _get_schema_meta_state(self) -> StructType:
+         """
+         meta_state (HWM Key-Value Store): Tracks high-water marks for incremental loads.
+         Uses a generic key/value pattern for flexibility.
+         """
+         return StructType(
+             [
+                 StructField("key", StringType(), False),
+                 StructField("value", StringType(), True),
+                 StructField("environment", StringType(), True),
+                 StructField("updated_at", TimestampType(), True),
+             ]
+         )
+
+     def _get_schema_meta_pipelines(self) -> StructType:
+         """
+         meta_pipelines (Definitions): Tracks pipeline configurations.
+         """
+         return StructType(
+             [
+                 StructField("pipeline_name", StringType(), True),
+                 StructField("version_hash", StringType(), True),
+                 StructField("description", StringType(), True),
+                 StructField("layer", StringType(), True),
+                 StructField("schedule", StringType(), True),
+                 StructField("tags_json", StringType(), True),
+                 StructField("updated_at", TimestampType(), True),
+             ]
+         )
+
+     def _get_schema_meta_nodes(self) -> StructType:
+         """
+         meta_nodes (Definitions): Tracks node configurations within pipelines.
+         """
+         return StructType(
+             [
+                 StructField("pipeline_name", StringType(), True),
+                 StructField("node_name", StringType(), True),
+                 StructField("version_hash", StringType(), True),
+                 StructField("type", StringType(), True),  # read/transform/write
+                 StructField("config_json", StringType(), True),
+                 StructField("updated_at", TimestampType(), True),
+             ]
+         )
+
+     def _get_schema_meta_schemas(self) -> StructType:
+         """
+         meta_schemas (Schema Version Tracking): Tracks schema changes over time.
+         """
+         return StructType(
+             [
+                 StructField("table_path", StringType(), False),
+                 StructField("schema_version", LongType(), False),
+                 StructField("schema_hash", StringType(), False),
+                 StructField("columns", StringType(), False),  # JSON: {"col": "type", ...}
+                 StructField("captured_at", TimestampType(), False),
+                 StructField("pipeline", StringType(), True),
+                 StructField("node", StringType(), True),
+                 StructField("run_id", StringType(), True),
+                 StructField("columns_added", StringType(), True),  # JSON array as string
+                 StructField("columns_removed", StringType(), True),  # JSON array as string
+                 StructField("columns_type_changed", StringType(), True),  # JSON array as string
+             ]
+         )
+
+     def _get_schema_meta_lineage(self) -> StructType:
+         """
+         meta_lineage (Cross-Pipeline Lineage): Tracks table-level lineage relationships.
+         """
+         return StructType(
+             [
+                 StructField("source_table", StringType(), False),
+                 StructField("target_table", StringType(), False),
+                 StructField("source_pipeline", StringType(), True),
+                 StructField("source_node", StringType(), True),
+                 StructField("target_pipeline", StringType(), True),
+                 StructField("target_node", StringType(), True),
+                 StructField("relationship", StringType(), False),  # "feeds" | "derived_from"
+                 StructField("last_observed", TimestampType(), False),
+                 StructField("run_id", StringType(), True),
+             ]
+         )
+
+     def _get_schema_meta_outputs(self) -> StructType:
+         """
+         meta_outputs (Node Outputs Registry): Tracks output metadata for cross-pipeline dependencies.
+
+         Stores output metadata for every node that has a `write` block.
+         Primary key: (pipeline_name, node_name)
+         """
+         return StructType(
+             [
+                 StructField("pipeline_name", StringType(), False),
+                 StructField("node_name", StringType(), False),
+                 StructField(
+                     "output_type", StringType(), False
+                 ),  # "external_table" | "managed_table"
+                 StructField("connection_name", StringType(), True),
+                 StructField("path", StringType(), True),
+                 StructField("format", StringType(), True),
+                 StructField("table_name", StringType(), True),
+                 StructField("last_run", TimestampType(), False),
+                 StructField("row_count", LongType(), True),
+                 StructField("updated_at", TimestampType(), False),
+             ]
+         )
+
+     def get_registered_pipeline(self, pipeline_name: str) -> Optional[Dict[str, Any]]:
+         """
+         Get existing registered pipeline record with version_hash.
+
+         Args:
+             pipeline_name: Name of the pipeline to look up
+
+         Returns:
+             Dict with pipeline record including version_hash, or None if not found
+         """
+         pipelines_cache = self._get_all_pipelines_cached()
+         return pipelines_cache.get(pipeline_name)
+
+     def get_registered_nodes(self, pipeline_name: str) -> Dict[str, str]:
+         """
+         Get existing registered nodes for a pipeline with their version hashes.
+
+         Args:
+             pipeline_name: Name of the pipeline to look up nodes for
+
+         Returns:
+             Dict mapping node_name -> version_hash for all registered nodes
+         """
+         nodes_cache = self._get_all_nodes_cached()
+         return nodes_cache.get(pipeline_name, {})
+
+     def get_all_registered_pipelines(self) -> Dict[str, str]:
+         """
+         Get all registered pipelines with their version hashes.
+
+         Returns:
+             Dict mapping pipeline_name -> version_hash
+         """
+         pipelines_cache = self._get_all_pipelines_cached()
+         return {name: data.get("version_hash", "") for name, data in pipelines_cache.items()}
+
+     def get_all_registered_nodes(self, pipeline_names: List[str]) -> Dict[str, Dict[str, str]]:
+         """
+         Get all registered nodes for multiple pipelines with their version hashes.
+
+         Args:
+             pipeline_names: List of pipeline names to look up nodes for
+
+         Returns:
+             Dict mapping pipeline_name -> {node_name -> version_hash}
+         """
+         nodes_cache = self._get_all_nodes_cached()
+         return {name: nodes_cache.get(name, {}) for name in pipeline_names}
+
+     def register_pipelines_batch(
+         self,
+         records: List[Dict[str, Any]],
+     ) -> None:
+         """
+         Batch registers/upserts multiple pipeline definitions to meta_pipelines.
+
+         Args:
+             records: List of dicts with keys: pipeline_name, version_hash, description,
+                 layer, schedule, tags_json
+         """
+         if not self.spark and not self.engine:
+             return
+
+         if not records:
+             return
+
+         try:
+             from datetime import datetime, timezone
+
+             if self.spark:
+                 from pyspark.sql import functions as F
+
+                 schema = self._get_schema_meta_pipelines()
+                 input_schema = StructType(schema.fields[:-1])  # Exclude updated_at
+
+                 rows = [
+                     (
+                         r["pipeline_name"],
+                         r["version_hash"],
+                         r["description"],
+                         r["layer"],
+                         r["schedule"],
+                         r["tags_json"],
+                     )
+                     for r in records
+                 ]
+                 df = self.spark.createDataFrame(rows, input_schema)
+                 df = df.withColumn("updated_at", F.current_timestamp())
+
+                 view_name = "_odibi_meta_pipelines_batch_upsert"
+                 df.createOrReplaceTempView(view_name)
+
+                 target_path = self.tables["meta_pipelines"]
+
+                 merge_sql = f"""
+                     MERGE INTO delta.`{target_path}` AS target
+                     USING {view_name} AS source
+                     ON target.pipeline_name = source.pipeline_name
+                     WHEN MATCHED THEN UPDATE SET
+                         target.version_hash = source.version_hash,
+                         target.description = source.description,
+                         target.layer = source.layer,
+                         target.schedule = source.schedule,
+                         target.tags_json = source.tags_json,
+                         target.updated_at = source.updated_at
+                     WHEN NOT MATCHED THEN INSERT *
+                 """
+                 self.spark.sql(merge_sql)
+                 self.spark.catalog.dropTempView(view_name)
+
+             elif self.engine:
+                 import pandas as pd
+
+                 data = {
+                     "pipeline_name": [r["pipeline_name"] for r in records],
+                     "version_hash": [r["version_hash"] for r in records],
+                     "description": [r["description"] for r in records],
+                     "layer": [r["layer"] for r in records],
+                     "schedule": [r["schedule"] for r in records],
+                     "tags_json": [r["tags_json"] for r in records],
+                     "updated_at": [datetime.now(timezone.utc) for _ in records],
+                 }
+                 df = pd.DataFrame(data)
+
+                 def do_write():
+                     self.engine.write(
+                         df,
+                         connection=self.connection,
+                         format="delta",
+                         path=self.tables["meta_pipelines"],
+                         mode="upsert",
+                         options={"keys": ["pipeline_name"]},
+                     )
+
+                 self._retry_with_backoff(do_write)
+
+             self._pipelines_cache = None
+             logger.debug(f"Batch registered {len(records)} pipeline(s)")
+
+         except Exception as e:
+             logger.warning(f"Failed to batch register pipelines: {e}")
+
+     def register_nodes_batch(
+         self,
+         records: List[Dict[str, Any]],
+     ) -> None:
+         """
+         Batch registers/upserts multiple node definitions to meta_nodes.
+
+         Args:
+             records: List of dicts with keys: pipeline_name, node_name, version_hash,
+                 type, config_json
+         """
+         if not self.spark and not self.engine:
+             return
+
+         if not records:
+             return
+
+         try:
+             from datetime import datetime, timezone
+
+             if self.spark:
+                 from pyspark.sql import functions as F
+
+                 schema = self._get_schema_meta_nodes()
+                 input_schema = StructType(schema.fields[:-1])  # Exclude updated_at
+
+                 rows = [
+                     (
+                         r["pipeline_name"],
+                         r["node_name"],
+                         r["version_hash"],
+                         r["type"],
+                         r["config_json"],
+                     )
+                     for r in records
+                 ]
+                 df = self.spark.createDataFrame(rows, input_schema)
+                 df = df.withColumn("updated_at", F.current_timestamp())
+
+                 view_name = "_odibi_meta_nodes_batch_upsert"
+                 df.createOrReplaceTempView(view_name)
+
+                 target_path = self.tables["meta_nodes"]
+
+                 merge_sql = f"""
+                     MERGE INTO delta.`{target_path}` AS target
+                     USING {view_name} AS source
+                     ON target.pipeline_name = source.pipeline_name
+                         AND target.node_name = source.node_name
+                     WHEN MATCHED THEN UPDATE SET
+                         target.version_hash = source.version_hash,
+                         target.type = source.type,
+                         target.config_json = source.config_json,
+                         target.updated_at = source.updated_at
+                     WHEN NOT MATCHED THEN INSERT *
+                 """
+                 self.spark.sql(merge_sql)
+                 self.spark.catalog.dropTempView(view_name)
+
+             elif self.engine:
+                 import pandas as pd
+
+                 data = {
+                     "pipeline_name": [r["pipeline_name"] for r in records],
+                     "node_name": [r["node_name"] for r in records],
+                     "version_hash": [r["version_hash"] for r in records],
+                     "type": [r["type"] for r in records],
+                     "config_json": [r["config_json"] for r in records],
+                     "updated_at": [datetime.now(timezone.utc) for _ in records],
+                 }
+                 df = pd.DataFrame(data)
+
+                 def do_write():
+                     self.engine.write(
+                         df,
+                         connection=self.connection,
+                         format="delta",
+                         path=self.tables["meta_nodes"],
+                         mode="upsert",
+                         options={"keys": ["pipeline_name", "node_name"]},
+                     )
+
+                 self._retry_with_backoff(do_write)
+
+             self._nodes_cache = None
+             logger.debug(f"Batch registered {len(records)} node(s)")
+
+         except Exception as e:
+             logger.warning(f"Failed to batch register nodes: {e}")
+
+     def register_outputs_batch(
+         self,
+         records: List[Dict[str, Any]],
+     ) -> None:
+         """
+         Batch registers/upserts multiple node outputs to meta_outputs.
+
+         Uses MERGE INTO for efficient upsert. This is performance critical -
+         all outputs are collected during pipeline execution and written in a
+         single batch at the end.
+
+         Args:
+             records: List of dicts with keys:
+                 - pipeline_name: str (pipeline identifier)
+                 - node_name: str (node identifier)
+                 - output_type: str ("external_table" | "managed_table")
+                 - connection_name: str (nullable, for external tables)
+                 - path: str (nullable, storage path)
+                 - format: str (delta, parquet, etc.)
+                 - table_name: str (nullable, registered table name)
+                 - last_run: datetime (execution timestamp)
+                 - row_count: int (nullable)
+         """
+         if not self.spark and not self.engine:
+             return
+
+         if not records:
+             return
+
+         try:
+             if self.spark:
+                 from pyspark.sql import functions as F
+
+                 schema = self._get_schema_meta_outputs()
+                 input_schema = StructType(schema.fields[:-1])  # Exclude updated_at
+
+                 rows = [
+                     (
+                         r["pipeline_name"],
+                         r["node_name"],
+                         r["output_type"],
+                         r.get("connection_name"),
+                         r.get("path"),
+                         r.get("format"),
+                         r.get("table_name"),
+                         r["last_run"],
+                         r.get("row_count"),
+                     )
+                     for r in records
+                 ]
+                 df = self.spark.createDataFrame(rows, input_schema)
+                 df = df.withColumn("updated_at", F.current_timestamp())
+
+                 view_name = "_odibi_meta_outputs_batch_upsert"
+                 df.createOrReplaceTempView(view_name)
+
+                 target_path = self.tables["meta_outputs"]
+
+                 merge_sql = f"""
+                     MERGE INTO delta.`{target_path}` AS target
+                     USING {view_name} AS source
+                     ON target.pipeline_name = source.pipeline_name
+                         AND target.node_name = source.node_name
+                     WHEN MATCHED THEN UPDATE SET
+                         target.output_type = source.output_type,
+                         target.connection_name = source.connection_name,
+                         target.path = source.path,
+                         target.format = source.format,
+                         target.table_name = source.table_name,
+                         target.last_run = source.last_run,
+                         target.row_count = source.row_count,
+                         target.updated_at = source.updated_at
+                     WHEN NOT MATCHED THEN INSERT *
+                 """
+                 self.spark.sql(merge_sql)
+                 self.spark.catalog.dropTempView(view_name)
+
+             elif self.engine:
+                 import pandas as pd
+
+                 data = {
+                     "pipeline_name": [r["pipeline_name"] for r in records],
+                     "node_name": [r["node_name"] for r in records],
+                     "output_type": [r["output_type"] for r in records],
+                     "connection_name": [r.get("connection_name") for r in records],
+                     "path": [r.get("path") for r in records],
+                     "format": [r.get("format") for r in records],
+                     "table_name": [r.get("table_name") for r in records],
+                     "last_run": [r["last_run"] for r in records],
+                     "row_count": [r.get("row_count") for r in records],
+                     "updated_at": [datetime.now(timezone.utc) for _ in records],
+                 }
+                 df = pd.DataFrame(data)
+
+                 def do_write():
+                     self.engine.write(
+                         df,
+                         connection=self.connection,
+                         format="delta",
+                         path=self.tables["meta_outputs"],
+                         mode="upsert",
+                         options={"keys": ["pipeline_name", "node_name"]},
+                     )
+
+                 self._retry_with_backoff(do_write)
+
+             self._outputs_cache = None
+             logger.debug(f"Batch registered {len(records)} output(s)")
+
+         except Exception as e:
+             logger.warning(f"Failed to batch register outputs: {e}")
+
+     def _get_all_outputs_cached(self) -> Dict[str, Dict[str, Any]]:
+         """
+         Get all outputs with caching.
+
+         Returns:
+             Dict mapping "{pipeline_name}.{node_name}" -> output record
+         """
+         # Thread-safe check: if cache exists and is populated, return it
+         if self._outputs_cache is not None:
+             return self._outputs_cache
+
+         # Build cache in a local variable first to avoid race conditions
+         cache: Dict[str, Dict[str, Any]] = {}
+         if not self.spark and not self.engine:
+             self._outputs_cache = cache
+             return self._outputs_cache
+
+         try:
+             if self.spark:
+                 df = self.spark.read.format("delta").load(self.tables["meta_outputs"])
+                 rows = df.collect()
+                 for row in rows:
+                     row_dict = row.asDict()
+                     key = f"{row_dict['pipeline_name']}.{row_dict['node_name']}"
+                     cache[key] = row_dict
+             elif self.engine:
+                 df = self._read_local_table(self.tables["meta_outputs"])
+                 if not df.empty and "pipeline_name" in df.columns:
+                     for _, row in df.iterrows():
+                         key = f"{row['pipeline_name']}.{row['node_name']}"
+                         cache[key] = row.to_dict()
+         except Exception as e:
+             logger.warning(f"Could not cache outputs from {self.tables.get('meta_outputs')}: {e}")
+
+         # Atomic assignment after building complete cache
+         self._outputs_cache = cache
+         return self._outputs_cache
+
+     def get_node_output(
+         self,
+         pipeline_name: str,
+         node_name: str,
+     ) -> Optional[Dict[str, Any]]:
+         """
+         Retrieves output metadata for a specific node.
+
+         Used for cross-pipeline dependency resolution ($pipeline.node references).
+
+         Args:
+             pipeline_name: Name of the pipeline
+             node_name: Name of the node
+
+         Returns:
+             Dict with output metadata or None if not found.
+             Keys: pipeline_name, node_name, output_type, connection_name,
+                 path, format, table_name, last_run, row_count
+         """
+         outputs_cache = self._get_all_outputs_cached()
+         key = f"{pipeline_name}.{node_name}"
+         return outputs_cache.get(key)
+
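Editorial sketch (not part of the package source): how a cross-pipeline `$pipeline.node` reference might be resolved against meta_outputs using the get_node_output method shown above. The pipeline/node names and the read step are illustrative assumptions; only get_node_output itself comes from the diff.

    # Assumed example values; `catalog` and `spark` as set up earlier.
    output = catalog.get_node_output("sales_bronze", "orders_raw")
    if output is None:
        raise ValueError("referenced node has not registered an output yet")
    path = output["path"]                 # e.g. an abfss:// or local Delta path
    fmt = output.get("format") or "delta"
    df = spark.read.format(fmt).load(path)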
+     def register_outputs_from_config(
+         self,
+         pipeline_config: Any,
+     ) -> int:
+         """
+         Pre-register node outputs from pipeline config without running the pipeline.
+
+         Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
+         and registers them to meta_outputs. This enables cross-pipeline references
+         without requiring the source pipeline to have run first.
+
+         Args:
+             pipeline_config: Pipeline configuration object with nodes
+
+         Returns:
+             Number of outputs registered
+         """
+         from datetime import datetime
+
+         records = []
+         pipeline_name = pipeline_config.pipeline
+
+         for node in pipeline_config.nodes:
+             output_info = self._extract_node_output_info(node)
+             if output_info:
+                 records.append(
+                     {
+                         "pipeline_name": pipeline_name,
+                         "node_name": node.name,
+                         "output_type": output_info.get("output_type", "external_table"),
+                         "connection_name": output_info.get("connection"),
+                         "path": output_info.get("path"),
+                         "format": output_info.get("format", "delta"),
+                         "table_name": output_info.get("register_table"),
+                         "last_run": datetime.now(),
+                         "row_count": None,
+                     }
+                 )
+
+         if records:
+             self.register_outputs_batch(records)
+             self._outputs_cache = None
+
+         return len(records)
+
+     def _extract_node_output_info(self, node_config: Any) -> Optional[Dict[str, Any]]:
+         """
+         Extract output location from a node config.
+
+         Checks in order of precedence:
+         1. Explicit write block
+         2. merge/scd2 in transform steps
+         3. Top-level merge/scd2 transformer
+
+         Args:
+             node_config: Node configuration object
+
+         Returns:
+             Dict with connection, path, format, register_table or None
+         """
+         if node_config.write:
+             write_cfg = node_config.write
+             output_type = (
+                 "managed_table" if write_cfg.table and not write_cfg.path else "external_table"
+             )
+             return {
+                 "connection": write_cfg.connection,
+                 "path": write_cfg.path,
+                 "format": write_cfg.format or "delta",
+                 "register_table": write_cfg.register_table or write_cfg.table,
+                 "output_type": output_type,
+             }
+
+         output_functions = {"merge", "scd2"}
+
+         if node_config.transform and node_config.transform.steps:
+             for step in reversed(node_config.transform.steps):
+                 if isinstance(step, str):
+                     continue
+
+                 if hasattr(step, "function") and step.function in output_functions:
+                     params = step.params or {}
+                     connection = params.get("connection")
+                     path = params.get("path") or params.get("target")
+                     register_table = params.get("register_table")
+
+                     if connection and path:
+                         return {
+                             "connection": connection,
+                             "path": path,
+                             "format": "delta",
+                             "register_table": register_table,
+                             "output_type": "managed_table" if register_table else "external_table",
+                         }
+
+         if node_config.transformer in output_functions and node_config.params:
+             params = node_config.params
+             connection = params.get("connection")
+             path = params.get("path") or params.get("target")
+             register_table = params.get("register_table")
+
+             if connection and path:
+                 return {
+                     "connection": connection,
+                     "path": path,
+                     "format": "delta",
+                     "register_table": register_table,
+                     "output_type": "managed_table" if register_table else "external_table",
+                 }
+
+         return None
+
+     def _prepare_pipeline_record(self, pipeline_config: Any) -> Dict[str, Any]:
+         """Prepare a pipeline record for batch registration."""
+         from odibi.utils.hashing import calculate_pipeline_hash
+
+         version_hash = calculate_pipeline_hash(pipeline_config)
+
+         all_tags = set()
+         for node in pipeline_config.nodes:
+             if node.tags:
+                 all_tags.update(node.tags)
+
+         return {
+             "pipeline_name": pipeline_config.pipeline,
+             "version_hash": version_hash,
+             "description": pipeline_config.description or "",
+             "layer": pipeline_config.layer or "",
+             "schedule": "",
+             "tags_json": json.dumps(list(all_tags)),
+         }
+
+     def register_pipeline(
+         self,
+         pipeline_config: Any,
+         project_config: Optional[Any] = None,
+         skip_if_unchanged: bool = False,
+     ) -> bool:
+         """
+         Registers/Upserts a pipeline definition to meta_pipelines.
+
+         .. deprecated::
+             Use :meth:`register_pipelines_batch` for better performance.
+
+         Args:
+             pipeline_config: The pipeline configuration object
+             project_config: Optional project configuration
+             skip_if_unchanged: If True, skip write if version_hash matches existing
+
+         Returns:
+             True if write was performed, False if skipped
+         """
+         import warnings
+
+         warnings.warn(
+             "register_pipeline is deprecated, use register_pipelines_batch for better performance",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         if not self.spark and not self.engine:
+             return False
+
+         try:
+             record = self._prepare_pipeline_record(pipeline_config)
+
+             if skip_if_unchanged:
+                 existing = self.get_registered_pipeline(pipeline_config.pipeline)
+                 if existing and existing.get("version_hash") == record["version_hash"]:
+                     logger.debug(f"Skipping pipeline '{pipeline_config.pipeline}' - unchanged")
+                     return False
+
+             self.register_pipelines_batch([record])
+             return True
+
+         except Exception as e:
+             logger.warning(f"Failed to register pipeline '{pipeline_config.pipeline}': {e}")
+             return False
+
+     def _prepare_node_record(self, pipeline_name: str, node_config: Any) -> Dict[str, Any]:
+         """Prepare a node record for batch registration."""
+         from odibi.utils.hashing import calculate_node_hash
+
+         version_hash = calculate_node_hash(node_config)
+
+         node_type = "transform"
+         if node_config.read:
+             node_type = "read"
+         if node_config.write:
+             node_type = "write"
+
+         if hasattr(node_config, "model_dump"):
+             dump = node_config.model_dump(mode="json", exclude={"description", "tags", "log_level"})
+         else:
+             dump = node_config.model_dump(exclude={"description", "tags", "log_level"})
+
+         return {
+             "pipeline_name": pipeline_name,
+             "node_name": node_config.name,
+             "version_hash": version_hash,
+             "type": node_type,
+             "config_json": json.dumps(dump),
+         }
+
+     def register_node(
+         self,
+         pipeline_name: str,
+         node_config: Any,
+         skip_if_unchanged: bool = False,
+         existing_hash: Optional[str] = None,
+     ) -> bool:
+         """
+         Registers/Upserts a node definition to meta_nodes.
+
+         .. deprecated::
+             Use :meth:`register_nodes_batch` for better performance.
+
+         Args:
+             pipeline_name: Name of the parent pipeline
+             node_config: The node configuration object
+             skip_if_unchanged: If True, skip write if version_hash matches existing
+             existing_hash: Pre-fetched existing hash (to avoid re-reading)
+
+         Returns:
+             True if write was performed, False if skipped
+         """
+         import warnings
+
+         warnings.warn(
+             "register_node is deprecated, use register_nodes_batch for better performance",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         if not self.spark and not self.engine:
+             return False
+
+         try:
+             record = self._prepare_node_record(pipeline_name, node_config)
+
+             if skip_if_unchanged:
+                 current_hash = existing_hash
+                 if current_hash is None:
+                     nodes = self.get_registered_nodes(pipeline_name)
+                     current_hash = nodes.get(node_config.name)
+
+                 if current_hash == record["version_hash"]:
+                     logger.debug(f"Skipping node '{node_config.name}' - unchanged")
+                     return False
+
+             self.register_nodes_batch([record])
+             return True
+
+         except Exception as e:
+             logger.warning(f"Failed to register node '{node_config.name}': {e}")
+             return False
+
+     def log_run(
+         self,
+         run_id: str,
+         pipeline_name: str,
+         node_name: str,
+         status: str,
+         rows_processed: Optional[int] = 0,
+         duration_ms: Optional[int] = 0,
+         metrics_json: Optional[str] = "{}",
+     ) -> None:
+         """
+         Logs execution telemetry to meta_runs.
+
+         Note: For better performance with multiple nodes, use log_runs_batch() instead.
+         """
+         environment = getattr(self.config, "environment", None)
+
+         # SQL Server mode - direct insert
+         if self.is_sql_server_mode:
+             self._log_run_sql_server(
+                 run_id,
+                 pipeline_name,
+                 node_name,
+                 status,
+                 rows_processed,
+                 duration_ms,
+                 metrics_json,
+                 environment,
+             )
+             return
+
+         if not self.spark and not self.engine:
+             return
+
+         def _do_log_run():
+             if self.spark:
+                 from pyspark.sql import functions as F
+
+                 rows = [
+                     (
+                         run_id,
+                         pipeline_name,
+                         node_name,
+                         status,
+                         rows_processed,
+                         duration_ms,
+                         metrics_json,
+                         environment,
+                     )
+                 ]
+                 schema = self._get_schema_meta_runs()
+                 input_schema = StructType(schema.fields[:-2])
+
+                 df = self.spark.createDataFrame(rows, input_schema)
+                 df = df.withColumn("timestamp", F.current_timestamp()).withColumn(
+                     "date", F.to_date(F.col("timestamp"))
+                 )
+
+                 df.write.format("delta").mode("append").save(self.tables["meta_runs"])
+             elif self.engine:
+                 from datetime import datetime, timezone
+
+                 import pandas as pd
+
+                 timestamp = datetime.now(timezone.utc)
+
+                 data = {
+                     "run_id": [run_id],
+                     "pipeline_name": [pipeline_name],
+                     "node_name": [node_name],
+                     "status": [status],
+                     "rows_processed": [rows_processed],
+                     "duration_ms": [duration_ms],
+                     "metrics_json": [metrics_json],
+                     "environment": [environment],
+                     "timestamp": [timestamp],
+                     "date": [timestamp.date()],
+                 }
+                 df = pd.DataFrame(data)
+
+                 self.engine.write(
+                     df,
+                     connection=self.connection,
+                     format="delta",
+                     path=self.tables["meta_runs"],
+                     mode="append",
+                 )
+
+         try:
+             self._retry_with_backoff(_do_log_run)
+         except Exception as e:
+             logger.warning(f"Failed to log run to system catalog: {e}")
+
+     def _log_run_sql_server(
+         self,
+         run_id: str,
+         pipeline_name: str,
+         node_name: str,
+         status: str,
+         rows_processed: int,
+         duration_ms: int,
+         metrics_json: str,
+         environment: Optional[str],
+     ) -> None:
+         """Log a run to SQL Server meta_runs table."""
+         schema_name = getattr(self.config, "schema_name", None) or "odibi_system"
+         try:
+             sql = f"""
+                 INSERT INTO [{schema_name}].[meta_runs]
+                 (run_id, pipeline_name, node_name, status, rows_processed, duration_ms,
+                  metrics_json, environment, timestamp, date)
+                 VALUES (:run_id, :pipeline, :node, :status, :rows, :duration,
+                         :metrics, :env, GETUTCDATE(), CAST(GETUTCDATE() AS DATE))
+             """
+             self.connection.execute(
+                 sql,
+                 {
+                     "run_id": run_id,
+                     "pipeline": pipeline_name,
+                     "node": node_name,
+                     "status": status,
+                     "rows": rows_processed or 0,
+                     "duration": duration_ms or 0,
+                     "metrics": metrics_json or "{}",
+                     "env": environment,
+                 },
+             )
+         except Exception as e:
+             logger.warning(f"Failed to log run to SQL Server: {e}")
+
+     def log_runs_batch(
+         self,
+         records: List[Dict[str, Any]],
+     ) -> None:
+         """
+         Batch logs multiple execution records to meta_runs in a single write.
+
+         This is much more efficient than calling log_run() for each node individually.
+
+         Args:
+             records: List of dicts with keys: run_id, pipeline_name, node_name,
+                 status, rows_processed, duration_ms, metrics_json
+         """
+         if not records:
+             return
+
+         environment = getattr(self.config, "environment", None)
+
+         # SQL Server mode - batch insert
+         if self.is_sql_server_mode:
+             for r in records:
+                 self._log_run_sql_server(
+                     r["run_id"],
+                     r["pipeline_name"],
+                     r["node_name"],
+                     r["status"],
+                     r.get("rows_processed", 0),
+                     r.get("duration_ms", 0),
+                     r.get("metrics_json", "{}"),
+                     environment,
+                 )
+             logger.debug(f"Batch logged {len(records)} run records to SQL Server")
+             return
+
+         if not self.spark and not self.engine:
+             return
+
+         def _do_batch_log():
+             if self.spark:
+                 from pyspark.sql import functions as F
+
+                 rows = [
+                     (
+                         r["run_id"],
+                         r["pipeline_name"],
+                         r["node_name"],
+                         r["status"],
+                         r.get("rows_processed", 0),
+                         r.get("duration_ms", 0),
+                         r.get("metrics_json", "{}"),
+                         environment,
+                     )
+                     for r in records
+                 ]
+                 schema = self._get_schema_meta_runs()
+                 input_schema = StructType(schema.fields[:-2])
+
+                 df = self.spark.createDataFrame(rows, input_schema)
+                 df = df.withColumn("timestamp", F.current_timestamp()).withColumn(
+                     "date", F.to_date(F.col("timestamp"))
+                 )
+
+                 df.write.format("delta").mode("append").save(self.tables["meta_runs"])
+                 logger.debug(f"Batch logged {len(records)} run records to meta_runs")
+
+             elif self.engine:
+                 from datetime import datetime, timezone
+
+                 import pandas as pd
+
+                 timestamp = datetime.now(timezone.utc)
+
+                 data = {
+                     "run_id": [r["run_id"] for r in records],
+                     "pipeline_name": [r["pipeline_name"] for r in records],
+                     "node_name": [r["node_name"] for r in records],
+                     "status": [r["status"] for r in records],
+                     "rows_processed": [r.get("rows_processed", 0) for r in records],
+                     "duration_ms": [r.get("duration_ms", 0) for r in records],
+                     "metrics_json": [r.get("metrics_json", "{}") for r in records],
+                     "environment": [environment] * len(records),
+                     "timestamp": [timestamp] * len(records),
+                     "date": [timestamp.date()] * len(records),
+                 }
+                 df = pd.DataFrame(data)
+
+                 self.engine.write(
+                     df,
+                     connection=self.connection,
+                     format="delta",
+                     path=self.tables["meta_runs"],
+                     mode="append",
+                 )
+                 logger.debug(f"Batch logged {len(records)} run records to meta_runs")
+
+         try:
+             self._retry_with_backoff(_do_batch_log)
+         except Exception as e:
+             logger.warning(f"Failed to batch log runs to system catalog: {e}")
+
1596
+ def log_pattern(
1597
+ self,
1598
+ table_name: str,
1599
+ pattern_type: str,
1600
+ configuration: str,
1601
+ compliance_score: float,
1602
+ ) -> None:
1603
+ """
1604
+ Logs pattern usage to meta_patterns.
1605
+ """
1606
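+ # Illustrative usage (editor's sketch; all argument values are made up):
+ #
+ #     catalog.log_pattern(
+ #         table_name="gold.orders",
+ #         pattern_type="fact",
+ #         configuration='{"grain": "order_id"}',
+ #         compliance_score=0.95,
+ #     )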
+ if not self.spark and not self.engine:
1607
+ return
1608
+
1609
+ def _do_log_pattern():
1610
+ if self.spark:
1611
+ rows = [
1612
+ (
1613
+ table_name,
1614
+ pattern_type,
1615
+ configuration,
1616
+ compliance_score,
1617
+ )
1618
+ ]
1619
+ schema = self._get_schema_meta_patterns()
1620
+
1621
+ df = self.spark.createDataFrame(rows, schema)
1622
+
1623
+ # Append to meta_patterns
1624
+ df.write.format("delta").mode("append").save(self.tables["meta_patterns"])
1625
+
1626
+ elif self.engine:
1627
+ import pandas as pd
1628
+
1629
+ data = {
1630
+ "table_name": [table_name],
1631
+ "pattern_type": [pattern_type],
1632
+ "configuration": [configuration],
1633
+ "compliance_score": [compliance_score],
1634
+ }
1635
+ df = pd.DataFrame(data)
1636
+
1637
+ self.engine.write(
1638
+ df,
1639
+ connection=self.connection,
1640
+ format="delta",
1641
+ path=self.tables["meta_patterns"],
1642
+ mode="append",
1643
+ )
1644
+
1645
+ try:
1646
+ self._retry_with_backoff(_do_log_pattern)
1647
+ except Exception as e:
1648
+ logger.warning(f"Failed to log pattern to system catalog: {e}")
1649
+
1650
+ def register_asset(
1651
+ self,
1652
+ project_name: str,
1653
+ table_name: str,
1654
+ path: str,
1655
+ format: str,
1656
+ pattern_type: str,
1657
+ schema_hash: str = "",
1658
+ ) -> None:
1659
+ """
1660
+ Registers/Upserts a physical asset to meta_tables.
1661
+ """
1662
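+ # Illustrative usage (editor's sketch; the project, table, and path are placeholders):
+ #
+ #     catalog.register_asset(
+ #         project_name="demo",
+ #         table_name="gold.orders",
+ #         path="/lake/gold/orders",
+ #         format="delta",
+ #         pattern_type="fact",
+ #     )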
+ if not self.spark and not self.engine:
1663
+ return
1664
+
1665
+ def _do_register():
1666
+ if self.spark:
1667
+ from pyspark.sql import functions as F
1668
+
1669
+ # Prepare data
1670
+ rows = [
1671
+ (
1672
+ project_name,
1673
+ table_name,
1674
+ path,
1675
+ format,
1676
+ pattern_type,
1677
+ schema_hash,
1678
+ )
1679
+ ]
1680
+ schema = self._get_schema_meta_tables()
1681
+ input_schema = StructType(schema.fields[:-1]) # Exclude updated_at
1682
+
1683
+ df = self.spark.createDataFrame(rows, input_schema)
1684
+ df = df.withColumn("updated_at", F.current_timestamp())
1685
+
1686
+ # Merge logic: stage the incoming row in a temp view,
1687
+ # then MERGE it into the target meta_tables Delta table.
1688
+ view_name = f"_odibi_meta_tables_upsert_{abs(hash(table_name))}"
1689
+ df.createOrReplaceTempView(view_name)
1690
+
1691
+ target_path = self.tables["meta_tables"]
1692
+
1693
+ merge_sql = f"""
1694
+ MERGE INTO delta.`{target_path}` AS target
1695
+ USING {view_name} AS source
1696
+ ON target.project_name = source.project_name
1697
+ AND target.table_name = source.table_name
1698
+ WHEN MATCHED THEN UPDATE SET
1699
+ target.path = source.path,
1700
+ target.format = source.format,
1701
+ target.pattern_type = source.pattern_type,
1702
+ target.schema_hash = source.schema_hash,
1703
+ target.updated_at = source.updated_at
1704
+ WHEN NOT MATCHED THEN INSERT *
1705
+ """
1706
+ self.spark.sql(merge_sql)
1707
+ self.spark.catalog.dropTempView(view_name)
1708
+ elif self.engine:
1709
+ from datetime import datetime, timezone
1710
+
1711
+ import pandas as pd
1712
+
1713
+ # Construct DataFrame
1714
+ data = {
1715
+ "project_name": [project_name],
1716
+ "table_name": [table_name],
1717
+ "path": [path],
1718
+ "format": [format],
1719
+ "pattern_type": [pattern_type],
1720
+ "schema_hash": [schema_hash],
1721
+ "updated_at": [datetime.now(timezone.utc)],
1722
+ }
1723
+ df = pd.DataFrame(data)
1724
+
1725
+ target_path = self.tables["meta_tables"]
1726
+
1727
+ # Avoid the Merge transformer here: the catalog sits beneath it, so calling
1728
+ # it would create a circular dependency. Rely on the engine's own upsert
1729
+ # path instead: PandasEngine.write(..., mode='upsert') delegates to
1730
+ # _handle_generic_upsert or _write_delta, which performs the Delta merge
1731
+ # via dt.merge.
1732
+
1733
+ self.engine.write(
1734
+ df,
1735
+ connection=self.connection,
1736
+ format="delta",
1737
+ path=target_path,
1738
+ mode="upsert",
1739
+ options={"keys": ["project_name", "table_name"]},
1740
+ )
1741
+
1742
+ try:
1743
+ self._retry_with_backoff(_do_register)
1744
+ except Exception as e:
1745
+ logger.warning(f"Failed to register asset in system catalog: {e}")
1746
+
1747
+ def resolve_table_path(self, table_name: str) -> Optional[str]:
1748
+ """
1749
+ Resolves logical table name (e.g. 'gold.orders') to physical path.
1750
+ """
1751
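+ # Illustrative usage (editor's sketch):
+ #
+ #     path = catalog.resolve_table_path("gold.orders")
+ #     if path is None:
+ #         ...  # table is not registered in meta_tables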
+ if self.spark:
1752
+ try:
1753
+ from pyspark.sql import functions as F
1754
+
1755
+ df = self.spark.read.format("delta").load(self.tables["meta_tables"])
1756
+ # Filter
1757
+ row = df.filter(F.col("table_name") == table_name).select("path").first()
1758
+
1759
+ return row.path if row else None
1760
+ except Exception:
1761
+ return None
1762
+ elif self.engine:
1763
+ df = self._read_local_table(self.tables["meta_tables"])
1764
+ if df.empty:
1765
+ return None
1766
+
1767
+ # Pandas filtering
1768
+ if "table_name" not in df.columns:
1769
+ return None
1770
+
1771
+ row = df[df["table_name"] == table_name]
1772
+ if not row.empty:
1773
+ return row.iloc[0]["path"]
1774
+ return None
1775
+
1776
+ return None
1777
+
1778
+ def get_pipeline_hash(self, pipeline_name: str) -> Optional[str]:
1779
+ """
1780
+ Retrieves the version hash of a pipeline from the catalog.
1781
+ """
1782
+ if self.spark:
1783
+ try:
1784
+ from pyspark.sql import functions as F
1785
+
1786
+ df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
1787
+ row = (
1788
+ df.filter(F.col("pipeline_name") == pipeline_name)
1789
+ .select("version_hash")
1790
+ .first()
1791
+ )
1792
+ return row.version_hash if row else None
1793
+ except Exception:
1794
+ return None
1795
+ elif self.engine:
1796
+ df = self._read_local_table(self.tables["meta_pipelines"])
1797
+ if df.empty:
1798
+ return None
1799
+ if "pipeline_name" not in df.columns or "version_hash" not in df.columns:
1800
+ return None
1801
+
1802
+ # Take the latest record in case duplicates exist: upserts should prevent
1803
+ # them, but the parquet fallback can still contain duplicates, so sort by
1804
+ # updated_at descending and pick the first row.
1805
+ if "updated_at" in df.columns:
1806
+ df = df.sort_values("updated_at", ascending=False)
1807
+
1808
+ row = df[df["pipeline_name"] == pipeline_name]
1809
+ if not row.empty:
1810
+ return row.iloc[0]["version_hash"]
1811
+ return None
1812
+ return None
1813
+
1814
+ def get_average_volume(self, node_name: str, days: int = 7) -> Optional[float]:
1815
+ """
1816
+ Calculates average rows processed for a node over last N days.
1817
+ """
1818
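+ # Illustrative usage (editor's sketch; `rows_today` is an assumed value from
+ # the current run):
+ #
+ #     avg_rows = catalog.get_average_volume("load_orders", days=7)
+ #     if avg_rows is not None and rows_today < 0.5 * avg_rows:
+ #         ...  # e.g. flag a possible partial load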
+ if self.spark:
1819
+ try:
1820
+ from pyspark.sql import functions as F
1821
+
1822
+ df = self.spark.read.format("delta").load(self.tables["meta_runs"])
1823
+
1824
+ # Filter by node and success status
1825
+ stats = (
1826
+ df.filter(
1827
+ (F.col("node_name") == node_name)
1828
+ & (F.col("status") == "SUCCESS")
1829
+ & (F.col("timestamp") >= F.date_sub(F.current_date(), days))
1830
+ )
1831
+ .agg(F.avg("rows_processed"))
1832
+ .first()
1833
+ )
1834
+
1835
+ return stats[0] if stats else None
1836
+ except Exception:
1837
+ return None
1838
+ elif self.engine:
1839
+ df = self._read_local_table(self.tables["meta_runs"])
1840
+ if df.empty:
1841
+ return None
1842
+
1843
+ # Need status, node_name, rows_processed, timestamp
1844
+ required = ["status", "node_name", "rows_processed", "timestamp"]
1845
+ if not all(col in df.columns for col in required):
1846
+ return None
1847
+
1848
+ from datetime import datetime, timedelta, timezone
1849
+
1850
+ import pandas as pd
1851
+
1852
+ cutoff = datetime.now(timezone.utc) - timedelta(days=days)
1853
+
1854
+ # Ensure timestamp is datetime
1855
+ if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
1856
+ try:
1857
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
1858
+ except Exception:
1859
+ return None
1860
+
1861
+ filtered = df[
1862
+ (df["node_name"] == node_name)
1863
+ & (df["status"] == "SUCCESS")
1864
+ & (df["timestamp"] >= cutoff)
1865
+ ]
1866
+
1867
+ if filtered.empty:
1868
+ return None
1869
+
1870
+ return float(filtered["rows_processed"].mean())
1871
+
1872
+ return None
1873
+
1874
+ def get_average_duration(self, node_name: str, days: int = 7) -> Optional[float]:
1875
+ """
1876
+ Calculates average duration (seconds) for a node over last N days.
1877
+ """
1878
+ if self.spark:
1879
+ try:
1880
+ from pyspark.sql import functions as F
1881
+
1882
+ df = self.spark.read.format("delta").load(self.tables["meta_runs"])
1883
+
1884
+ stats = (
1885
+ df.filter(
1886
+ (F.col("node_name") == node_name)
1887
+ & (F.col("status") == "SUCCESS")
1888
+ & (F.col("timestamp") >= F.date_sub(F.current_date(), days))
1889
+ )
1890
+ .agg(F.avg("duration_ms"))
1891
+ .first()
1892
+ )
1893
+
1894
+ return stats[0] / 1000.0 if stats and stats[0] is not None else None
1895
+ except Exception:
1896
+ return None
1897
+ elif self.engine:
1898
+ df = self._read_local_table(self.tables["meta_runs"])
1899
+ if df.empty:
1900
+ return None
1901
+
1902
+ from datetime import datetime, timedelta, timezone
1903
+
1904
+ import pandas as pd
1905
+
1906
+ cutoff = datetime.now(timezone.utc) - timedelta(days=days)
1907
+
1908
+ if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
1909
+ try:
1910
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
1911
+ except Exception:
1912
+ return None
1913
+
1914
+ filtered = df[
1915
+ (df["node_name"] == node_name)
1916
+ & (df["status"] == "SUCCESS")
1917
+ & (df["timestamp"] >= cutoff)
1918
+ ]
1919
+
1920
+ if filtered.empty:
1921
+ return None
1922
+
1923
+ avg_ms = float(filtered["duration_ms"].mean())
1924
+ return avg_ms / 1000.0
1925
+
1926
+ return None
1927
+
1928
+ def _read_table(self, path: str):
1929
+ """
1930
+ Read system table using Spark (for remote paths) or local methods.
1931
+ Returns pandas DataFrame. Empty DataFrame on failure.
1932
+ """
1933
+ import pandas as pd
1934
+
1935
+ # Use Spark for remote paths (ADLS, S3, etc.) or when Spark is available
1936
+ if self.spark:
1937
+ try:
1938
+ spark_df = self.spark.read.format("delta").load(path)
1939
+ return spark_df.toPandas()
1940
+ except Exception as e:
1941
+ logger.debug(f"Could not read table via Spark at {path}: {e}")
1942
+ return pd.DataFrame()
1943
+
1944
+ # Fallback to local reading for non-Spark environments
1945
+ return self._read_local_table(path)
1946
+
1947
+ def _read_local_table(self, path: str):
1948
+ """
1949
+ Helper to read local system tables (Delta or Parquet).
1950
+ Returns empty DataFrame on failure.
1951
+ """
1952
+ import pandas as pd
1953
+
1954
+ storage_opts = self._get_storage_options()
1955
+
1956
+ try:
1957
+ # Try Delta first if library available
1958
+ try:
1959
+ from deltalake import DeltaTable
1960
+
1961
+ dt = DeltaTable(path, storage_options=storage_opts if storage_opts else None)
1962
+ return dt.to_pandas()
1963
+ except ImportError:
1964
+ # Delta library not installed, proceed to parquet fallback
1965
+ pass
1966
+ except Exception:
1967
+ # Not a valid Delta table; fall back to reading Parquet below
1968
+ pass
1969
+
1970
+ # Fallback: Read as Parquet (directory or file)
1971
+ return pd.read_parquet(path, storage_options=storage_opts if storage_opts else None)
1972
+
1973
+ except Exception as e:
1974
+ # Log at debug level to avoid noise when the table doesn't exist or is empty yet
1975
+ logger.debug(f"Could not read local table at {path}: {e}")
1976
+ return pd.DataFrame()
1977
+
1978
+ def _hash_schema(self, schema: Dict[str, str]) -> str:
1979
+ """Generate MD5 hash of column definitions for change detection."""
1980
+ sorted_schema = json.dumps(schema, sort_keys=True)
1981
+ return hashlib.md5(sorted_schema.encode("utf-8")).hexdigest()
1982
+
1983
+ def _get_latest_schema(self, table_path: str) -> Optional[Dict[str, Any]]:
1984
+ """Get the most recent schema record for a table."""
1985
+ if self.spark:
1986
+ try:
1987
+ from pyspark.sql import functions as F
1988
+
1989
+ df = self.spark.read.format("delta").load(self.tables["meta_schemas"])
1990
+ row = (
1991
+ df.filter(F.col("table_path") == table_path)
1992
+ .orderBy(F.col("schema_version").desc())
1993
+ .first()
1994
+ )
1995
+ if row:
1996
+ return row.asDict()
1997
+ return None
1998
+ except Exception:
1999
+ return None
2000
+ elif self.engine:
2001
+ df = self._read_local_table(self.tables["meta_schemas"])
2002
+ if df.empty or "table_path" not in df.columns:
2003
+ return None
2004
+
2005
+ filtered = df[df["table_path"] == table_path]
2006
+ if filtered.empty:
2007
+ return None
2008
+
2009
+ if "schema_version" in filtered.columns:
2010
+ filtered = filtered.sort_values("schema_version", ascending=False)
2011
+ return filtered.iloc[0].to_dict()
2012
+
2013
+ return None
2014
+
2015
+ def track_schema(
2016
+ self,
2017
+ table_path: str,
2018
+ schema: Dict[str, str],
2019
+ pipeline: str,
2020
+ node: str,
2021
+ run_id: str,
2022
+ ) -> Dict[str, Any]:
2023
+ """
2024
+ Track schema version for a table.
2025
+
2026
+ Args:
2027
+ table_path: Full path to the table (e.g., "silver/customers")
2028
+ schema: Dictionary of column names to types
2029
+ pipeline: Pipeline name
2030
+ node: Node name
2031
+ run_id: Execution run ID
2032
+
2033
+ Returns:
2034
+ Dict with version info and detected changes:
2035
+ - changed: bool indicating if schema changed
2036
+ - version: current schema version number
2037
+ - previous_version: previous version (if exists)
2038
+ - columns_added: list of new columns
2039
+ - columns_removed: list of removed columns
2040
+ - columns_type_changed: list of columns with type changes
2041
+ """
2042
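+ # Illustrative usage (editor's sketch; the schema and identifiers are placeholders):
+ #
+ #     result = catalog.track_schema(
+ #         table_path="silver/customers",
+ #         schema={"id": "bigint", "name": "string"},
+ #         pipeline="customers",
+ #         node="clean_customers",
+ #         run_id="run-001",
+ #     )
+ #     if result["changed"]:
+ #         ...  # e.g. alert on result["columns_removed"]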
+ if not self.spark and not self.engine:
2043
+ return {"changed": False, "version": 0}
2044
+
2045
+ try:
2046
+ schema_hash = self._hash_schema(schema)
2047
+ previous = self._get_latest_schema(table_path)
2048
+
2049
+ if previous and previous.get("schema_hash") == schema_hash:
2050
+ return {"changed": False, "version": previous.get("schema_version", 1)}
2051
+
2052
+ changes: Dict[str, Any] = {
2053
+ "columns_added": [],
2054
+ "columns_removed": [],
2055
+ "columns_type_changed": [],
2056
+ }
2057
+
2058
+ if previous:
2059
+ prev_cols_str = previous.get("columns", "{}")
2060
+ prev_cols = json.loads(prev_cols_str) if isinstance(prev_cols_str, str) else {}
2061
+
2062
+ changes["columns_added"] = list(set(schema.keys()) - set(prev_cols.keys()))
2063
+ changes["columns_removed"] = list(set(prev_cols.keys()) - set(schema.keys()))
2064
+ changes["columns_type_changed"] = [
2065
+ col for col in schema if col in prev_cols and schema[col] != prev_cols[col]
2066
+ ]
2067
+ new_version = previous.get("schema_version", 0) + 1
2068
+ else:
2069
+ new_version = 1
2070
+
2071
+ record = {
2072
+ "table_path": table_path,
2073
+ "schema_version": new_version,
2074
+ "schema_hash": schema_hash,
2075
+ "columns": json.dumps(schema),
2076
+ "captured_at": datetime.now(timezone.utc),
2077
+ "pipeline": pipeline,
2078
+ "node": node,
2079
+ "run_id": run_id,
2080
+ "columns_added": (
2081
+ json.dumps(changes["columns_added"]) if changes["columns_added"] else None
2082
+ ),
2083
+ "columns_removed": (
2084
+ json.dumps(changes["columns_removed"]) if changes["columns_removed"] else None
2085
+ ),
2086
+ "columns_type_changed": (
2087
+ json.dumps(changes["columns_type_changed"])
2088
+ if changes["columns_type_changed"]
2089
+ else None
2090
+ ),
2091
+ }
2092
+
2093
+ if self.spark:
2094
+ df = self.spark.createDataFrame([record], schema=self._get_schema_meta_schemas())
2095
+ df.write.format("delta").mode("append").save(self.tables["meta_schemas"])
2096
+
2097
+ elif self.engine:
2098
+ import pandas as pd
2099
+
2100
+ df = pd.DataFrame([record])
2101
+ self.engine.write(
2102
+ df,
2103
+ connection=self.connection,
2104
+ format="delta",
2105
+ path=self.tables["meta_schemas"],
2106
+ mode="append",
2107
+ )
2108
+
2109
+ result = {
2110
+ "changed": True,
2111
+ "version": new_version,
2112
+ "previous_version": previous.get("schema_version") if previous else None,
2113
+ **changes,
2114
+ }
2115
+
2116
+ logger.info(
2117
+ f"Schema tracked for {table_path}: v{new_version} "
2118
+ f"(+{len(changes['columns_added'])}/-{len(changes['columns_removed'])}/"
2119
+ f"~{len(changes['columns_type_changed'])})"
2120
+ )
2121
+
2122
+ return result
2123
+
2124
+ except Exception as e:
2125
+ logger.warning(f"Failed to track schema for {table_path}: {e}")
2126
+ return {"changed": False, "version": 0, "error": str(e)}
2127
+
2128
+ def get_schema_history(
2129
+ self,
2130
+ table_path: str,
2131
+ limit: int = 10,
2132
+ ) -> List[Dict[str, Any]]:
2133
+ """
2134
+ Get schema version history for a table.
2135
+
2136
+ Args:
2137
+ table_path: Full path to the table (e.g., "silver/customers")
2138
+ limit: Maximum number of versions to return (default: 10)
2139
+
2140
+ Returns:
2141
+ List of schema version records, most recent first
2142
+ """
2143
+ if not self.spark and not self.engine:
2144
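+ # Illustrative usage (editor's sketch):
+ #
+ #     for version in catalog.get_schema_history("silver/customers", limit=5):
+ #         print(version["schema_version"], version["columns_added"])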
+ return []
2145
+
2146
+ try:
2147
+ if self.spark:
2148
+ from pyspark.sql import functions as F
2149
+
2150
+ df = self.spark.read.format("delta").load(self.tables["meta_schemas"])
2151
+ rows = (
2152
+ df.filter(F.col("table_path") == table_path)
2153
+ .orderBy(F.col("schema_version").desc())
2154
+ .limit(limit)
2155
+ .collect()
2156
+ )
2157
+ return [row.asDict() for row in rows]
2158
+
2159
+ elif self.engine:
2160
+ df = self._read_local_table(self.tables["meta_schemas"])
2161
+ if df.empty or "table_path" not in df.columns:
2162
+ return []
2163
+
2164
+ filtered = df[df["table_path"] == table_path]
2165
+ if filtered.empty:
2166
+ return []
2167
+
2168
+ if "schema_version" in filtered.columns:
2169
+ filtered = filtered.sort_values("schema_version", ascending=False)
2170
+
2171
+ return filtered.head(limit).to_dict("records")
2172
+
2173
+ except Exception as e:
2174
+ logger.warning(f"Failed to get schema history for {table_path}: {e}")
2175
+ return []
2176
+
2177
+ return []
2178
+
2179
+ def record_lineage(
2180
+ self,
2181
+ source_table: str,
2182
+ target_table: str,
2183
+ target_pipeline: str,
2184
+ target_node: str,
2185
+ run_id: str,
2186
+ source_pipeline: Optional[str] = None,
2187
+ source_node: Optional[str] = None,
2188
+ relationship: str = "feeds",
2189
+ ) -> None:
2190
+ """
2191
+ Record a lineage relationship between tables.
2192
+
2193
+ Args:
2194
+ source_table: Source table path
2195
+ target_table: Target table path
2196
+ target_pipeline: Pipeline name writing to target
2197
+ target_node: Node name writing to target
2198
+ run_id: Execution run ID
2199
+ source_pipeline: Source pipeline name (if known)
2200
+ source_node: Source node name (if known)
2201
+ relationship: Type of relationship ("feeds" or "derived_from")
2202
+ """
2203
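+ # Illustrative usage (editor's sketch; table paths and names are placeholders):
+ #
+ #     catalog.record_lineage(
+ #         source_table="silver/orders",
+ #         target_table="gold/orders",
+ #         target_pipeline="orders",
+ #         target_node="build_gold_orders",
+ #         run_id="run-001",
+ #     )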
+ if not self.spark and not self.engine:
2204
+ return
2205
+
2206
+ def _do_record():
2207
+ record = {
2208
+ "source_table": source_table,
2209
+ "target_table": target_table,
2210
+ "source_pipeline": source_pipeline,
2211
+ "source_node": source_node,
2212
+ "target_pipeline": target_pipeline,
2213
+ "target_node": target_node,
2214
+ "relationship": relationship,
2215
+ "last_observed": datetime.now(timezone.utc),
2216
+ "run_id": run_id,
2217
+ }
2218
+
2219
+ if self.spark:
2220
+ view_name = f"_odibi_lineage_upsert_{abs(hash(f'{source_table}_{target_table}'))}"
2221
+ df = self.spark.createDataFrame([record], schema=self._get_schema_meta_lineage())
2222
+ df.createOrReplaceTempView(view_name)
2223
+
2224
+ target_path = self.tables["meta_lineage"]
2225
+
2226
+ merge_sql = f"""
2227
+ MERGE INTO delta.`{target_path}` AS target
2228
+ USING {view_name} AS source
2229
+ ON target.source_table = source.source_table
2230
+ AND target.target_table = source.target_table
2231
+ WHEN MATCHED THEN UPDATE SET
2232
+ target.source_pipeline = source.source_pipeline,
2233
+ target.source_node = source.source_node,
2234
+ target.target_pipeline = source.target_pipeline,
2235
+ target.target_node = source.target_node,
2236
+ target.relationship = source.relationship,
2237
+ target.last_observed = source.last_observed,
2238
+ target.run_id = source.run_id
2239
+ WHEN NOT MATCHED THEN INSERT *
2240
+ """
2241
+ self.spark.sql(merge_sql)
2242
+ self.spark.catalog.dropTempView(view_name)
2243
+
2244
+ elif self.engine:
2245
+ import pandas as pd
2246
+
2247
+ df = pd.DataFrame([record])
2248
+ self.engine.write(
2249
+ df,
2250
+ connection=self.connection,
2251
+ format="delta",
2252
+ path=self.tables["meta_lineage"],
2253
+ mode="upsert",
2254
+ options={"keys": ["source_table", "target_table"]},
2255
+ )
2256
+
2257
+ logger.debug(f"Recorded lineage: {source_table} -> {target_table}")
2258
+
2259
+ try:
2260
+ self._retry_with_backoff(_do_record)
2261
+ except Exception as e:
2262
+ logger.warning(f"Failed to record lineage: {e}")
2263
+
2264
+ def record_lineage_batch(
2265
+ self,
2266
+ records: List[Dict[str, Any]],
2267
+ ) -> None:
2268
+ """
2269
+ Batch records multiple lineage relationships to meta_lineage in a single MERGE.
2270
+
2271
+ This is much more efficient than calling record_lineage() for each relationship
2272
+ individually, especially when running parallel pipelines with many nodes.
2273
+
2274
+ Args:
2275
+ records: List of dicts with keys: source_table, target_table, target_pipeline,
2276
+ target_node, run_id, source_pipeline (optional), source_node (optional),
2277
+ relationship (optional, defaults to "feeds")
2278
+ """
2279
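+ # Illustrative usage (editor's sketch; one dict per lineage edge, using the
+ # keys documented above):
+ #
+ #     catalog.record_lineage_batch([
+ #         {"source_table": "silver/orders", "target_table": "gold/orders",
+ #          "target_pipeline": "orders", "target_node": "build_gold_orders",
+ #          "run_id": "run-001"},
+ #     ])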
+ if not self.spark and not self.engine:
2280
+ return
2281
+
2282
+ if not records:
2283
+ return
2284
+
2285
+ def _do_batch_record():
2286
+ timestamp = datetime.now(timezone.utc)
2287
+
2288
+ if self.spark:
2289
+ rows = [
2290
+ (
2291
+ r["source_table"],
2292
+ r["target_table"],
2293
+ r.get("source_pipeline"),
2294
+ r.get("source_node"),
2295
+ r["target_pipeline"],
2296
+ r["target_node"],
2297
+ r.get("relationship", "feeds"),
2298
+ timestamp,
2299
+ r["run_id"],
2300
+ )
2301
+ for r in records
2302
+ ]
2303
+ schema = self._get_schema_meta_lineage()
2304
+ df = self.spark.createDataFrame(rows, schema)
2305
+
2306
+ view_name = "_odibi_meta_lineage_batch_upsert"
2307
+ df.createOrReplaceTempView(view_name)
2308
+
2309
+ target_path = self.tables["meta_lineage"]
2310
+
2311
+ merge_sql = f"""
2312
+ MERGE INTO delta.`{target_path}` AS target
2313
+ USING {view_name} AS source
2314
+ ON target.source_table = source.source_table
2315
+ AND target.target_table = source.target_table
2316
+ WHEN MATCHED THEN UPDATE SET
2317
+ target.source_pipeline = source.source_pipeline,
2318
+ target.source_node = source.source_node,
2319
+ target.target_pipeline = source.target_pipeline,
2320
+ target.target_node = source.target_node,
2321
+ target.relationship = source.relationship,
2322
+ target.last_observed = source.last_observed,
2323
+ target.run_id = source.run_id
2324
+ WHEN NOT MATCHED THEN INSERT *
2325
+ """
2326
+ self.spark.sql(merge_sql)
2327
+ self.spark.catalog.dropTempView(view_name)
2328
+
2329
+ elif self.engine:
2330
+ import pandas as pd
2331
+
2332
+ data = {
2333
+ "source_table": [r["source_table"] for r in records],
2334
+ "target_table": [r["target_table"] for r in records],
2335
+ "source_pipeline": [r.get("source_pipeline") for r in records],
2336
+ "source_node": [r.get("source_node") for r in records],
2337
+ "target_pipeline": [r["target_pipeline"] for r in records],
2338
+ "target_node": [r["target_node"] for r in records],
2339
+ "relationship": [r.get("relationship", "feeds") for r in records],
2340
+ "last_observed": [timestamp] * len(records),
2341
+ "run_id": [r["run_id"] for r in records],
2342
+ }
2343
+ df = pd.DataFrame(data)
2344
+
2345
+ self.engine.write(
2346
+ df,
2347
+ connection=self.connection,
2348
+ format="delta",
2349
+ path=self.tables["meta_lineage"],
2350
+ mode="upsert",
2351
+ options={"keys": ["source_table", "target_table"]},
2352
+ )
2353
+
2354
+ logger.debug(f"Batch recorded {len(records)} lineage relationship(s)")
2355
+
2356
+ try:
2357
+ self._retry_with_backoff(_do_batch_record)
2358
+ except Exception as e:
2359
+ logger.warning(f"Failed to batch record lineage: {e}")
2360
+
2361
+ def register_assets_batch(
2362
+ self,
2363
+ records: List[Dict[str, Any]],
2364
+ ) -> None:
2365
+ """
2366
+ Batch registers/upserts multiple physical assets to meta_tables in a single MERGE.
2367
+
2368
+ This is much more efficient than calling register_asset() for each asset
2369
+ individually, especially when running parallel pipelines with many nodes.
2370
+
2371
+ Args:
2372
+ records: List of dicts with keys: project_name, table_name, path, format,
2373
+ pattern_type, schema_hash (optional, defaults to "")
2374
+ """
2375
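+ # Illustrative usage (editor's sketch; one dict per asset, using the keys
+ # documented above):
+ #
+ #     catalog.register_assets_batch([
+ #         {"project_name": "demo", "table_name": "gold.orders",
+ #          "path": "/lake/gold/orders", "format": "delta", "pattern_type": "fact"},
+ #     ])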
+ if not self.spark and not self.engine:
2376
+ return
2377
+
2378
+ if not records:
2379
+ return
2380
+
2381
+ def _do_batch_register():
2382
+ timestamp = datetime.now(timezone.utc)
2383
+
2384
+ if self.spark:
2385
+ from pyspark.sql import functions as F
2386
+
2387
+ schema = self._get_schema_meta_tables()
2388
+ input_schema = StructType(schema.fields[:-1]) # Exclude updated_at
2389
+
2390
+ rows = [
2391
+ (
2392
+ r["project_name"],
2393
+ r["table_name"],
2394
+ r["path"],
2395
+ r["format"],
2396
+ r["pattern_type"],
2397
+ r.get("schema_hash", ""),
2398
+ )
2399
+ for r in records
2400
+ ]
2401
+ df = self.spark.createDataFrame(rows, input_schema)
2402
+ df = df.withColumn("updated_at", F.current_timestamp())
2403
+
2404
+ view_name = "_odibi_meta_tables_batch_upsert"
2405
+ df.createOrReplaceTempView(view_name)
2406
+
2407
+ target_path = self.tables["meta_tables"]
2408
+
2409
+ merge_sql = f"""
2410
+ MERGE INTO delta.`{target_path}` AS target
2411
+ USING {view_name} AS source
2412
+ ON target.project_name = source.project_name
2413
+ AND target.table_name = source.table_name
2414
+ WHEN MATCHED THEN UPDATE SET
2415
+ target.path = source.path,
2416
+ target.format = source.format,
2417
+ target.pattern_type = source.pattern_type,
2418
+ target.schema_hash = source.schema_hash,
2419
+ target.updated_at = source.updated_at
2420
+ WHEN NOT MATCHED THEN INSERT *
2421
+ """
2422
+ self.spark.sql(merge_sql)
2423
+ self.spark.catalog.dropTempView(view_name)
2424
+
2425
+ elif self.engine:
2426
+ import pandas as pd
2427
+
2428
+ data = {
2429
+ "project_name": [r["project_name"] for r in records],
2430
+ "table_name": [r["table_name"] for r in records],
2431
+ "path": [r["path"] for r in records],
2432
+ "format": [r["format"] for r in records],
2433
+ "pattern_type": [r["pattern_type"] for r in records],
2434
+ "schema_hash": [r.get("schema_hash", "") for r in records],
2435
+ "updated_at": [timestamp] * len(records),
2436
+ }
2437
+ df = pd.DataFrame(data)
2438
+
2439
+ self.engine.write(
2440
+ df,
2441
+ connection=self.connection,
2442
+ format="delta",
2443
+ path=self.tables["meta_tables"],
2444
+ mode="upsert",
2445
+ options={"keys": ["project_name", "table_name"]},
2446
+ )
2447
+
2448
+ logger.debug(f"Batch registered {len(records)} asset(s)")
2449
+
2450
+ try:
2451
+ self._retry_with_backoff(_do_batch_register)
2452
+ except Exception as e:
2453
+ logger.warning(f"Failed to batch register assets: {e}")
2454
+
2455
+ def get_upstream(
2456
+ self,
2457
+ table_path: str,
2458
+ depth: int = 3,
2459
+ ) -> List[Dict[str, Any]]:
2460
+ """
2461
+ Get all upstream sources for a table.
2462
+
2463
+ Args:
2464
+ table_path: Table to trace upstream from
2465
+ depth: Maximum depth to traverse
2466
+
2467
+ Returns:
2468
+ List of upstream lineage records with depth information
2469
+ """
2470
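+ # Illustrative usage (editor's sketch): walk the lineage graph two levels up
+ # from an assumed table path.
+ #
+ #     for edge in catalog.get_upstream("gold/orders", depth=2):
+ #         print(edge["depth"], edge["source_table"], "->", edge["target_table"])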
+ if not self.spark and not self.engine:
2471
+ return []
2472
+
2473
+ upstream = []
2474
+ visited = set()
2475
+ queue = [(table_path, 0)]
2476
+
2477
+ try:
2478
+ while queue:
2479
+ current, level = queue.pop(0)
2480
+ if current in visited or level > depth:
2481
+ continue
2482
+ visited.add(current)
2483
+
2484
+ if self.spark:
2485
+ from pyspark.sql import functions as F
2486
+
2487
+ df = self.spark.read.format("delta").load(self.tables["meta_lineage"])
2488
+ sources = df.filter(F.col("target_table") == current).collect()
2489
+ for row in sources:
2490
+ record = row.asDict()
2491
+ record["depth"] = level
2492
+ upstream.append(record)
2493
+ queue.append((record["source_table"], level + 1))
2494
+
2495
+ elif self.engine:
2496
+ df = self._read_local_table(self.tables["meta_lineage"])
2497
+ if df.empty or "target_table" not in df.columns:
2498
+ break
2499
+
2500
+ sources = df[df["target_table"] == current]
2501
+ for _, row in sources.iterrows():
2502
+ record = row.to_dict()
2503
+ record["depth"] = level
2504
+ upstream.append(record)
2505
+ queue.append((record["source_table"], level + 1))
2506
+
2507
+ except Exception as e:
2508
+ logger.warning(f"Failed to get upstream lineage for {table_path}: {e}")
2509
+
2510
+ return upstream
2511
+
2512
+ def get_downstream(
2513
+ self,
2514
+ table_path: str,
2515
+ depth: int = 3,
2516
+ ) -> List[Dict[str, Any]]:
2517
+ """
2518
+ Get all downstream consumers of a table.
2519
+
2520
+ Args:
2521
+ table_path: Table to trace downstream from
2522
+ depth: Maximum depth to traverse
2523
+
2524
+ Returns:
2525
+ List of downstream lineage records with depth information
2526
+ """
2527
+ if not self.spark and not self.engine:
2528
+ return []
2529
+
2530
+ downstream = []
2531
+ visited = set()
2532
+ queue = [(table_path, 0)]
2533
+
2534
+ try:
2535
+ while queue:
2536
+ current, level = queue.pop(0)
2537
+ if current in visited or level > depth:
2538
+ continue
2539
+ visited.add(current)
2540
+
2541
+ if self.spark:
2542
+ from pyspark.sql import functions as F
2543
+
2544
+ df = self.spark.read.format("delta").load(self.tables["meta_lineage"])
2545
+ targets = df.filter(F.col("source_table") == current).collect()
2546
+ for row in targets:
2547
+ record = row.asDict()
2548
+ record["depth"] = level
2549
+ downstream.append(record)
2550
+ queue.append((record["target_table"], level + 1))
2551
+
2552
+ elif self.engine:
2553
+ df = self._read_local_table(self.tables["meta_lineage"])
2554
+ if df.empty or "source_table" not in df.columns:
2555
+ break
2556
+
2557
+ targets = df[df["source_table"] == current]
2558
+ for _, row in targets.iterrows():
2559
+ record = row.to_dict()
2560
+ record["depth"] = level
2561
+ downstream.append(record)
2562
+ queue.append((record["target_table"], level + 1))
2563
+
2564
+ except Exception as e:
2565
+ logger.warning(f"Failed to get downstream lineage for {table_path}: {e}")
2566
+
2567
+ return downstream
2568
+
2569
+ def optimize(self) -> None:
2570
+ """
2571
+ Runs VACUUM and OPTIMIZE (Z-Order) on meta_runs.
2572
+ Spark-only feature.
2573
+ """
2574
+ if not self.spark:
2575
+ return
2576
+
2577
+ try:
2578
+ logger.info("Starting Catalog Optimization...")
2579
+
2580
+ # 1. meta_runs
2581
+ # VACUUM: remove data files older than 7 days. Delta's retention check
2582
+ # rejects retention periods shorter than 168 hours, so RETAIN 168 HOURS
2583
+ # (exactly 7 days) is the safe minimum without disabling the check.
2584
+ self.spark.sql(f"VACUUM delta.`{self.tables['meta_runs']}` RETAIN 168 HOURS")
2585
+
2586
+ # OPTIMIZE with ZORDER BY (timestamp): the table is partitioned by
2587
+ # 'pipeline_name' and 'date', so Z-ordering on timestamp speeds up
2588
+ # time-range queries within each partition.
2589
+ self.spark.sql(f"OPTIMIZE delta.`{self.tables['meta_runs']}` ZORDER BY (timestamp)")
2590
+
2591
+ logger.info("Catalog Optimization completed successfully.")
2592
+
2593
+ except Exception as e:
2594
+ logger.warning(f"Catalog Optimization failed: {e}")
2595
+
2596
+ # -------------------------------------------------------------------------
2597
+ # Phase 3.6: Metrics Logging
2598
+ # -------------------------------------------------------------------------
2599
+
2600
+ def log_metrics(
2601
+ self,
2602
+ metric_name: str,
2603
+ definition_sql: str,
2604
+ dimensions: List[str],
2605
+ source_table: str,
2606
+ ) -> None:
2607
+ """Log a business metric definition to meta_metrics.
2608
+
2609
+ Args:
2610
+ metric_name: Name of the metric
2611
+ definition_sql: SQL definition of the metric
2612
+ dimensions: List of dimension columns
2613
+ source_table: Source table for the metric
2614
+ """
2615
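+ # Illustrative usage (editor's sketch; the metric definition is made up):
+ #
+ #     catalog.log_metrics(
+ #         metric_name="total_revenue",
+ #         definition_sql="SUM(amount)",
+ #         dimensions=["region", "order_date"],
+ #         source_table="gold/orders",
+ #     )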
+ if not self.spark and not self.engine:
2616
+ return
2617
+
2618
+ def _do_log_metrics():
2619
+ import json
2620
+
2621
+ if self.spark:
2622
+ dimensions_json = json.dumps(dimensions)
2623
+ rows = [(metric_name, definition_sql, dimensions_json, source_table)]
2624
+ schema = self._get_schema_meta_metrics()
2625
+
2626
+ df = self.spark.createDataFrame(rows, schema)
2627
+ df.write.format("delta").mode("append").save(self.tables["meta_metrics"])
2628
+
2629
+ elif self.engine:
2630
+ import pandas as pd
2631
+
2632
+ data = {
2633
+ "metric_name": [metric_name],
2634
+ "definition_sql": [definition_sql],
2635
+ "dimensions": [json.dumps(dimensions)],
2636
+ "source_table": [source_table],
2637
+ }
2638
+ df = pd.DataFrame(data)
2639
+
2640
+ self.engine.write(
2641
+ df,
2642
+ connection=self.connection,
2643
+ format="delta",
2644
+ path=self.tables["meta_metrics"],
2645
+ mode="append",
2646
+ )
2647
+
2648
+ logger.debug(f"Logged metric: {metric_name}")
2649
+
2650
+ try:
2651
+ self._retry_with_backoff(_do_log_metrics)
2652
+ except Exception as e:
2653
+ logger.warning(f"Failed to log metric to system catalog: {e}")
2654
+
2655
+ # -------------------------------------------------------------------------
2656
+ # Phase 4: Cleanup/Removal Methods
2657
+ # -------------------------------------------------------------------------
2658
+
2659
+ def remove_pipeline(self, pipeline_name: str) -> int:
2660
+ """Remove pipeline and cascade to nodes, state entries.
2661
+
2662
+ Args:
2663
+ pipeline_name: Name of the pipeline to remove
2664
+
2665
+ Returns:
2666
+ Count of deleted entries
2667
+ """
2668
+ if not self.spark and not self.engine:
2669
+ return 0
2670
+
2671
+ deleted_count = 0
2672
+
2673
+ try:
2674
+ if self.spark:
2675
+ from pyspark.sql import functions as F
2676
+
2677
+ # Delete from meta_pipelines
2678
+ df = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
2679
+ df.cache()
2680
+ initial_count = df.count()
2681
+ df_filtered = df.filter(F.col("pipeline_name") != pipeline_name)
2682
+ df_filtered.write.format("delta").mode("overwrite").save(
2683
+ self.tables["meta_pipelines"]
2684
+ )
2685
+ deleted_count += initial_count - df_filtered.count()
2686
+ df.unpersist()
2687
+
2688
+ # Delete associated nodes from meta_nodes
2689
+ df_nodes = self.spark.read.format("delta").load(self.tables["meta_nodes"])
2690
+ df_nodes.cache()
2691
+ nodes_initial = df_nodes.count()
2692
+ df_nodes_filtered = df_nodes.filter(F.col("pipeline_name") != pipeline_name)
2693
+ df_nodes_filtered.write.format("delta").mode("overwrite").save(
2694
+ self.tables["meta_nodes"]
2695
+ )
2696
+ deleted_count += nodes_initial - df_nodes_filtered.count()
2697
+ df_nodes.unpersist()
2698
+
2699
+ elif self.engine:
2700
+ # Delete from meta_pipelines
2701
+ df = self._read_local_table(self.tables["meta_pipelines"])
2702
+ if not df.empty and "pipeline_name" in df.columns:
2703
+ initial_count = len(df)
2704
+ df = df[df["pipeline_name"] != pipeline_name]
2705
+ self.engine.write(
2706
+ df,
2707
+ connection=self.connection,
2708
+ format="delta",
2709
+ path=self.tables["meta_pipelines"],
2710
+ mode="overwrite",
2711
+ )
2712
+ deleted_count += initial_count - len(df)
2713
+
2714
+ # Delete associated nodes from meta_nodes
2715
+ df_nodes = self._read_local_table(self.tables["meta_nodes"])
2716
+ if not df_nodes.empty and "pipeline_name" in df_nodes.columns:
2717
+ nodes_initial = len(df_nodes)
2718
+ df_nodes = df_nodes[df_nodes["pipeline_name"] != pipeline_name]
2719
+ self.engine.write(
2720
+ df_nodes,
2721
+ connection=self.connection,
2722
+ format="delta",
2723
+ path=self.tables["meta_nodes"],
2724
+ mode="overwrite",
2725
+ )
2726
+ deleted_count += nodes_initial - len(df_nodes)
2727
+
2728
+ self.invalidate_cache()
2729
+ logger.info(f"Removed pipeline '{pipeline_name}': {deleted_count} entries deleted")
2730
+
2731
+ except Exception as e:
2732
+ logger.warning(f"Failed to remove pipeline: {e}")
2733
+
2734
+ return deleted_count
2735
+
2736
+ def remove_node(self, pipeline_name: str, node_name: str) -> int:
2737
+ """Remove node and associated state entries.
2738
+
2739
+ Args:
2740
+ pipeline_name: Pipeline name
2741
+ node_name: Node name to remove
2742
+
2743
+ Returns:
2744
+ Count of deleted entries
2745
+ """
2746
+ if not self.spark and not self.engine:
2747
+ return 0
2748
+
2749
+ deleted_count = 0
2750
+
2751
+ try:
2752
+ if self.spark:
2753
+ from pyspark.sql import functions as F
2754
+
2755
+ # Delete from meta_nodes
2756
+ df = self.spark.read.format("delta").load(self.tables["meta_nodes"])
2757
+ df.cache()
2758
+ initial_count = df.count()
2759
+ df_filtered = df.filter(
2760
+ ~((F.col("pipeline_name") == pipeline_name) & (F.col("node_name") == node_name))
2761
+ )
2762
+ df_filtered.write.format("delta").mode("overwrite").save(self.tables["meta_nodes"])
2763
+ deleted_count = initial_count - df_filtered.count()
2764
+ df.unpersist()
2765
+
2766
+ elif self.engine:
2767
+ df = self._read_local_table(self.tables["meta_nodes"])
2768
+ if not df.empty and "pipeline_name" in df.columns and "node_name" in df.columns:
2769
+ initial_count = len(df)
2770
+ df = df[
2771
+ ~((df["pipeline_name"] == pipeline_name) & (df["node_name"] == node_name))
2772
+ ]
2773
+ self.engine.write(
2774
+ df,
2775
+ connection=self.connection,
2776
+ format="delta",
2777
+ path=self.tables["meta_nodes"],
2778
+ mode="overwrite",
2779
+ )
2780
+ deleted_count = initial_count - len(df)
2781
+
2782
+ self._nodes_cache = None
2783
+ logger.info(
2784
+ f"Removed node '{node_name}' from pipeline '{pipeline_name}': "
2785
+ f"{deleted_count} entries deleted"
2786
+ )
2787
+
2788
+ except Exception as e:
2789
+ logger.warning(f"Failed to remove node: {e}")
2790
+
2791
+ return deleted_count
2792
+
2793
+ def cleanup_orphans(self, current_config: Any) -> Dict[str, int]:
2794
+ """Compare catalog against current config, remove stale entries.
2795
+
2796
+ Args:
2797
+ current_config: ProjectConfig with current pipeline definitions
2798
+
2799
+ Returns:
2800
+ Dict of {table: deleted_count}
2801
+ """
2802
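+ # Illustrative usage (editor's sketch; `project_config` is an assumed
+ # ProjectConfig loaded elsewhere):
+ #
+ #     removed = catalog.cleanup_orphans(project_config)
+ #     print(removed)  # e.g. {"meta_pipelines": 1, "meta_nodes": 4}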
+ if not self.spark and not self.engine:
2803
+ return {}
2804
+
2805
+ results = {"meta_pipelines": 0, "meta_nodes": 0}
2806
+
2807
+ try:
2808
+ # Get current pipeline and node names from config
2809
+ current_pipelines = set()
2810
+ current_nodes = {} # {pipeline_name: set(node_names)}
2811
+
2812
+ for pipeline in current_config.pipelines:
2813
+ current_pipelines.add(pipeline.pipeline)
2814
+ current_nodes[pipeline.pipeline] = {node.name for node in pipeline.nodes}
2815
+
2816
+ if self.spark:
2817
+ from pyspark.sql import functions as F
2818
+
2819
+ # Cleanup orphan pipelines
2820
+ df_pipelines = self.spark.read.format("delta").load(self.tables["meta_pipelines"])
2821
+ df_pipelines.cache()
2822
+ initial_pipelines = df_pipelines.count()
2823
+ df_pipelines_filtered = df_pipelines.filter(
2824
+ F.col("pipeline_name").isin(list(current_pipelines))
2825
+ )
2826
+ df_pipelines_filtered.write.format("delta").mode("overwrite").save(
2827
+ self.tables["meta_pipelines"]
2828
+ )
2829
+ results["meta_pipelines"] = initial_pipelines - df_pipelines_filtered.count()
2830
+ df_pipelines.unpersist()
2831
+
2832
+ # Cleanup orphan nodes
2833
+ df_nodes = self.spark.read.format("delta").load(self.tables["meta_nodes"])
2834
+ df_nodes.cache()
2835
+ initial_nodes = df_nodes.count()
2836
+
2837
+ # Filter: keep only nodes that belong to current pipelines and exist in config
2838
+ valid_nodes = []
2839
+ for p_name, nodes in current_nodes.items():
2840
+ for n_name in nodes:
2841
+ valid_nodes.append((p_name, n_name))
2842
+
2843
+ if valid_nodes:
2844
+ valid_df = self.spark.createDataFrame(
2845
+ valid_nodes, ["pipeline_name", "node_name"]
2846
+ )
2847
+ df_nodes_filtered = df_nodes.join(
2848
+ valid_df, ["pipeline_name", "node_name"], "inner"
2849
+ )
2850
+ else:
2851
+ df_nodes_filtered = df_nodes.limit(0)
2852
+
2853
+ df_nodes_filtered.write.format("delta").mode("overwrite").save(
2854
+ self.tables["meta_nodes"]
2855
+ )
2856
+ results["meta_nodes"] = initial_nodes - df_nodes_filtered.count()
2857
+ df_nodes.unpersist()
2858
+
2859
+ elif self.engine:
2860
+ # Cleanup orphan pipelines
2861
+ df_pipelines = self._read_local_table(self.tables["meta_pipelines"])
2862
+ if not df_pipelines.empty and "pipeline_name" in df_pipelines.columns:
2863
+ initial_pipelines = len(df_pipelines)
2864
+ df_pipelines = df_pipelines[
2865
+ df_pipelines["pipeline_name"].isin(current_pipelines)
2866
+ ]
2867
+ self.engine.write(
2868
+ df_pipelines,
2869
+ connection=self.connection,
2870
+ format="delta",
2871
+ path=self.tables["meta_pipelines"],
2872
+ mode="overwrite",
2873
+ )
2874
+ results["meta_pipelines"] = initial_pipelines - len(df_pipelines)
2875
+
2876
+ # Cleanup orphan nodes
2877
+ df_nodes = self._read_local_table(self.tables["meta_nodes"])
2878
+ if not df_nodes.empty and "pipeline_name" in df_nodes.columns:
2879
+ initial_nodes = len(df_nodes)
2880
+
2881
+ valid_node_tuples = set()
2882
+ for p_name, nodes in current_nodes.items():
2883
+ for n_name in nodes:
2884
+ valid_node_tuples.add((p_name, n_name))
2885
+
2886
+ df_nodes["_valid"] = df_nodes.apply(
2887
+ lambda row: (row["pipeline_name"], row["node_name"]) in valid_node_tuples,
2888
+ axis=1,
2889
+ )
2890
+ df_nodes = df_nodes[df_nodes["_valid"]].drop(columns=["_valid"])
2891
+
2892
+ self.engine.write(
2893
+ df_nodes,
2894
+ connection=self.connection,
2895
+ format="delta",
2896
+ path=self.tables["meta_nodes"],
2897
+ mode="overwrite",
2898
+ )
2899
+ results["meta_nodes"] = initial_nodes - len(df_nodes)
2900
+
2901
+ self.invalidate_cache()
2902
+ logger.info(
2903
+ f"Cleanup orphans completed: {results['meta_pipelines']} pipelines, "
2904
+ f"{results['meta_nodes']} nodes removed"
2905
+ )
2906
+
2907
+ except Exception as e:
2908
+ logger.warning(f"Failed to cleanup orphans: {e}")
2909
+
2910
+ return results
2911
+
2912
+ def clear_state_key(self, key: str) -> bool:
2913
+ """Remove a single state entry by key.
2914
+
2915
+ Args:
2916
+ key: State key to remove
2917
+
2918
+ Returns:
2919
+ True if deleted, False otherwise
2920
+ """
2921
+ if not self.spark and not self.engine:
2922
+ return False
2923
+
2924
+ try:
2925
+ if self.spark:
2926
+ from pyspark.sql import functions as F
2927
+
2928
+ df = self.spark.read.format("delta").load(self.tables["meta_state"])
2929
+ initial_count = df.count()
2930
+ df = df.filter(F.col("key") != key)
2931
+ df.write.format("delta").mode("overwrite").save(self.tables["meta_state"])
2932
+ return df.count() < initial_count
2933
+
2934
+ elif self.engine:
2935
+ df = self._read_local_table(self.tables["meta_state"])
2936
+ if df.empty or "key" not in df.columns:
2937
+ return False
2938
+
2939
+ initial_count = len(df)
2940
+ df = df[df["key"] != key]
2941
+
2942
+ if len(df) < initial_count:
2943
+ self.engine.write(
2944
+ df,
2945
+ connection=self.connection,
2946
+ format="delta",
2947
+ path=self.tables["meta_state"],
2948
+ mode="overwrite",
2949
+ )
2950
+ return True
2951
+
2952
+ return False
2953
+
2954
+ except Exception as e:
2955
+ logger.warning(f"Failed to clear state key '{key}': {e}")
2956
+ return False
2957
+
2958
+ def clear_state_pattern(self, key_pattern: str) -> int:
2959
+ """Remove state entries matching pattern (supports wildcards).
2960
+
2961
+ Args:
2962
+ key_pattern: Pattern with optional * wildcards
2963
+
2964
+ Returns:
2965
+ Count of deleted entries
2966
+ """
2967
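+ # Illustrative usage (editor's sketch; '*' matches any run of characters, and
+ # the key prefix shown is assumed):
+ #
+ #     deleted = catalog.clear_state_pattern("orders/*")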
+ if not self.spark and not self.engine:
2968
+ return 0
2969
+
2970
+ try:
2971
+ if self.spark:
2972
+ from pyspark.sql import functions as F
2973
+
2974
+ df = self.spark.read.format("delta").load(self.tables["meta_state"])
2975
+ initial_count = df.count()
2976
+
2977
+ # Convert wildcard pattern to SQL LIKE pattern
2978
+ like_pattern = key_pattern.replace("*", "%")
2979
+ df = df.filter(~F.col("key").like(like_pattern))
2980
+ df.write.format("delta").mode("overwrite").save(self.tables["meta_state"])
2981
+
2982
+ return initial_count - df.count()
2983
+
2984
+ elif self.engine:
2985
+ import re
2986
+
2987
+ df = self._read_local_table(self.tables["meta_state"])
2988
+ if df.empty or "key" not in df.columns:
2989
+ return 0
2990
+
2991
+ initial_count = len(df)
2992
+
2993
+ # Convert wildcard pattern to regex, escaping any other regex metacharacters
2994
+ regex_pattern = "^" + re.escape(key_pattern).replace("\\*", ".*") + "$"
2995
+ pattern = re.compile(regex_pattern)
2996
+ df = df[~df["key"].apply(lambda x: bool(pattern.match(str(x))))]
2997
+
2998
+ if len(df) < initial_count:
2999
+ self.engine.write(
3000
+ df,
3001
+ connection=self.connection,
3002
+ format="delta",
3003
+ path=self.tables["meta_state"],
3004
+ mode="overwrite",
3005
+ )
3006
+
3007
+ return initial_count - len(df)
3008
+
3009
+ except Exception as e:
3010
+ logger.warning(f"Failed to clear state pattern '{key_pattern}': {e}")
3011
+ return 0