odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/state/__init__.py
@@ -0,0 +1,1203 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import time
6
+ from abc import ABC, abstractmethod
7
+ from datetime import datetime, timezone
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def _retry_delta_operation(func, max_retries: int = 5, base_delay: float = 1.0):
14
+ """Retry a Delta operation with exponential backoff on concurrency conflicts.
15
+
16
+ Retries are logged at debug level only; the exception is re-raised once all retries are exhausted.
17
+
18
+ Args:
19
+ func: Callable to execute.
20
+ max_retries: Maximum retry attempts (default 5 for high concurrency).
21
+ base_delay: Base delay in seconds (doubles each retry).
22
+ """
23
+ for attempt in range(max_retries + 1):
24
+ try:
25
+ return func()
26
+ except Exception as e:
27
+ error_str = str(e)
28
+ is_concurrent = any(
29
+ msg in error_str
30
+ for msg in [
31
+ "ConcurrentAppendException",
32
+ "ConcurrentDeleteReadException",
33
+ "ConcurrentDeleteDeleteException",
34
+ "DELTA_CONCURRENT",
35
+ "concurrent",
36
+ "conflict",
37
+ ]
38
+ )
39
+ if not is_concurrent or attempt >= max_retries:
40
+ raise
41
+ # Exponential backoff with jitter (1s, 2s, 4s, 8s, 16s = ~31s total)
42
+ delay = base_delay * (2**attempt) + random.uniform(0, 1.0)
43
+ logger.debug(
44
+ f"Delta concurrent write (attempt {attempt + 1}/{max_retries + 1}), "
45
+ f"retrying in {delay:.2f}s..."
46
+ )
47
+ time.sleep(delay)
48
+
49
+
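A minimal sketch of the retry contract. The helper is module-private and normally only called internally; `flaky_write` is a hypothetical callable that simulates two transient concurrency conflicts before succeeding.

```python
from odibi.state import _retry_delta_operation

attempts = {"n": 0}

def flaky_write():
    # Hypothetical callable: fails twice with a concurrency-style error, then succeeds.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise RuntimeError("ConcurrentAppendException: conflicting commit detected")
    return "committed"

# Retries on messages matching the concurrency markers, with exponential backoff plus jitter.
result = _retry_delta_operation(flaky_write, max_retries=5, base_delay=0.1)
assert result == "committed"
```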
50
+ # Suppress noisy delta-rs transaction conflict warnings (handled by retry)
51
+ # Must be set before deltalake is imported
52
+ if "RUST_LOG" not in os.environ:
53
+ os.environ["RUST_LOG"] = "deltalake_core::kernel::transaction=error"
54
+
55
+ # Try to import deltalake, but don't fail yet (it might be a Spark run)
56
+ try:
57
+ import pandas as pd
58
+ import pyarrow as pa
59
+ from deltalake import DeltaTable, write_deltalake
60
+ except ImportError:
61
+ DeltaTable = None
62
+ write_deltalake = None
63
+ pd = None
64
+ pa = None
65
+
66
+
67
+ class StateBackend(ABC):
68
+ @abstractmethod
69
+ def load_state(self) -> Dict[str, Any]:
70
+ """Return state in the current in-memory format, e.g. {'pipelines': {...}}."""
71
+ ...
72
+
73
+ @abstractmethod
74
+ def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
75
+ """Persist the given pipeline_data into backend."""
76
+ ...
77
+
78
+ @abstractmethod
79
+ def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
80
+ """Get status and metadata of a node from last run."""
81
+ ...
82
+
83
+ @abstractmethod
84
+ def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
85
+ """Get success status of a node from last run."""
86
+ ...
87
+
88
+ @abstractmethod
89
+ def get_hwm(self, key: str) -> Any:
90
+ """Get High-Water Mark value for a key."""
91
+ ...
92
+
93
+ @abstractmethod
94
+ def set_hwm(self, key: str, value: Any) -> None:
95
+ """Set High-Water Mark value for a key."""
96
+ ...
97
+
98
+ def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
99
+ """Set multiple High-Water Mark values in a single operation.
100
+
101
+ Default implementation calls set_hwm() for each update.
102
+ Subclasses should override for efficient batch writes.
103
+
104
+ Args:
105
+ updates: List of dicts with keys: key, value
106
+ """
107
+ for update in updates:
108
+ self.set_hwm(update["key"], update["value"])
109
+
110
+
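To illustrate the contract the ABC defines, here is a hypothetical in-memory backend that implements the abstract methods; it is a sketch only (no persistence, inherits the default `set_hwm_batch`).

```python
from typing import Any, Dict, Optional
from odibi.state import StateBackend

class InMemoryStateBackend(StateBackend):
    """Illustrative in-memory backend; state is lost when the process exits."""

    def __init__(self) -> None:
        self._state: Dict[str, Any] = {"pipelines": {}, "hwm": {}}

    def load_state(self) -> Dict[str, Any]:
        return self._state

    def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
        self._state["pipelines"][pipeline_name] = pipeline_data

    def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
        nodes = self._state["pipelines"].get(pipeline_name, {}).get("nodes", {})
        return nodes.get(node_name)

    def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
        info = self.get_last_run_info(pipeline_name, node_name)
        return info.get("success") if info else None

    def get_hwm(self, key: str) -> Any:
        return self._state["hwm"].get(key)

    def set_hwm(self, key: str, value: Any) -> None:
        self._state["hwm"][key] = value
```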
111
+ class LocalJSONStateBackend(StateBackend):
112
+ """
113
+ Local JSON-based State Backend.
114
+ Used for local development or when System Catalog is not configured.
115
+ """
116
+
117
+ def __init__(self, state_path: str):
118
+ self.state_path = state_path
119
+ self.state = self._load_from_disk()
120
+
121
+ def _load_from_disk(self) -> Dict[str, Any]:
122
+ if os.path.exists(self.state_path):
123
+ try:
124
+ with open(self.state_path, "r") as f:
125
+ return json.load(f)
126
+ except Exception as e:
127
+ logger.warning(f"Failed to load state from {self.state_path}: {e}")
128
+ return {"pipelines": {}, "hwm": {}}
129
+
130
+ def _save_to_disk(self) -> None:
131
+ os.makedirs(os.path.dirname(self.state_path), exist_ok=True)
132
+ with open(self.state_path, "w") as f:
133
+ json.dump(self.state, f, indent=2, default=str)
134
+
135
+ def load_state(self) -> Dict[str, Any]:
136
+ return self.state
137
+
138
+ def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
139
+ if "pipelines" not in self.state:
140
+ self.state["pipelines"] = {}
141
+ self.state["pipelines"][pipeline_name] = pipeline_data
142
+ self._save_to_disk()
143
+
144
+ def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
145
+ pipe = self.state.get("pipelines", {}).get(pipeline_name, {})
146
+ nodes = pipe.get("nodes", {})
147
+ return nodes.get(node_name)
148
+
149
+ def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
150
+ info = self.get_last_run_info(pipeline_name, node_name)
151
+ if info:
152
+ return info.get("success")
153
+ return None
154
+
155
+ def get_hwm(self, key: str) -> Any:
156
+ return self.state.get("hwm", {}).get(key)
157
+
158
+ def set_hwm(self, key: str, value: Any) -> None:
159
+ if "hwm" not in self.state:
160
+ self.state["hwm"] = {}
161
+ self.state["hwm"][key] = value
162
+ self._save_to_disk()
163
+
164
+
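A short usage sketch for the local JSON backend, assuming a `.odibi/state.json` path and illustrative key and pipeline names.

```python
from odibi.state import LocalJSONStateBackend

backend = LocalJSONStateBackend(".odibi/state.json")

# High-water marks are persisted to the JSON file on every set.
backend.set_hwm("sales.orders.last_loaded_at", "2024-01-01T00:00:00Z")
print(backend.get_hwm("sales.orders.last_loaded_at"))

backend.save_pipeline_run(
    "daily_sales",
    {"last_run": "2024-01-01T01:00:00Z", "nodes": {"load_orders": {"success": True}}},
)
print(backend.get_last_run_status("daily_sales", "load_orders"))  # True
```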
165
+ class CatalogStateBackend(StateBackend):
166
+ """
167
+ Unified State Backend using Delta Tables (System Catalog).
168
+ Supports both Spark and Local (via deltalake) execution.
169
+ """
170
+
171
+ def __init__(
172
+ self,
173
+ meta_runs_path: str,
174
+ meta_state_path: str,
175
+ spark_session: Any = None,
176
+ storage_options: Optional[Dict[str, str]] = None,
177
+ environment: Optional[str] = None,
178
+ ):
179
+ self.meta_runs_path = meta_runs_path
180
+ self.meta_state_path = meta_state_path
181
+ self.spark = spark_session
182
+ self.storage_options = storage_options or {}
183
+ self.environment = environment
184
+
185
+ def load_state(self) -> Dict[str, Any]:
186
+ """
187
+ Load state. The Catalog backend returns an empty structure and relies
+ on direct queries (get_last_run_info, get_hwm) for specific lookups.
189
+ """
190
+ return {"pipelines": {}}
191
+
192
+ def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
193
+ # CatalogManager already logs runs (meta_runs) during execution.
194
+ # We do not need to duplicate this here, avoiding schema conflicts.
195
+ pass
196
+
197
+ def _save_runs_spark(self, rows):
198
+ pass
199
+
200
+ def _save_runs_local(self, rows):
201
+ pass
202
+
203
+ def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
204
+ if self.spark:
205
+ return self._get_last_run_spark(pipeline_name, node_name)
206
+ return self._get_last_run_local(pipeline_name, node_name)
207
+
208
+ def _get_last_run_spark(self, pipeline_name, node_name):
209
+ from pyspark.sql import functions as F
210
+
211
+ try:
212
+ df = self.spark.read.format("delta").load(self.meta_runs_path)
213
+ row = (
214
+ df.filter(
215
+ (F.col("pipeline_name") == pipeline_name) & (F.col("node_name") == node_name)
216
+ )
217
+ .select("status", "metadata")
218
+ .orderBy(F.col("timestamp").desc())
219
+ .first()
220
+ )
221
+ if row:
222
+ meta = {}
223
+ if row.metadata:
224
+ try:
225
+ meta = json.loads(row.metadata)
226
+ except Exception as e:
227
+ logger.debug(f"Failed to parse metadata JSON: {e}")
228
+ return {"success": (row.status == "SUCCESS"), "metadata": meta}
229
+ except Exception as e:
230
+ logger.warning(
231
+ f"Failed to get last run info from {self.meta_runs_path} "
232
+ f"for {pipeline_name}/{node_name}: {e}"
233
+ )
234
+ return None
235
+
236
+ def _get_last_run_local(self, pipeline_name, node_name):
237
+ if not DeltaTable:
238
+ return None
239
+
240
+ try:
241
+ dt = DeltaTable(self.meta_runs_path, storage_options=self.storage_options)
242
+ ds = dt.to_pyarrow_dataset()
243
+ import pyarrow.compute as pc
244
+
245
+ filter_expr = (pc.field("pipeline_name") == pipeline_name) & (
246
+ pc.field("node_name") == node_name
247
+ )
248
+ # Scan with filter
249
+ table = ds.to_table(filter=filter_expr)
250
+
251
+ if table.num_rows == 0:
252
+ return None
253
+
254
+ # Sort by timestamp (descending) to get the latest run; the per-node
+ # history is small, so converting to pandas for the sort is fine.
257
+
258
+ df = table.to_pandas()
259
+ if "timestamp" in df.columns:
260
+ df = df.sort_values("timestamp", ascending=False)
261
+
262
+ row = df.iloc[0]
263
+
264
+ meta = {}
265
+ if row.get("metadata"):
266
+ try:
267
+ meta = json.loads(row["metadata"])
268
+ except Exception as e:
269
+ logger.debug(f"Failed to parse metadata JSON: {e}")
270
+
271
+ status = row.get("status")
272
+ return {"success": (status == "SUCCESS"), "metadata": meta}
273
+
274
+ except Exception as e:
275
+ logger.warning(
276
+ f"Failed to get last run info from {self.meta_runs_path} "
277
+ f"for {pipeline_name}/{node_name}: {e}"
278
+ )
279
+ return None
280
+
281
+ def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
282
+ info = self.get_last_run_info(pipeline_name, node_name)
283
+ if info:
284
+ return info.get("success")
285
+ return None
286
+
287
+ def get_hwm(self, key: str) -> Any:
288
+ if self.spark:
289
+ return self._get_hwm_spark(key)
290
+ return self._get_hwm_local(key)
291
+
292
+ def _get_hwm_spark(self, key):
293
+ from pyspark.sql import functions as F
294
+
295
+ try:
296
+ df = self.spark.read.format("delta").load(self.meta_state_path)
297
+ row = df.filter(F.col("key") == key).select("value").first()
298
+ if row and row.value:
299
+ try:
300
+ return json.loads(row.value)
301
+ except Exception as e:
302
+ logger.debug(f"Failed to parse HWM value as JSON for key '{key}': {e}")
303
+ return row.value
304
+ except Exception as e:
305
+ error_str = str(e)
306
+ if "PATH_NOT_FOUND" in error_str or "does not exist" in error_str.lower():
307
+ logger.debug(
308
+ f"HWM state table does not exist yet at {self.meta_state_path}. "
309
+ "It will be created on first write."
310
+ )
311
+ else:
312
+ logger.warning(
313
+ f"Failed to get HWM for key '{key}' from {self.meta_state_path}: {e}"
314
+ )
315
+ return None
316
+
317
+ def _get_hwm_local(self, key):
318
+ if not DeltaTable:
319
+ return None
320
+ try:
321
+ dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
322
+ ds = dt.to_pyarrow_dataset()
323
+ import pyarrow.compute as pc
324
+
325
+ filter_expr = pc.field("key") == key
326
+ table = ds.to_table(filter=filter_expr)
327
+
328
+ if table.num_rows == 0:
329
+ return None
330
+
331
+ val_str = table.column("value")[0].as_py()
332
+ if val_str:
333
+ try:
334
+ return json.loads(val_str)
335
+ except Exception as e:
336
+ logger.debug(f"Failed to parse HWM value as JSON for key '{key}': {e}")
337
+ return val_str
338
+ except Exception as e:
339
+ logger.warning(f"Failed to get HWM for key '{key}' from {self.meta_state_path}: {e}")
340
+ return None
341
+
342
+ def set_hwm(self, key: str, value: Any) -> None:
343
+ val_str = json.dumps(value, default=str)
344
+ row = {
345
+ "key": key,
346
+ "value": val_str,
347
+ "environment": self.environment,
348
+ "updated_at": datetime.now(timezone.utc),
349
+ }
350
+
351
+ def _do_set():
352
+ if self.spark:
353
+ self._set_hwm_spark(row)
354
+ else:
355
+ self._set_hwm_local(row)
356
+
357
+ _retry_delta_operation(_do_set)
358
+
359
+ def _set_hwm_spark(self, row):
360
+ from pyspark.sql.types import StringType, StructField, StructType, TimestampType
361
+
362
+ schema = StructType(
363
+ [
364
+ StructField("key", StringType(), False),
365
+ StructField("value", StringType(), True),
366
+ StructField("environment", StringType(), True),
367
+ StructField("updated_at", TimestampType(), True),
368
+ ]
369
+ )
370
+
371
+ updates_df = self.spark.createDataFrame([row], schema)
372
+
373
+ if not self._spark_table_exists(self.meta_state_path):
374
+ updates_df.write.format("delta").mode("overwrite").save(self.meta_state_path)
375
+ return
376
+
377
+ view_name = f"_odibi_hwm_updates_{abs(hash(row['key']))}"
378
+ updates_df.createOrReplaceTempView(view_name)
379
+
380
+ merge_sql = f"""
381
+ MERGE INTO delta.`{self.meta_state_path}` AS t
382
+ USING {view_name} AS s
383
+ ON t.key = s.key
384
+ WHEN MATCHED THEN UPDATE SET
385
+ t.value = s.value,
386
+ t.environment = s.environment,
387
+ t.updated_at = s.updated_at
388
+ WHEN NOT MATCHED THEN INSERT *
389
+ """
390
+ self.spark.sql(merge_sql)
391
+ self.spark.catalog.dropTempView(view_name)
392
+
393
+ def _set_hwm_local(self, row):
394
+ if not DeltaTable:
395
+ raise ImportError("deltalake library is required for local state backend.")
396
+
397
+ df = pd.DataFrame([row])
398
+ df["updated_at"] = pd.to_datetime(df["updated_at"])
399
+
400
+ try:
401
+ dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
402
+ (
403
+ dt.merge(
404
+ source=df,
405
+ predicate="target.key = source.key",
406
+ source_alias="source",
407
+ target_alias="target",
408
+ )
409
+ .when_matched_update_all()
410
+ .when_not_matched_insert_all()
411
+ .execute()
412
+ )
413
+ except Exception:
+ # Table doesn't exist yet or merge failed - fall back to append
414
+ write_deltalake(
415
+ self.meta_state_path,
416
+ df,
417
+ mode="append",
418
+ storage_options=self.storage_options,
419
+ schema_mode="merge",
420
+ )
421
+
422
+ def _spark_table_exists(self, path: str) -> bool:
423
+ try:
424
+ return self.spark.read.format("delta").load(path).count() >= 0
425
+ except Exception as e:
426
+ logger.debug(f"Table does not exist at {path}: {e}")
427
+ return False
428
+
429
+ def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
430
+ """Set multiple High-Water Mark values in a single MERGE operation.
431
+
432
+ This is much more efficient than calling set_hwm() for each update
433
+ individually, especially when running parallel pipelines with many nodes.
434
+
435
+ Args:
436
+ updates: List of dicts with keys: key, value
437
+ """
438
+ if not updates:
439
+ return
440
+
441
+ timestamp = datetime.now(timezone.utc)
442
+ rows = [
443
+ {
444
+ "key": u["key"],
445
+ "value": json.dumps(u["value"], default=str),
446
+ "environment": self.environment,
447
+ "updated_at": timestamp,
448
+ }
449
+ for u in updates
450
+ ]
451
+
452
+ def _do_batch_set():
453
+ if self.spark:
454
+ self._set_hwm_batch_spark(rows)
455
+ else:
456
+ self._set_hwm_batch_local(rows)
457
+
458
+ _retry_delta_operation(_do_batch_set)
459
+
460
+ def _set_hwm_batch_spark(self, rows: List[Dict[str, Any]]) -> None:
461
+ from pyspark.sql.types import StringType, StructField, StructType, TimestampType
462
+
463
+ schema = StructType(
464
+ [
465
+ StructField("key", StringType(), False),
466
+ StructField("value", StringType(), True),
467
+ StructField("environment", StringType(), True),
468
+ StructField("updated_at", TimestampType(), True),
469
+ ]
470
+ )
471
+
472
+ updates_df = self.spark.createDataFrame(rows, schema)
473
+
474
+ if not self._spark_table_exists(self.meta_state_path):
475
+ updates_df.write.format("delta").mode("overwrite").save(self.meta_state_path)
476
+ return
477
+
478
+ view_name = "_odibi_hwm_batch_updates"
479
+ updates_df.createOrReplaceTempView(view_name)
480
+
481
+ merge_sql = f"""
482
+ MERGE INTO delta.`{self.meta_state_path}` AS t
483
+ USING {view_name} AS s
484
+ ON t.key = s.key
485
+ WHEN MATCHED THEN UPDATE SET
486
+ t.value = s.value,
487
+ t.environment = s.environment,
488
+ t.updated_at = s.updated_at
489
+ WHEN NOT MATCHED THEN INSERT *
490
+ """
491
+ self.spark.sql(merge_sql)
492
+ self.spark.catalog.dropTempView(view_name)
493
+ logger.debug(f"Batch set {len(rows)} HWM value(s) via Spark")
494
+
495
+ def _set_hwm_batch_local(self, rows: List[Dict[str, Any]]) -> None:
496
+ if not DeltaTable:
497
+ raise ImportError("deltalake library is required for local state backend.")
498
+
499
+ df = pd.DataFrame(rows)
500
+ df["updated_at"] = pd.to_datetime(df["updated_at"])
501
+
502
+ try:
503
+ dt = DeltaTable(self.meta_state_path, storage_options=self.storage_options)
504
+ (
505
+ dt.merge(
506
+ source=df,
507
+ predicate="target.key = source.key",
508
+ source_alias="source",
509
+ target_alias="target",
510
+ )
511
+ .when_matched_update_all()
512
+ .when_not_matched_insert_all()
513
+ .execute()
514
+ )
515
+ except Exception:
516
+ # Table doesn't exist or merge failed - create/overwrite the state table
517
+ write_deltalake(
518
+ self.meta_state_path,
519
+ df,
520
+ mode="overwrite",
521
+ storage_options=self.storage_options,
522
+ )
523
+ logger.debug(f"Batch set {len(rows)} HWM value(s) locally")
524
+
525
+
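A hedged sketch of local (non-Spark) usage of the catalog backend. It assumes the deltalake and pandas dependencies are installed; the paths and keys are illustrative.

```python
from odibi.state import CatalogStateBackend

backend = CatalogStateBackend(
    meta_runs_path="./_odibi_system/meta_runs",
    meta_state_path="./_odibi_system/meta_state",
    spark_session=None,   # no Spark: uses the deltalake/pyarrow code paths
    storage_options={},   # e.g. account_name/account_key for abfss:// paths
    environment="dev",
)

# MERGE into meta_state (the table is created on the first write).
backend.set_hwm("orders.watermark", "2024-01-01T00:00:00Z")
print(backend.get_hwm("orders.watermark"))

# Batch form: one MERGE for many keys.
backend.set_hwm_batch([
    {"key": "orders.watermark", "value": "2024-01-02T00:00:00Z"},
    {"key": "customers.watermark", "value": 42},
])
```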
526
+ class SqlServerSystemBackend(StateBackend):
527
+ """
528
+ SQL Server State Backend for centralized system tables.
529
+
530
+ Stores meta_runs and meta_state in SQL Server tables for cross-environment
531
+ visibility and querying. Useful when you want a single source of truth
532
+ for pipeline observability across dev/qat/prod environments.
533
+
534
+ Example config:
535
+ ```yaml
536
+ system:
537
+ connection: sql_server
538
+ schema_name: odibi_system
539
+ environment: prod
540
+ ```
541
+ """
542
+
543
+ # SQL Server table DDL
544
+ META_RUNS_DDL = """
545
+ IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'meta_runs' AND schema_id = SCHEMA_ID(:schema))
546
+ BEGIN
547
+ CREATE TABLE [{schema}].[meta_runs] (
548
+ run_id NVARCHAR(100),
549
+ pipeline_name NVARCHAR(255),
550
+ node_name NVARCHAR(255),
551
+ status NVARCHAR(50),
552
+ rows_processed BIGINT,
553
+ duration_ms BIGINT,
554
+ metrics_json NVARCHAR(MAX),
555
+ environment NVARCHAR(50),
556
+ timestamp DATETIME2,
557
+ date DATE
558
+ )
559
+ END
560
+ """
561
+
562
+ META_STATE_DDL = """
563
+ IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'meta_state' AND schema_id = SCHEMA_ID(:schema))
564
+ BEGIN
565
+ CREATE TABLE [{schema}].[meta_state] (
566
+ [key] NVARCHAR(500) PRIMARY KEY,
567
+ [value] NVARCHAR(MAX),
568
+ environment NVARCHAR(50),
569
+ updated_at DATETIME2
570
+ )
571
+ END
572
+ """
573
+
574
+ def __init__(
575
+ self,
576
+ connection: Any,
577
+ schema_name: str = "odibi_system",
578
+ environment: Optional[str] = None,
579
+ ):
580
+ """
581
+ Initialize SQL Server System Backend.
582
+
583
+ Args:
584
+ connection: AzureSQL connection object
585
+ schema_name: Schema for system tables (default: odibi_system)
586
+ environment: Environment tag for records (e.g., 'dev', 'prod')
587
+ """
588
+ self.connection = connection
589
+ self.schema_name = schema_name
590
+ self.environment = environment
591
+ self._tables_created = False
592
+
593
+ def _ensure_tables(self) -> None:
594
+ """Create system tables if they don't exist."""
595
+ if self._tables_created:
596
+ return
597
+
598
+ try:
599
+ # Create schema if not exists
600
+ schema_ddl = f"""
601
+ IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = '{self.schema_name}')
602
+ BEGIN
603
+ EXEC('CREATE SCHEMA [{self.schema_name}]')
604
+ END
605
+ """
606
+ self.connection.execute(schema_ddl)
607
+
608
+ # Create tables
609
+ runs_ddl = self.META_RUNS_DDL.replace("{schema}", self.schema_name).replace(
610
+ ":schema", f"'{self.schema_name}'"
611
+ )
612
+ self.connection.execute(runs_ddl)
613
+
614
+ state_ddl = self.META_STATE_DDL.replace("{schema}", self.schema_name).replace(
615
+ ":schema", f"'{self.schema_name}'"
616
+ )
617
+ self.connection.execute(state_ddl)
618
+
619
+ self._tables_created = True
620
+ logger.debug(f"SQL Server system tables ensured in schema {self.schema_name}")
621
+ except Exception as e:
622
+ logger.warning(f"Failed to ensure SQL Server system tables: {e}")
623
+
624
+ def load_state(self) -> Dict[str, Any]:
625
+ """Load state - returns empty dict for SQL Server backend."""
626
+ return {"pipelines": {}}
627
+
628
+ def save_pipeline_run(self, pipeline_name: str, pipeline_data: Dict[str, Any]) -> None:
629
+ """Pipeline runs are logged via log_run, not this method."""
630
+ pass
631
+
632
+ def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
633
+ """Get last run info from SQL Server."""
634
+ self._ensure_tables()
635
+ try:
636
+ sql = f"""
637
+ SELECT TOP 1 status, metrics_json
638
+ FROM [{self.schema_name}].[meta_runs]
639
+ WHERE pipeline_name = :pipeline_name AND node_name = :node_name
640
+ ORDER BY timestamp DESC
641
+ """
642
+ result = self.connection.execute(
643
+ sql, {"pipeline_name": pipeline_name, "node_name": node_name}
644
+ )
645
+ if result:
646
+ row = result[0]
647
+ meta = {}
648
+ if row[1]:
649
+ try:
650
+ meta = json.loads(row[1])
651
+ except Exception:
652
+ pass
653
+ return {"success": row[0] == "SUCCESS", "metadata": meta}
654
+ except Exception as e:
655
+ logger.warning(f"Failed to get last run info: {e}")
656
+ return None
657
+
658
+ def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
659
+ """Get last run status."""
660
+ info = self.get_last_run_info(pipeline_name, node_name)
661
+ return info.get("success") if info else None
662
+
663
+ def get_hwm(self, key: str) -> Any:
664
+ """Get HWM value from SQL Server."""
665
+ self._ensure_tables()
666
+ try:
667
+ sql = f"""
668
+ SELECT [value] FROM [{self.schema_name}].[meta_state]
669
+ WHERE [key] = :key
670
+ """
671
+ result = self.connection.execute(sql, {"key": key})
672
+ if result and result[0][0]:
673
+ try:
674
+ return json.loads(result[0][0])
675
+ except Exception:
676
+ return result[0][0]
677
+ except Exception as e:
678
+ logger.warning(f"Failed to get HWM: {e}")
679
+ return None
680
+
681
+ def set_hwm(self, key: str, value: Any) -> None:
682
+ """Set HWM value in SQL Server using MERGE."""
683
+ self._ensure_tables()
684
+ val_str = json.dumps(value, default=str)
685
+ try:
686
+ sql = f"""
687
+ MERGE [{self.schema_name}].[meta_state] AS target
688
+ USING (SELECT :key AS [key]) AS source
689
+ ON target.[key] = source.[key]
690
+ WHEN MATCHED THEN
691
+ UPDATE SET [value] = :value, environment = :env, updated_at = GETUTCDATE()
692
+ WHEN NOT MATCHED THEN
693
+ INSERT ([key], [value], environment, updated_at)
694
+ VALUES (:key, :value, :env, GETUTCDATE());
695
+ """
696
+ self.connection.execute(sql, {"key": key, "value": val_str, "env": self.environment})
697
+ except Exception as e:
698
+ logger.warning(f"Failed to set HWM: {e}")
699
+
700
+ def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
701
+ """Set multiple HWM values."""
702
+ for update in updates:
703
+ self.set_hwm(update["key"], update["value"])
704
+
705
+ def log_run(
706
+ self,
707
+ run_id: str,
708
+ pipeline_name: str,
709
+ node_name: str,
710
+ status: str,
711
+ rows_processed: int = 0,
712
+ duration_ms: int = 0,
713
+ metrics_json: str = "{}",
714
+ ) -> None:
715
+ """Log a run to SQL Server meta_runs table."""
716
+ self._ensure_tables()
717
+ try:
718
+ sql = f"""
719
+ INSERT INTO [{self.schema_name}].[meta_runs]
720
+ (run_id, pipeline_name, node_name, status, rows_processed, duration_ms,
721
+ metrics_json, environment, timestamp, date)
722
+ VALUES (:run_id, :pipeline, :node, :status, :rows, :duration,
723
+ :metrics, :env, GETUTCDATE(), CAST(GETUTCDATE() AS DATE))
724
+ """
725
+ self.connection.execute(
726
+ sql,
727
+ {
728
+ "run_id": run_id,
729
+ "pipeline": pipeline_name,
730
+ "node": node_name,
731
+ "status": status,
732
+ "rows": rows_processed,
733
+ "duration": duration_ms,
734
+ "metrics": metrics_json,
735
+ "env": self.environment,
736
+ },
737
+ )
738
+ except Exception as e:
739
+ logger.warning(f"Failed to log run to SQL Server: {e}")
740
+
741
+ def log_runs_batch(self, records: List[Dict[str, Any]]) -> None:
742
+ """Log multiple runs to SQL Server."""
743
+ for record in records:
744
+ self.log_run(
745
+ run_id=record["run_id"],
746
+ pipeline_name=record["pipeline_name"],
747
+ node_name=record["node_name"],
748
+ status=record["status"],
749
+ rows_processed=record.get("rows_processed", 0),
750
+ duration_ms=record.get("duration_ms", 0),
751
+ metrics_json=record.get("metrics_json", "{}"),
752
+ )
753
+
754
+
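A hedged sketch of wiring the SQL Server backend. `StubConnection` is a stand-in for the AzureSQL connection object (anything exposing `execute(sql, params)`), used here so the snippet runs without a database; against a real connection the same calls create the system tables and persist rows.

```python
from odibi.state import SqlServerSystemBackend

class StubConnection:
    """Stand-in for an AzureSQL connection object."""
    def execute(self, sql, params=None):
        print(sql.strip().splitlines()[0], params or {})
        return []  # SELECTs return no rows in this stub

backend = SqlServerSystemBackend(
    connection=StubConnection(),
    schema_name="odibi_system",
    environment="prod",
)

backend.set_hwm("orders.watermark", "2024-01-01T00:00:00Z")  # MERGE into meta_state
backend.log_run(
    run_id="run-001",
    pipeline_name="daily_sales",
    node_name="load_orders",
    status="SUCCESS",
    rows_processed=1200,
    duration_ms=5300,
)
print(backend.get_hwm("orders.watermark"))  # None with the stub; the stored value on SQL Server
```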
755
+ class StateManager:
756
+ """Manages execution state for checkpointing."""
757
+
758
+ def __init__(self, project_root: str = ".", backend: Optional[StateBackend] = None):
759
+ self.backend = backend
760
+ # A StateBackend must be injected; the legacy LocalFileStateBackend fallback has been removed.
762
+ if not self.backend:
763
+ raise ValueError("StateBackend must be provided to StateManager")
764
+
765
+ self.state: Dict[str, Any] = self.backend.load_state()
766
+
767
+ def save_pipeline_run(self, pipeline_name: str, results: Any):
768
+ """Save pipeline run results."""
769
+ if hasattr(results, "to_dict"):
770
+ data = results.to_dict()
771
+ else:
772
+ data = results
773
+
774
+ node_status = {}
775
+ if hasattr(results, "node_results"):
776
+ for name, res in results.node_results.items():
777
+ node_status[name] = {
778
+ "success": res.success,
779
+ "timestamp": res.metadata.get("timestamp"),
780
+ "metadata": res.metadata,
781
+ }
782
+
783
+ pipeline_data = {
784
+ "last_run": data.get("end_time"),
785
+ "nodes": node_status,
786
+ }
787
+
788
+ self.backend.save_pipeline_run(pipeline_name, pipeline_data)
789
+ self.state = self.backend.load_state()
790
+
791
+ def get_last_run_info(self, pipeline_name: str, node_name: str) -> Optional[Dict[str, Any]]:
792
+ """Get status and metadata of a node from last run."""
793
+ return self.backend.get_last_run_info(pipeline_name, node_name)
794
+
795
+ def get_last_run_status(self, pipeline_name: str, node_name: str) -> Optional[bool]:
796
+ """Get success status of a node from last run."""
797
+ return self.backend.get_last_run_status(pipeline_name, node_name)
798
+
799
+ def get_hwm(self, key: str) -> Any:
800
+ """Get High-Water Mark value for a key."""
801
+ return self.backend.get_hwm(key)
802
+
803
+ def set_hwm(self, key: str, value: Any) -> None:
804
+ """Set High-Water Mark value for a key."""
805
+ self.backend.set_hwm(key, value)
806
+
807
+ def set_hwm_batch(self, updates: List[Dict[str, Any]]) -> None:
808
+ """Set multiple High-Water Mark values in a single operation.
809
+
810
+ Args:
811
+ updates: List of dicts with keys: key, value
812
+ """
813
+ self.backend.set_hwm_batch(updates)
814
+
815
+
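A minimal sketch of using the manager with an injected backend; the path and key are illustrative.

```python
from odibi.state import LocalJSONStateBackend, StateManager

manager = StateManager(backend=LocalJSONStateBackend(".odibi/state.json"))

manager.set_hwm("orders.watermark", "2024-01-01T00:00:00Z")
print(manager.get_hwm("orders.watermark"))
print(manager.get_last_run_status("daily_sales", "load_orders"))  # None until a run is saved
```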
816
+ def create_state_backend(
817
+ config: Any, project_root: str = ".", spark_session: Any = None
818
+ ) -> StateBackend:
819
+ """
820
+ Factory to create state backend from ProjectConfig.
821
+
822
+ Args:
823
+ config: ProjectConfig object
824
+ project_root: Root directory for local files
825
+ spark_session: Optional SparkSession for Delta backend
826
+
827
+ Returns:
828
+ Configured StateBackend
829
+ """
830
+ # Fallback to Local JSON if no System Config
831
+ if not config.system:
832
+ import logging
833
+
834
+ logger = logging.getLogger(__name__)
835
+ logger.warning(
836
+ "No system catalog configured. Using local JSON state backend (local-only mode)."
837
+ )
838
+ state_path = os.path.join(project_root, ".odibi", "state.json")
839
+ return LocalJSONStateBackend(state_path)
840
+
841
+ system_conn_name = config.system.connection
842
+ conn_config = config.connections.get(system_conn_name)
843
+
844
+ if not conn_config:
845
+ raise ValueError(f"System connection '{system_conn_name}' not found.")
846
+
847
+ # Helper to get attribute from dict or object
848
+ def _get(obj, key, default=None):
849
+ if isinstance(obj, dict):
850
+ return obj.get(key, default)
851
+ return getattr(obj, key, default)
852
+
853
+ base_uri = ""
854
+ storage_options = {}
855
+
856
+ conn_type = _get(conn_config, "type")
857
+ environment = getattr(config.system, "environment", None)
858
+
859
+ # SQL Server backend - centralized system tables
860
+ if conn_type in ("sql_server", "azure_sql"):
861
+ from odibi.connections.factory import create_connection
862
+
863
+ # Create the SQL connection
864
+ connection = create_connection(system_conn_name, conn_config)
865
+ schema_name = getattr(config.system, "schema_name", None) or "odibi_system"
866
+
867
+ logger.info(f"Using SQL Server system backend: {system_conn_name}, schema: {schema_name}")
868
+ return SqlServerSystemBackend(
869
+ connection=connection,
870
+ schema_name=schema_name,
871
+ environment=environment,
872
+ )
873
+
874
+ # Determine Base URI based on connection type
875
+ if conn_type == "local":
876
+ base_path = _get(conn_config, "base_path")
877
+ if not os.path.isabs(base_path):
878
+ base_path = os.path.join(project_root, base_path)
879
+
880
+ # Ensure directory exists
881
+ try:
882
+ os.makedirs(base_path, exist_ok=True)
883
+ except Exception:
884
+ pass
885
+
886
+ base_uri = os.path.join(base_path, config.system.path)
887
+
888
+ elif conn_type == "azure_blob":
889
+ # Construct abfss://
890
+ account = _get(conn_config, "account_name")
891
+ container = _get(conn_config, "container")
892
+ base_uri = f"abfss://{container}@{account}.dfs.core.windows.net/{config.system.path}"
893
+
894
+ # Storage options for deltalake depend on the auth mode
896
+ auth = _get(conn_config, "auth", {})
897
+ auth_mode = _get(auth, "mode")
898
+ if auth_mode == "account_key":
899
+ storage_options = {
900
+ "account_name": account,
901
+ "account_key": _get(auth, "account_key"),
902
+ }
903
+ elif auth_mode == "sas":
904
+ storage_options = {
905
+ "account_name": account,
906
+ "sas_token": _get(auth, "sas_token"),
907
+ }
908
+ # MSI/Key Vault auth needs extra configuration for deltalake-python;
+ # Spark handles it when the environment is configured accordingly.
910
+
911
+ else:
912
+ # Other connection types are not explicitly supported for the system catalog;
+ # local and azure_blob are the primary backends, and anything else falls
+ # through to the local default below.
917
+ if conn_type == "delta":
918
+ # A 'delta' connection may point at a catalog/schema rather than a storage
+ # path; the system catalog needs a storage connection, so fall through.
921
+ pass
922
+
923
+ if not base_uri:
924
+ # Default fallback if something went wrong or unsupported
925
+ base_uri = os.path.join(project_root, ".odibi/system")
926
+
927
+ meta_state_path = f"{base_uri}/meta_state"
928
+ meta_runs_path = f"{base_uri}/meta_runs"
929
+
930
+ return CatalogStateBackend(
931
+ meta_runs_path=meta_runs_path,
932
+ meta_state_path=meta_state_path,
933
+ spark_session=spark_session,
934
+ storage_options=storage_options,
935
+ environment=environment,
936
+ )
937
+
938
+
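A sketch of the fallback path when no system catalog is configured; `SimpleNamespace(system=None)` stands in for a ProjectConfig without a `system:` block.

```python
from types import SimpleNamespace
from odibi.state import create_state_backend, LocalJSONStateBackend

# Without a system catalog the factory falls back to local JSON state
# under <project_root>/.odibi/state.json.
config = SimpleNamespace(system=None)  # stand-in for a ProjectConfig with no system catalog
backend = create_state_backend(config, project_root=".")
assert isinstance(backend, LocalJSONStateBackend)
```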
939
+ def create_sync_source_backend(
940
+ sync_from_config: Any,
941
+ connections: Dict[str, Any],
942
+ project_root: str = ".",
943
+ ) -> StateBackend:
944
+ """
945
+ Create a source StateBackend for sync operations.
946
+
947
+ Args:
948
+ sync_from_config: SyncFromConfig with connection/path/schema_name
949
+ connections: Dictionary of connection configs
950
+ project_root: Root directory for local paths
951
+
952
+ Returns:
953
+ Configured StateBackend for reading source data
954
+ """
955
+
956
+ def _get(obj, key, default=None):
957
+ if isinstance(obj, dict):
958
+ return obj.get(key, default)
959
+ return getattr(obj, key, default)
960
+
961
+ conn_name = _get(sync_from_config, "connection")
962
+ conn_config = connections.get(conn_name)
963
+
964
+ if not conn_config:
965
+ raise ValueError(f"Sync source connection '{conn_name}' not found in connections.")
966
+
967
+ conn_type = _get(conn_config, "type")
968
+
969
+ # SQL Server source
970
+ if conn_type in ("sql_server", "azure_sql"):
971
+ from odibi.connections.factory import create_connection
972
+
973
+ connection = create_connection(conn_name, conn_config)
974
+ schema_name = _get(sync_from_config, "schema_name") or "odibi_system"
975
+ return SqlServerSystemBackend(
976
+ connection=connection,
977
+ schema_name=schema_name,
978
+ environment=None,
979
+ )
980
+
981
+ # File-based source (local, azure_blob)
982
+ base_uri = ""
983
+ storage_options = {}
984
+ path = _get(sync_from_config, "path") or "_odibi_system"
985
+
986
+ if conn_type == "local":
987
+ base_path = _get(conn_config, "base_path")
988
+ if not os.path.isabs(base_path):
989
+ base_path = os.path.join(project_root, base_path)
990
+ base_uri = os.path.join(base_path, path)
991
+
992
+ elif conn_type == "azure_blob":
993
+ account = _get(conn_config, "account_name")
994
+ container = _get(conn_config, "container")
995
+ base_uri = f"abfss://{container}@{account}.dfs.core.windows.net/{path}"
996
+
997
+ auth = _get(conn_config, "auth", {})
998
+ auth_mode = _get(auth, "mode")
999
+ if auth_mode == "account_key":
1000
+ storage_options = {
1001
+ "account_name": account,
1002
+ "account_key": _get(auth, "account_key"),
1003
+ }
1004
+ elif auth_mode == "sas":
1005
+ storage_options = {
1006
+ "account_name": account,
1007
+ "sas_token": _get(auth, "sas_token"),
1008
+ }
1009
+
1010
+ if not base_uri:
1011
+ base_uri = os.path.join(project_root, path)
1012
+
1013
+ meta_state_path = f"{base_uri}/meta_state"
1014
+ meta_runs_path = f"{base_uri}/meta_runs"
1015
+
1016
+ return CatalogStateBackend(
1017
+ meta_runs_path=meta_runs_path,
1018
+ meta_state_path=meta_state_path,
1019
+ spark_session=None,
1020
+ storage_options=storage_options,
1021
+ environment=None,
1022
+ )
1023
+
1024
+
1025
+ def sync_system_data(
1026
+ source_backend: StateBackend,
1027
+ target_backend: StateBackend,
1028
+ tables: Optional[List[str]] = None,
1029
+ ) -> Dict[str, int]:
1030
+ """
1031
+ Sync system data from source backend to target backend.
1032
+
1033
+ Reads meta_runs and meta_state from source and writes to target.
1034
+
1035
+ Args:
1036
+ source_backend: Source StateBackend to read from
1037
+ target_backend: Target StateBackend to write to
1038
+ tables: Optional list of tables to sync ('runs', 'state'). Default: both.
1039
+
1040
+ Returns:
1041
+ Dict with counts: {'runs': N, 'state': M}
1042
+ """
1043
+ if tables is None:
1044
+ tables = ["runs", "state"]
1045
+
1046
+ result = {"runs": 0, "state": 0}
1047
+
1048
+ # Sync runs (meta_runs)
1049
+ if "runs" in tables:
1050
+ runs_count = _sync_runs(source_backend, target_backend)
1051
+ result["runs"] = runs_count
1052
+ logger.info(f"Synced {runs_count} run records")
1053
+
1054
+ # Sync state (meta_state / HWM)
1055
+ if "state" in tables:
1056
+ state_count = _sync_state(source_backend, target_backend)
1057
+ result["state"] = state_count
1058
+ logger.info(f"Synced {state_count} state records")
1059
+
1060
+ return result
1061
+
1062
+
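A hedged sketch of the sync flow. In practice the source backend would usually come from create_sync_source_backend(...); here both backends are constructed directly, and `sql_conn` is assumed to be an AzureSQL connection created elsewhere (e.g. via odibi.connections.factory).

```python
from odibi.state import CatalogStateBackend, SqlServerSystemBackend, sync_system_data

# Source: a local/ADLS Delta system catalog (read via deltalake + pandas).
source = CatalogStateBackend(
    meta_runs_path="./_odibi_system/meta_runs",
    meta_state_path="./_odibi_system/meta_state",
)

# Target: centralized SQL Server system tables; sql_conn is an assumed connection object.
target = SqlServerSystemBackend(connection=sql_conn, schema_name="odibi_system", environment="prod")

counts = sync_system_data(source, target)              # syncs both meta_runs and meta_state
print(counts)                                           # e.g. {'runs': 120, 'state': 8}
state_only = sync_system_data(source, target, tables=["state"])  # HWM records only
```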
1063
+ def _sync_runs(source: StateBackend, target: StateBackend) -> int:
1064
+ """Sync runs from source to target."""
1065
+ records = []
1066
+
1067
+ # Read runs from source
1068
+ if isinstance(source, CatalogStateBackend):
1069
+ if not DeltaTable or not pd:
1070
+ logger.warning("Delta/Pandas not available for reading source runs")
1071
+ return 0
1072
+
1073
+ try:
1074
+ dt = DeltaTable(source.meta_runs_path, storage_options=source.storage_options)
1075
+ df = dt.to_pandas()
1076
+ if df.empty:
1077
+ return 0
1078
+
1079
+ for _, row in df.iterrows():
1080
+ records.append(
1081
+ {
1082
+ "run_id": row.get("run_id"),
1083
+ "pipeline_name": row.get("pipeline_name"),
1084
+ "node_name": row.get("node_name"),
1085
+ "status": row.get("status"),
1086
+ "rows_processed": int(row.get("rows_processed", 0) or 0),
1087
+ "duration_ms": int(row.get("duration_ms", 0) or 0),
1088
+ "metrics_json": row.get("metrics_json") or row.get("metadata") or "{}",
1089
+ }
1090
+ )
1091
+ except Exception as e:
1092
+ logger.warning(f"Failed to read runs from source: {e}")
1093
+ return 0
1094
+
1095
+ elif isinstance(source, SqlServerSystemBackend):
1096
+ source._ensure_tables()
1097
+ try:
1098
+ sql = f"""SELECT run_id, pipeline_name, node_name, status, rows_processed,
1099
+ duration_ms, metrics_json FROM [{source.schema_name}].[meta_runs]"""
1100
+ rows = source.connection.execute(sql)
1101
+ if rows:
1102
+ for row in rows:
1103
+ records.append(
1104
+ {
1105
+ "run_id": row[0],
1106
+ "pipeline_name": row[1],
1107
+ "node_name": row[2],
1108
+ "status": row[3],
1109
+ "rows_processed": int(row[4] or 0),
1110
+ "duration_ms": int(row[5] or 0),
1111
+ "metrics_json": row[6] or "{}",
1112
+ }
1113
+ )
1114
+ except Exception as e:
1115
+ logger.warning(f"Failed to read runs from SQL source: {e}")
1116
+ return 0
1117
+
1118
+ if not records:
1119
+ return 0
1120
+
1121
+ # Write runs to target
1122
+ if isinstance(target, SqlServerSystemBackend):
1123
+ target.log_runs_batch(records)
1124
+ elif isinstance(target, CatalogStateBackend):
1125
+ _write_runs_to_catalog(target, records)
1126
+
1127
+ return len(records)
1128
+
1129
+
1130
+ def _write_runs_to_catalog(target: CatalogStateBackend, records: List[Dict]) -> None:
1131
+ """Write run records to CatalogStateBackend."""
1132
+ if not pd or not write_deltalake:
1133
+ logger.warning("Delta/Pandas not available for writing runs")
1134
+ return
1135
+
1136
+ df = pd.DataFrame(records)
1137
+ df["timestamp"] = datetime.now(timezone.utc)
1138
+ df["date"] = datetime.now(timezone.utc).date()
1139
+ df["environment"] = target.environment
1140
+
1141
+ def _write():
1142
+ write_deltalake(
1143
+ target.meta_runs_path,
1144
+ df,
1145
+ mode="append",
1146
+ storage_options=target.storage_options,
1147
+ )
1148
+
1149
+ _retry_delta_operation(_write)
1150
+
1151
+
1152
+ def _sync_state(source: StateBackend, target: StateBackend) -> int:
1153
+ """Sync HWM state from source to target."""
1154
+ hwm_records = []
1155
+
1156
+ # Read state from source
1157
+ if isinstance(source, CatalogStateBackend):
1158
+ if not DeltaTable or not pd:
1159
+ logger.warning("Delta/Pandas not available for reading source state")
1160
+ return 0
1161
+
1162
+ try:
1163
+ dt = DeltaTable(source.meta_state_path, storage_options=source.storage_options)
1164
+ df = dt.to_pandas()
1165
+ if df.empty:
1166
+ return 0
1167
+
1168
+ for _, row in df.iterrows():
1169
+ key = row.get("key")
1170
+ value = row.get("value")
1171
+ if key:
1172
+ try:
1173
+ hwm_records.append({"key": key, "value": json.loads(value)})
1174
+ except (json.JSONDecodeError, TypeError):
1175
+ hwm_records.append({"key": key, "value": value})
1176
+ except Exception as e:
1177
+ logger.warning(f"Failed to read state from source: {e}")
1178
+ return 0
1179
+
1180
+ elif isinstance(source, SqlServerSystemBackend):
1181
+ source._ensure_tables()
1182
+ try:
1183
+ sql = f"SELECT [key], [value] FROM [{source.schema_name}].[meta_state]"
1184
+ rows = source.connection.execute(sql)
1185
+ if rows:
1186
+ for row in rows:
1187
+ key, value = row[0], row[1]
1188
+ if key:
1189
+ try:
1190
+ hwm_records.append({"key": key, "value": json.loads(value)})
1191
+ except (json.JSONDecodeError, TypeError):
1192
+ hwm_records.append({"key": key, "value": value})
1193
+ except Exception as e:
1194
+ logger.warning(f"Failed to read state from SQL source: {e}")
1195
+ return 0
1196
+
1197
+ if not hwm_records:
1198
+ return 0
1199
+
1200
+ # Write state to target
1201
+ target.set_hwm_batch(hwm_records)
1202
+
1203
+ return len(hwm_records)