odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/context.py
ADDED
@@ -0,0 +1,528 @@
import re
import threading
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import Any, Dict, Optional, Union

import pandas as pd

try:
    import polars as pl
except ImportError:
    pl = None

from odibi.enums import EngineType

# Thread-local storage for unique temp view names
_thread_local = threading.local()


def _get_unique_view_name() -> str:
    """Generate a unique temp view name for thread-safe parallel execution."""
    if not hasattr(_thread_local, "view_counter"):
        _thread_local.view_counter = 0
    _thread_local.view_counter += 1
    thread_id = threading.current_thread().ident or 0
    return f"_df_{thread_id}_{_thread_local.view_counter}"


class EngineContext:
    """
    The context passed to transformations.
    Wraps the global context (other datasets) and the local state (current dataframe).
    Provides uniform API for SQL and Data operations.
    """

    def __init__(
        self,
        context: "Context",
        df: Any,
        engine_type: EngineType,
        sql_executor: Optional[Any] = None,
        engine: Optional[Any] = None,
        pii_metadata: Optional[Dict[str, bool]] = None,
    ):
        self.context = context
        self.df = df
        self.engine_type = engine_type
        self.sql_executor = sql_executor
        self.engine = engine
        self.pii_metadata = pii_metadata or {}
        self._sql_history: list[str] = []

    @property
    def columns(self) -> list[str]:
        if hasattr(self.df, "columns"):
            return list(self.df.columns)
        # Spark
        if hasattr(self.df, "schema"):
            return self.df.columns
        return []

    @property
    def schema(self) -> Dict[str, str]:
        """Get schema types."""
        if self.engine:
            return self.engine.get_schema(self.df)
        return {}

    @property
    def spark(self) -> Any:
        """Helper to access SparkSession if available in context."""
        if hasattr(self.context, "spark"):
            return self.context.spark
        return None

    def with_df(self, df: Any) -> "EngineContext":
        """Returns a new context with updated DataFrame."""
        new_ctx = EngineContext(
            self.context, df, self.engine_type, self.sql_executor, self.engine, self.pii_metadata
        )
        # Preserve history? No, we want history per-transformation scope usually.
        # But wait, if we chain, we might want to pass it?
        # For now, let's keep history tied to the specific context instance used in a transform.
        # The Node will check the context instance it passed to the function.
        # However, if the function returns a new context (via with_df), we lose the reference.
        # Actually, the user functions usually return a DataFrame, not a Context.
        # Context is just the helper.
        # So we can accumulate in the *original* context passed to the function?
        # No, context is immutable-ish.
        # Let's make sql_history shared if we branch?
        # Actually, simple approach: The user calls context.sql(). That context instance records it.
        # If they chain .sql().sql(), we need the new context to share the history list.
        new_ctx._sql_history = self._sql_history
        return new_ctx

    def get(self, name: str) -> Any:
        """Get a dataset from global context."""
        return self.context.get(name)

    def register_temp_view(self, name: str, df: Any) -> None:
        """Register a temporary view for SQL."""
        self.context.register(name, df)

    def sql(self, query: str) -> "EngineContext":
        """Execute SQL on the current DataFrame (aliased as 'df')."""
        self._sql_history.append(query)

        if self.sql_executor:
            # Use unique temp view name for thread-safe parallel execution
            view_name = _get_unique_view_name()
            self.context.register(view_name, self.df)
            try:
                # Replace 'df' references with our unique view name in the query
                # Use word boundary matching to avoid replacing 'df' inside column names
                safe_query = re.sub(r"\bdf\b", view_name, query)
                res = self.sql_executor(safe_query, self.context)
                return self.with_df(res)
            finally:
                # Cleanup temp view to avoid memory leaks
                self.context.unregister(view_name)

        raise NotImplementedError("EngineContext.sql requires sql_executor to be set.")


class Context(ABC):
    """Abstract base for execution context."""

    @abstractmethod
    def register(self, name: str, df: Any, metadata: Optional[Dict[str, Any]] = None) -> None:
        """Register a DataFrame for use in downstream nodes.

        Args:
            name: Identifier for the DataFrame
            df: DataFrame (Spark or Pandas) or Iterator (Pandas chunked)
            metadata: Optional metadata (e.g. PII info)
        """
        pass

    @abstractmethod
    def get(self, name: str) -> Any:
        """Retrieve a registered DataFrame.

        Args:
            name: Identifier of the DataFrame

        Returns:
            The registered DataFrame

        Raises:
            KeyError: If name not found in context
        """
        pass

    @abstractmethod
    def get_metadata(self, name: str) -> Dict[str, Any]:
        """Retrieve metadata for a registered DataFrame.

        Args:
            name: Identifier of the DataFrame

        Returns:
            Metadata dictionary (empty if none)
        """
        pass

    @abstractmethod
    def has(self, name: str) -> bool:
        """Check if a DataFrame exists in context.

        Args:
            name: Identifier to check

        Returns:
            True if exists, False otherwise
        """
        pass

    @abstractmethod
    def list_names(self) -> list[str]:
        """List all registered DataFrame names.

        Returns:
            List of registered names
        """
        pass

    @abstractmethod
    def clear(self) -> None:
        """Clear all registered DataFrames."""
        pass

    def unregister(self, name: str) -> None:
        """Unregister a DataFrame from the context.

        Default implementation does nothing (optional cleanup).
        Subclasses can override for cleanup (e.g., dropping temp views).

        Args:
            name: Identifier to unregister
        """
        pass


class PandasContext(Context):
    """Context implementation for Pandas engine."""

    def __init__(self) -> None:
        """Initialize Pandas context."""
        self._data: Dict[str, Union[pd.DataFrame, Iterator[pd.DataFrame]]] = {}
        self._metadata: Dict[str, Dict[str, Any]] = {}

    def register(
        self,
        name: str,
        df: Union[pd.DataFrame, Iterator[pd.DataFrame], Any],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Register a Pandas DataFrame, Iterator, or LazyDataset.

        Args:
            name: Identifier for the DataFrame
            df: Pandas DataFrame or Iterator of DataFrames or LazyDataset
            metadata: Optional metadata
        """
        # Relaxed type check to support LazyDataset
        is_valid = (
            isinstance(df, pd.DataFrame)
            or isinstance(df, Iterator)
            or type(df).__name__ == "LazyDataset"
        )

        if not is_valid:
            raise TypeError(
                f"Expected pandas.DataFrame, Iterator, or LazyDataset, got {type(df).__module__}.{type(df).__name__}"
            )

        self._data[name] = df
        if metadata:
            self._metadata[name] = metadata

    def get(self, name: str) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
        """Retrieve a registered Pandas DataFrame or Iterator.

        Args:
            name: Identifier of the DataFrame

        Returns:
            The registered Pandas DataFrame or Iterator

        Raises:
            KeyError: If name not found in context
        """
        if name not in self._data:
            available = ", ".join(self._data.keys()) if self._data else "none"
            raise KeyError(f"DataFrame '{name}' not found in context. Available: {available}")
        return self._data[name]

    def get_metadata(self, name: str) -> Dict[str, Any]:
        """Retrieve metadata."""
        return self._metadata.get(name, {})

    def has(self, name: str) -> bool:
        """Check if a DataFrame exists.

        Args:
            name: Identifier to check

        Returns:
            True if exists, False otherwise
        """
        return name in self._data

    def list_names(self) -> list[str]:
        """List all registered DataFrame names.

        Returns:
            List of registered names
        """
        return list(self._data.keys())

    def clear(self) -> None:
        """Clear all registered DataFrames."""
        self._data.clear()

    def unregister(self, name: str) -> None:
        """Unregister a DataFrame from the context."""
        self._data.pop(name, None)
        self._metadata.pop(name, None)


class PolarsContext(Context):
    """Context implementation for Polars engine."""

    def __init__(self) -> None:
        """Initialize Polars context."""
        self._data: Dict[str, Any] = {}
        self._metadata: Dict[str, Dict[str, Any]] = {}

    def register(self, name: str, df: Any, metadata: Optional[Dict[str, Any]] = None) -> None:
        """Register a Polars DataFrame or LazyFrame.

        Args:
            name: Identifier for the DataFrame
            df: Polars DataFrame or LazyFrame
            metadata: Optional metadata
        """
        self._data[name] = df
        if metadata:
            self._metadata[name] = metadata

    def get(self, name: str) -> Any:
        """Retrieve a registered Polars DataFrame.

        Args:
            name: Identifier of the DataFrame

        Returns:
            The registered DataFrame

        Raises:
            KeyError: If name not found in context
        """
        if name not in self._data:
            available = ", ".join(self._data.keys()) if self._data else "none"
            raise KeyError(f"DataFrame '{name}' not found in context. Available: {available}")
        return self._data[name]

    def get_metadata(self, name: str) -> Dict[str, Any]:
        return self._metadata.get(name, {})

    def has(self, name: str) -> bool:
        """Check if a DataFrame exists.

        Args:
            name: Identifier to check

        Returns:
            True if exists, False otherwise
        """
        return name in self._data

    def list_names(self) -> list[str]:
        """List all registered DataFrame names.

        Returns:
            List of registered names
        """
        return list(self._data.keys())

    def clear(self) -> None:
        """Clear all registered DataFrames."""
        self._data.clear()

    def unregister(self, name: str) -> None:
        """Unregister a DataFrame from the context."""
        self._data.pop(name, None)
        self._metadata.pop(name, None)


class SparkContext(Context):
    """Context implementation for Spark engine."""

    def __init__(self, spark_session: Any) -> None:
        """Initialize Spark context.

        Args:
            spark_session: Active SparkSession
        """
        try:
            from pyspark.sql import DataFrame as SparkDataFrame
        except ImportError:
            # Fallback for when pyspark is not installed (e.g. testing without spark)
            SparkDataFrame = Any

        self.spark = spark_session
        self._spark_df_type = SparkDataFrame

        # Track registered views for cleanup
        self._registered_views: set[str] = set()

        # Lock for thread safety
        self._lock = threading.RLock()

        # Metadata store
        self._metadata: Dict[str, Dict[str, Any]] = {}

    def _validate_name(self, name: str) -> None:
        """Validate that node name is a valid Spark identifier.

        Spark SQL views should be alphanumeric + underscore.
        Spaces and special characters (hyphens) cause issues in SQL generation.

        Args:
            name: Node name to validate

        Raises:
            ValueError: If name is invalid
        """
        # Regex: alphanumeric and underscore only
        if not re.match(r"^[a-zA-Z0-9_]+$", name):
            raise ValueError(
                f"Invalid node name '{name}' for Spark engine. "
                "Names must contain only alphanumeric characters and underscores "
                "(no spaces or hyphens). Please rename this node in your configuration."
            )

    def register(self, name: str, df: Any, metadata: Optional[Dict[str, Any]] = None) -> None:
        """Register a Spark DataFrame as temp view.

        Args:
            name: Identifier for the DataFrame
            df: Spark DataFrame
            metadata: Optional metadata
        """
        # 1. Validate Type
        if self._spark_df_type is not Any and not isinstance(df, self._spark_df_type):
            if not hasattr(df, "createOrReplaceTempView"):
                raise TypeError(
                    f"Expected pyspark.sql.DataFrame, got {type(df).__module__}.{type(df).__name__}"
                )

        # 2. Validate Name (Explicit rule)
        self._validate_name(name)

        # 3. Register
        with self._lock:
            self._registered_views.add(name)
            if metadata:
                self._metadata[name] = metadata

        # Create view (metadata op)
        df.createOrReplaceTempView(name)

    def get(self, name: str) -> Any:
        """Retrieve a registered Spark DataFrame.

        Args:
            name: Identifier of the DataFrame

        Returns:
            The registered Spark DataFrame

        Raises:
            KeyError: If name not found in context
        """
        with self._lock:
            if name not in self._registered_views:
                available = ", ".join(self._registered_views) if self._registered_views else "none"
                raise KeyError(f"DataFrame '{name}' not found in context. Available: {available}")

        return self.spark.table(name)

    def get_metadata(self, name: str) -> Dict[str, Any]:
        with self._lock:
            return self._metadata.get(name, {})

    def has(self, name: str) -> bool:
        """Check if a DataFrame exists.

        Args:
            name: Identifier to check

        Returns:
            True if exists, False otherwise
        """
        with self._lock:
            return name in self._registered_views

    def list_names(self) -> list[str]:
        """List all registered DataFrame names.

        Returns:
            List of registered names
        """
        with self._lock:
            return list(self._registered_views)

    def clear(self) -> None:
        """Clear all registered temp views."""
        with self._lock:
            views_to_drop = list(self._registered_views)
            self._registered_views.clear()

        for name in views_to_drop:
            try:
                self.spark.catalog.dropTempView(name)
            except Exception:
                pass

    def unregister(self, name: str) -> None:
        """Unregister a temp view from Spark.

        Args:
            name: View name to drop
        """
        with self._lock:
            self._registered_views.discard(name)
            self._metadata.pop(name, None)

        try:
            self.spark.catalog.dropTempView(name)
        except Exception:
            pass


def create_context(engine: str, spark_session: Optional[Any] = None) -> Context:
    """Factory function to create appropriate context.

    Args:
        engine: Engine type ('pandas' or 'spark')
        spark_session: SparkSession (required if engine='spark')

    Returns:
        Context instance for the specified engine

    Raises:
        ValueError: If engine is invalid or SparkSession missing for Spark
    """
    if engine == "pandas":
        return PandasContext()
    elif engine == "spark":
        if spark_session is None:
            raise ValueError("SparkSession required for Spark engine")
        return SparkContext(spark_session)
    elif engine == "polars":
        return PolarsContext()
    else:
        raise ValueError(f"Unsupported engine: {engine}. Use 'pandas' or 'spark'")
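For orientation, here is a brief usage sketch (editor-added; not part of the wheel contents) of the context API defined above. It assumes odibi 2.5.0 is installed, uses only calls present in this file, and the "orders" DataFrame is a made-up example.

# Editor-added sketch: exercising create_context and the PandasContext registry shown above.
import pandas as pd

from odibi.context import create_context

ctx = create_context("pandas")  # returns a PandasContext
orders = pd.DataFrame({"order_id": [1, 2], "qty": [3, 5]})

ctx.register("orders", orders, metadata={"pii": False})
assert ctx.has("orders")
print(ctx.list_names())            # ['orders']
print(ctx.get_metadata("orders"))  # {'pii': False}

df = ctx.get("orders")             # the registered DataFrame (raises KeyError if missing)
ctx.unregister("orders")           # remove a single entry
ctx.clear()                        # remove everything that remains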
odibi/diagnostics/__init__.py
ADDED
@@ -0,0 +1,12 @@
"""
ODIBI Diagnostics Module
========================

Tools for troubleshooting, lineage, and drift detection.
"""

from .delta import detect_drift, get_delta_diff
from .diff import diff_nodes, diff_runs
from .manager import HistoryManager

__all__ = ["get_delta_diff", "detect_drift", "diff_nodes", "diff_runs", "HistoryManager"]
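This facade only re-exports names; the underlying signatures live in delta.py, diff.py, and manager.py (listed in the file inventory but not shown in this excerpt). A minimal editor-added import sketch:

# Editor-added sketch: these names are re-exported by odibi.diagnostics per the __all__ above.
from odibi.diagnostics import HistoryManager, detect_drift, diff_nodes, diff_runs, get_delta_diff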