odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,302 @@
1
+ """Setup helpers for ODIBI - Phase 2C performance utilities."""
2
+
3
+ import concurrent.futures
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+
9
@dataclass
class KeyVaultFetchResult:
    """Result of a Key Vault secret fetch operation."""

    # Name of the connection this fetch was performed for (used in reporting).
    connection_name: str
    # Key Vault name the secret was fetched from (or account label for skips).
    account: str
    # True when the secret was retrieved, or when no fetch was required.
    success: bool
    # Plaintext secret value on success; None otherwise.
    secret_value: Optional[str] = None
    # Exception captured on failure; None on success.
    error: Optional[Exception] = None
    # Wall-clock duration of the fetch attempt, in milliseconds.
    duration_ms: Optional[float] = None
19
+
20
+
21
def fetch_keyvault_secret(
    connection_name: str,
    key_vault_name: str,
    secret_name: str,
    timeout: float = 30.0,
) -> KeyVaultFetchResult:
    """Fetch a single Key Vault secret with timeout protection.

    Args:
        connection_name: Name of the connection (for error reporting)
        key_vault_name: Azure Key Vault name
        secret_name: Secret name in Key Vault
        timeout: Timeout in seconds (default: 30.0)

    Returns:
        KeyVaultFetchResult with success status and secret value or error.
        Never raises: every failure (missing SDK, auth error, timeout) is
        captured in the result's ``error`` field.
    """
    import time

    # monotonic() is immune to wall-clock adjustments, unlike time.time().
    start_time = time.monotonic()

    def _elapsed_ms() -> float:
        # Milliseconds elapsed since this call started.
        return (time.monotonic() - start_time) * 1000

    try:
        from azure.identity import DefaultAzureCredential
        from azure.keyvault.secrets import SecretClient
    except ImportError:
        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=False,
            error=ImportError(
                "Key Vault authentication requires 'azure-identity' and 'azure-keyvault-secrets'. "
                "Install with: pip install odibi[azure]"
            ),
            duration_ms=_elapsed_ms(),
        )

    try:
        credential = DefaultAzureCredential()
        kv_uri = f"https://{key_vault_name}.vault.azure.net"
        client = SecretClient(vault_url=kv_uri, credential=credential)

        # FIX: `timeout` was previously accepted but never enforced, so a
        # hung vault call could block indefinitely. Run the blocking fetch
        # on a single-use worker thread and bound the wait; a TimeoutError
        # is captured by the except clause below like any other failure.
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            secret = pool.submit(client.get_secret, secret_name).result(timeout=timeout)
        finally:
            # Don't block on a stuck worker; let it finish in the background.
            pool.shutdown(wait=False)

        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=True,
            secret_value=secret.value,
            duration_ms=_elapsed_ms(),
        )

    except Exception as e:
        # Auth, network, and timeout errors are reported, never raised.
        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=False,
            error=e,
            duration_ms=_elapsed_ms(),
        )
83
+
84
+
85
def fetch_keyvault_secrets_parallel(
    connections: Dict[str, Any],
    max_workers: int = 5,
    timeout: float = 30.0,
    verbose: bool = True,
) -> Dict[str, KeyVaultFetchResult]:
    """Fetch Key Vault secrets concurrently for every KV-backed connection.

    Connections exposing both a truthy ``key_vault_name`` and a truthy
    ``secret_name`` are fetched on a thread pool (regardless of auth mode);
    every other connection receives an immediate successful placeholder
    result with no secret value.

    Args:
        connections: Mapping of connection name -> connection instance.
        max_workers: Upper bound on concurrent fetches (default: 5).
        timeout: Per-secret timeout in seconds, forwarded to the fetcher.
        verbose: Emit progress output to stdout.

    Returns:
        Mapping of connection name -> KeyVaultFetchResult.
    """
    import time

    results: Dict[str, KeyVaultFetchResult] = {}
    kv_targets: List[Tuple[str, Any]] = []

    for conn_name, conn in connections.items():
        # A connection qualifies for a Key Vault fetch when it names both a
        # vault and a secret, whatever its auth mode happens to be.
        if getattr(conn, "key_vault_name", None) and getattr(conn, "secret_name", None):
            kv_targets.append((conn_name, conn))
        else:
            results[conn_name] = KeyVaultFetchResult(
                connection_name=conn_name,
                account=getattr(conn, "account", "unknown"),
                success=True,
                secret_value=None,
                duration_ms=0.0,
            )

    if not kv_targets:
        if verbose:
            print("- No Key Vault connections to fetch")
        return results

    if verbose:
        print(f"⚡ Fetching {len(kv_targets)} Key Vault secrets in parallel...")

    started = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(
                fetch_keyvault_secret,
                conn_name,
                conn.key_vault_name,
                conn.secret_name,
                timeout,
            ): (conn_name, conn)
            for conn_name, conn in kv_targets
        }

        for done in concurrent.futures.as_completed(pending):
            conn_name, _ = pending[done]
            outcome = done.result()
            results[conn_name] = outcome

            if verbose:
                if outcome.success:
                    print(f" - {conn_name}: {outcome.duration_ms:.0f}ms")
                else:
                    print(f" [X] {conn_name}: {type(outcome.error).__name__}")

    total_duration = (time.time() - started) * 1000

    if verbose:
        success_count = sum(1 for r in results.values() if r.success)
        print(
            f"- Completed in {total_duration:.0f}ms ({success_count}/{len(kv_targets)} successful)"
        )

    return results
184
+
185
+
186
def configure_connections_parallel(
    connections: Dict[str, Any],
    prefetch_secrets: bool = True,
    max_workers: int = 5,
    timeout: float = 30.0,
    verbose: bool = True,
) -> Tuple[Dict[str, Any], List[str]]:
    """Prefetch Key Vault secrets for a set of connections.

    Args:
        connections: Mapping of connection name -> connection instance.
        prefetch_secrets: When False, return immediately without fetching.
        max_workers: Upper bound on concurrent fetches.
        timeout: Per-fetch timeout in seconds.
        verbose: Emit progress output and warnings.

    Returns:
        Tuple of (connections, errors) where ``connections`` is the same
        mapping with fetched secrets cached on connections that expose a
        ``_cached_key`` attribute, and ``errors`` lists failure messages.
    """
    errors: List[str] = []

    if not prefetch_secrets:
        return connections, errors

    fetch_results = fetch_keyvault_secrets_parallel(
        connections, max_workers=max_workers, timeout=timeout, verbose=verbose
    )

    for conn_name, outcome in fetch_results.items():
        if outcome.success:
            # Cache the secret only on connections that support it.
            if outcome.secret_value:
                target = connections[conn_name]
                if hasattr(target, "_cached_key"):
                    target._cached_key = outcome.secret_value
            continue

        message = f"Failed to fetch secret for '{conn_name}': {outcome.error}"
        errors.append(message)
        if verbose:
            warnings.warn(message, UserWarning)

    return connections, errors
233
+
234
+
235
def validate_databricks_environment(verbose: bool = True) -> Dict[str, Any]:
    """Probe the current process for Databricks runtime facilities.

    Args:
        verbose: When True, print a human-readable summary of the checks.

    Returns:
        Dict with keys:
            - is_databricks (bool)
            - spark_available (bool)
            - dbutils_available (bool)
            - runtime_version (Optional[str])
            - errors (List[str])
    """
    info: Dict[str, Any] = {
        "is_databricks": False,
        "spark_available": False,
        "dbutils_available": False,
        "runtime_version": None,
        "errors": [],
    }

    # Databricks sets DATABRICKS_RUNTIME_VERSION inside its runtime.
    try:
        import os

        version = os.getenv("DATABRICKS_RUNTIME_VERSION")
        if version:
            info["is_databricks"] = True
            info["runtime_version"] = version
    except Exception as e:
        info["errors"].append(f"Environment check failed: {e}")

    # An active SparkSession means Spark is usable from this process.
    try:
        from pyspark.sql import SparkSession

        if SparkSession.getActiveSession():
            info["spark_available"] = True
    except Exception as e:
        info["errors"].append(f"Spark check failed: {e}")

    # dbutils lives in the driver notebook's IPython user namespace.
    try:
        import IPython

        shell = IPython.get_ipython()
        if shell and hasattr(shell, "user_ns") and "dbutils" in shell.user_ns:
            info["dbutils_available"] = True
    except Exception as e:
        info["errors"].append(f"dbutils check failed: {e}")

    if verbose:
        print(f" Databricks Runtime: {'[X]' if info['is_databricks'] else '[ ]'}")
        if info["runtime_version"]:
            print(f" Runtime Version: {info['runtime_version']}")
        print(f" Spark Available: {'[X]' if info['spark_available'] else '[ ]'}")
        print(f" dbutils Available: {'[X]' if info['dbutils_available'] else '[ ]'}")

        if info["errors"]:
            print("\n Errors:")
            for err in info["errors"]:
                print(f" - {err}")

    return info
@@ -0,0 +1,140 @@
1
+ """Telemetry utilities for OpenTelemetry integration."""
2
+
3
+ import os
4
+ import sys
5
+
6
+ try:
7
+ from opentelemetry import metrics, trace
8
+ from opentelemetry.trace import Status, StatusCode
9
+
10
+ AVAILABLE = True
11
+ except ImportError:
12
+ AVAILABLE = False
13
+
14
# --- Fallbacks for when OpenTelemetry is missing ---
# FIX: these stand-ins were previously defined unconditionally, silently
# shadowing the real Status/StatusCode imported above even when OpenTelemetry
# *was* installed — so callers building Status objects from this module got
# no-op mocks that real spans cannot use. Re-import here so the real classes
# win when available and the stand-ins only fill in on ImportError.

try:
    from opentelemetry.trace import Status, StatusCode
except ImportError:

    class StatusCode:
        # Minimal mirror of the opentelemetry.trace.StatusCode members used.
        OK = 1
        ERROR = 2

    class Status:
        # Accepts and discards the same arguments as the real Status.
        def __init__(self, status_code, description=""):
            pass
25
+
26
+
27
class MockSpan:
    """No-op span used when OpenTelemetry is not installed.

    Supports the context-manager protocol and silently accepts the span
    methods the rest of the codebase calls.
    """

    def __enter__(self):
        return self

    def __exit__(self, *args):
        return None

    def set_attribute(self, key, value):
        """Discard the attribute."""

    def set_status(self, status):
        """Discard the status."""

    def record_exception(self, exception):
        """Discard the exception."""

    def add_event(self, name, attributes=None):
        """Discard the event."""
45
+
46
+
47
class MockTracer:
    """No-op tracer returned by get_tracer() when OpenTelemetry is absent."""

    def start_as_current_span(self, name, kind=None, attributes=None):
        # Mirrors trace.Tracer.start_as_current_span; yields a no-op span.
        return MockSpan()
50
+
51
+
52
class MockCounter:
    """Counter stand-in that ignores every measurement."""

    def add(self, amount, attributes=None):
        """Discard the recorded amount."""
55
+
56
+
57
class MockHistogram:
    """Histogram stand-in that ignores every measurement."""

    def record(self, amount, attributes=None):
        """Discard the recorded amount."""
60
+
61
+
62
+ class MockMeter:
63
+ def create_counter(self, name, unit="", description=""):
64
+ return MockCounter()
65
+
66
+ def create_histogram(self, name, unit="", description=""):
67
+ return MockHistogram()
68
+
69
+
70
+ # --- Public API ---
71
+
72
+
73
def get_tracer(name: str):
    """Return the real OpenTelemetry tracer when available, else a no-op one."""
    return trace.get_tracer(name) if AVAILABLE else MockTracer()
78
+
79
+
80
def get_meter(name: str):
    """Return the real OpenTelemetry meter when available, else a no-op one."""
    return metrics.get_meter(name) if AVAILABLE else MockMeter()
85
+
86
+
87
def setup_telemetry(service_name: str = "odibi"):
    """Configure OpenTelemetry if available and configured.

    Checks OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
    If set, configures OTLP exporter.

    Args:
        service_name: Value used for the ``service.name`` resource attribute.
    """
    if not AVAILABLE:
        return

    # Check for OTLP endpoint
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
    if not endpoint:
        return

    try:
        # SDK pieces are imported lazily: the API package can be installed
        # without the SDK/exporter extras being present.
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        # Initialize Provider
        resource = Resource.create(attributes={"service.name": service_name})
        provider = TracerProvider(resource=resource)

        # OTLP Exporter
        exporter = OTLPSpanExporter(endpoint=endpoint)
        processor = BatchSpanProcessor(exporter)
        provider.add_span_processor(processor)

        # Set Global
        trace.set_tracer_provider(provider)

    except ImportError:
        # OTLP exporter might not be installed
        pass
    except Exception as e:
        # Telemetry is best-effort: never let setup failures break the app.
        print(f"Warning: Failed to initialize OpenTelemetry: {e}", file=sys.stderr)
124
+
125
+
126
# --- Global Instances ---

# Module-level tracer/meter; these resolve to the no-op mocks when
# OpenTelemetry is not installed, so importing this module is always safe.
tracer = get_tracer("odibi")
meter = get_meter("odibi")

# Metrics
# Count of pipeline nodes executed across the process lifetime.
nodes_executed = meter.create_counter(
    "odibi.nodes_executed", description="Number of nodes executed"
)

# Running total of rows processed by all nodes.
rows_processed = meter.create_counter("odibi.rows_processed", description="Total rows processed")

# Distribution of per-node execution time, in seconds.
node_duration = meter.create_histogram(
    "odibi.node_duration", unit="s", description="Duration of node execution"
)
+ )
@@ -0,0 +1,62 @@
1
+ """
2
+ Quality Enforcement and Validation
3
+ ===================================
4
+
5
+ This module enforces Odibi's quality standards through automated validation.
6
+
7
+ Features:
8
+ - Explanation linting: Ensure transformations are documented
9
+ - Quality scoring: Detect generic/lazy documentation
10
+ - Schema validation: Verify config structure
11
+ - Pre-run validation: Catch errors before execution
12
+ - Quarantine tables: Route failed rows to dedicated tables
13
+ - Quality gates: Batch-level validation thresholds
14
+ - FK validation: Referential integrity checks for star schemas
15
+
16
+ Principle: Enforce excellence, don't hope for it.
17
+ """
18
+
19
+ from .engine import Validator
20
+ from .explanation_linter import ExplanationLinter, LintIssue
21
+ from .fk import (
22
+ FKValidationReport,
23
+ FKValidationResult,
24
+ FKValidator,
25
+ OrphanRecord,
26
+ RelationshipConfig,
27
+ RelationshipRegistry,
28
+ get_orphan_records,
29
+ parse_relationships_config,
30
+ validate_fk_on_load,
31
+ )
32
+ from .gate import GateResult, evaluate_gate
33
+ from .quarantine import (
34
+ QuarantineResult,
35
+ add_quarantine_metadata,
36
+ has_quarantine_tests,
37
+ split_valid_invalid,
38
+ write_quarantine,
39
+ )
40
+
41
# Public names re-exported by ``odibi.validation`` (see imports above).
__all__ = [
    # Explanation linting
    "ExplanationLinter",
    "LintIssue",
    # Config/pre-run validation
    "Validator",
    # Quality gates
    "GateResult",
    "evaluate_gate",
    # Quarantine handling
    "QuarantineResult",
    "add_quarantine_metadata",
    "has_quarantine_tests",
    "split_valid_invalid",
    "write_quarantine",
    # Foreign-key / referential integrity
    "FKValidator",
    "FKValidationResult",
    "FKValidationReport",
    "OrphanRecord",
    "RelationshipConfig",
    "RelationshipRegistry",
    "get_orphan_records",
    "validate_fk_on_load",
    "parse_relationships_config",
]
# NOTE(review): subpackage version differs from the 2.5.0 distribution
# version — confirm whether this is meant to track the package release.
__version__ = "1.3.0-alpha.1"