odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,740 @@
"""Enhanced logging context for structured observability.

This module provides a context-based logging system that captures:
- Pipeline and node context
- Operation timing
- Row counts and schema changes
- Engine-specific metrics

Design: Composition over inheritance - LoggingContext wraps the base logger.
"""

import codecs
import json
import logging
import sys
import time
import traceback
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

try:
    from rich.console import Console
    from rich.logging import RichHandler

    RICH_AVAILABLE = True
except ImportError:
    RICH_AVAILABLE = False

class OperationType(str, Enum):
    """Types of operations for logging categorization."""

    READ = "read"
    WRITE = "write"
    TRANSFORM = "transform"
    VALIDATE = "validate"
    RESOLVE = "resolve"
    CONNECT = "connect"
    GRAPH = "graph"
    CONFIG = "config"
    EXECUTE = "execute"
    PATTERN = "pattern"


@dataclass
class OperationMetrics:
    """Metrics captured during an operation."""

    start_time: float = field(default_factory=time.time)
    end_time: Optional[float] = None
    rows_in: Optional[int] = None
    rows_out: Optional[int] = None
    schema_before: Optional[Dict[str, str]] = None
    schema_after: Optional[Dict[str, str]] = None
    partition_count: Optional[int] = None
    memory_bytes: Optional[int] = None
    extra: Dict[str, Any] = field(default_factory=dict)

    @property
    def elapsed_ms(self) -> Optional[float]:
        """Get elapsed time in milliseconds."""
        if self.end_time is None:
            return None
        return (self.end_time - self.start_time) * 1000

    @property
    def row_delta(self) -> Optional[int]:
        """Get row count change."""
        if self.rows_in is not None and self.rows_out is not None:
            return self.rows_out - self.rows_in
        return None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for logging."""
        result = {}
        if self.elapsed_ms is not None:
            result["elapsed_ms"] = round(self.elapsed_ms, 2)
        if self.rows_in is not None:
            result["rows_in"] = self.rows_in
        if self.rows_out is not None:
            result["rows_out"] = self.rows_out
        if self.row_delta is not None:
            result["row_delta"] = self.row_delta
        if self.schema_before:
            result["columns_before"] = len(self.schema_before)
        if self.schema_after:
            result["columns_after"] = len(self.schema_after)
        if self.partition_count is not None:
            result["partitions"] = self.partition_count
        if self.memory_bytes is not None:
            result["memory_mb"] = round(self.memory_bytes / (1024 * 1024), 2)
        result.update(self.extra)
        return result

class StructuredLogger:
    """Logger that supports both human-readable and JSON output with secret redaction."""

    def __init__(self, structured: bool = False, level: str = "INFO"):
        self.structured = structured
        self.level = getattr(logging, level.upper(), logging.INFO)
        self._secrets: set = set()

        if (
            sys.platform == "win32"
            and sys.stdout
            and sys.stdout.encoding
            and sys.stdout.encoding.lower() != "utf-8"
        ):
            try:
                sys.stdout.reconfigure(encoding="utf-8")
            except AttributeError:
                sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())

        if not self.structured and RICH_AVAILABLE:
            logging.basicConfig(
                level=self.level,
                format="%(message)s",
                datefmt="[%X]",
                handlers=[
                    RichHandler(
                        rich_tracebacks=True,
                        markup=True,
                        show_path=False,
                        console=(
                            Console(force_terminal=True, legacy_windows=False)
                            if sys.platform == "win32"
                            else None
                        ),
                    )
                ],
            )
        else:
            logging.basicConfig(level=self.level, format="%(message)s", stream=sys.stdout)

        self.logger = logging.getLogger("odibi")
        self.logger.setLevel(self.level)

        third_party_level = max(self.level, logging.WARNING)
        for logger_name in [
            "py4j",
            "azure",
            "azure.core.pipeline.policies.http_logging_policy",
            "adlfs",
            "urllib3",
            "fsspec",
        ]:
            logging.getLogger(logger_name).setLevel(third_party_level)

    def register_secret(self, secret: str) -> None:
        """Register a secret string to be redacted from logs."""
        if secret and isinstance(secret, str) and len(secret.strip()) > 0:
            self._secrets.add(secret)

    def _redact(self, text: str) -> str:
        """Redact registered secrets from text."""
        if not text or not self._secrets:
            return text

        for secret in self._secrets:
            if secret in text:
                text = text.replace(secret, "[REDACTED]")
        return text

    def info(self, message: str, **kwargs) -> None:
        self._log("INFO", message, **kwargs)

    def warning(self, message: str, **kwargs) -> None:
        self._log("WARNING", message, **kwargs)

    def error(self, message: str, **kwargs) -> None:
        self._log("ERROR", message, **kwargs)

    def debug(self, message: str, **kwargs) -> None:
        self._log("DEBUG", message, **kwargs)

    def _log(self, level: str, message: str, **kwargs) -> None:
        level_val = getattr(logging, level, logging.INFO)
        if level_val < self.level:
            return

        message = self._redact(str(message))

        redacted_kwargs = {}
        for k, v in kwargs.items():
            if isinstance(v, str):
                redacted_kwargs[k] = self._redact(v)
            else:
                redacted_kwargs[k] = v

        if self.structured:
            log_entry = {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "level": level,
                "message": message,
                **redacted_kwargs,
            }
            print(json.dumps(log_entry))
        else:
            context_str = ""
            if redacted_kwargs:
                context_items = [f"{k}={v}" for k, v in redacted_kwargs.items()]
                context_str = f" ({', '.join(context_items)})"

            formatted_msg = f"{message}{context_str}"

            if level == "INFO":
                self.logger.info(formatted_msg)
            elif level == "WARNING":
                self.logger.warning(f"[WARN] {formatted_msg}")
            elif level == "ERROR":
                self.logger.error(f"[ERROR] {formatted_msg}")
            elif level == "DEBUG":
                self.logger.debug(f"[DEBUG] {formatted_msg}")

class LoggingContext:
    """Context-aware logging wrapper for pipeline operations.

    Provides structured logging with automatic context injection and timing.
    Uses composition pattern - wraps a StructuredLogger instance.

    Example:
        >>> with LoggingContext(pipeline_id="etl_daily", node_id="load_users") as ctx:
        ...     ctx.log_operation_start(OperationType.READ, file="users.csv")
        ...     # ... perform read ...
        ...     ctx.log_operation_end(rows=1000)
    """

    def __init__(
        self,
        logger: Optional[StructuredLogger] = None,
        pipeline_id: Optional[str] = None,
        node_id: Optional[str] = None,
        engine: Optional[str] = None,
    ):
        """Initialize logging context.

        Args:
            logger: StructuredLogger instance (uses global if None)
            pipeline_id: Pipeline identifier for correlation
            node_id: Current node identifier
            engine: Engine type (pandas/spark/polars)
        """
        self._logger = logger
        self.pipeline_id = pipeline_id
        self.node_id = node_id
        self.engine = engine
        self._operation_stack: List[tuple] = []
        self._current_metrics: Optional[OperationMetrics] = None

    @property
    def logger(self) -> StructuredLogger:
        """Get the underlying logger."""
        if self._logger is None:
            from odibi.utils.logging import logger as global_logger

            return global_logger
        return self._logger

    def _base_context(self) -> Dict[str, Any]:
        """Build base context dict for all log entries."""
        ctx = {"timestamp": datetime.now(timezone.utc).isoformat()}
        if self.pipeline_id:
            ctx["pipeline_id"] = self.pipeline_id
        if self.node_id:
            ctx["node_id"] = self.node_id
        if self.engine:
            ctx["engine"] = self.engine
        return ctx

    def with_context(
        self,
        pipeline_id: Optional[str] = None,
        node_id: Optional[str] = None,
        engine: Optional[str] = None,
    ) -> "LoggingContext":
        """Create a new LoggingContext with updated context."""
        return LoggingContext(
            logger=self._logger,
            pipeline_id=pipeline_id or self.pipeline_id,
            node_id=node_id or self.node_id,
            engine=engine or self.engine,
        )

    def __enter__(self) -> "LoggingContext":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        if exc_type is not None:
            self.log_exception(exc_val, operation="context_exit")

    def info(self, message: str, **kwargs) -> None:
        """Log info message with context."""
        self.logger.info(message, **{**self._base_context(), **kwargs})

    def warning(self, message: str, **kwargs) -> None:
        """Log warning message with context."""
        self.logger.warning(message, **{**self._base_context(), **kwargs})

    def error(self, message: str, **kwargs) -> None:
        """Log error message with context."""
        self.logger.error(message, **{**self._base_context(), **kwargs})

    def debug(self, message: str, **kwargs) -> None:
        """Log debug message with context."""
        self.logger.debug(message, **{**self._base_context(), **kwargs})

    @contextmanager
    def operation(
        self,
        op_type: OperationType,
        description: str = "",
        **initial_context,
    ):
        """Context manager for timed operations with automatic logging.

        Args:
            op_type: Type of operation
            description: Human-readable description
            **initial_context: Additional context to include

        Yields:
            OperationMetrics object to populate with results

        Example:
            >>> with ctx.operation(OperationType.TRANSFORM, "apply_filter") as metrics:
            ...     metrics.rows_in = len(df)
            ...     result = df.filter(...)
            ...     metrics.rows_out = len(result)
        """
        metrics = OperationMetrics()
        self._current_metrics = metrics
        self._operation_stack.append((op_type, description, time.time()))

        self.debug(
            f"Starting {op_type.value}: {description}",
            operation=op_type.value,
            **initial_context,
        )

        try:
            yield metrics
            metrics.end_time = time.time()

            log_data = {
                "operation": op_type.value,
                **metrics.to_dict(),
                **initial_context,
            }

            self.info(f"Completed {op_type.value}: {description}", **log_data)

        except Exception as e:
            metrics.end_time = time.time()
            self.log_exception(
                e,
                operation=op_type.value,
                description=description,
                elapsed_ms=metrics.elapsed_ms,
            )
            raise
        finally:
            self._operation_stack.pop()
            self._current_metrics = None

    def log_operation_start(
        self,
        op_type: OperationType,
        description: str = "",
        **kwargs,
    ) -> OperationMetrics:
        """Log operation start and return metrics tracker.

        Args:
            op_type: Type of operation
            description: Operation description
            **kwargs: Additional context

        Returns:
            OperationMetrics to track operation details
        """
        metrics = OperationMetrics()
        self._current_metrics = metrics
        self._operation_stack.append((op_type, description, time.time()))

        self.debug(
            f"Starting {op_type.value}: {description}",
            operation=op_type.value,
            **kwargs,
        )

        return metrics

    def log_operation_end(
        self,
        metrics: Optional[OperationMetrics] = None,
        success: bool = True,
        **kwargs,
    ) -> None:
        """Log operation completion.

        Args:
            metrics: Metrics from log_operation_start (uses current if None)
            success: Whether operation succeeded
            **kwargs: Additional context
        """
        if metrics is None:
            metrics = self._current_metrics

        if metrics is not None:
            metrics.end_time = time.time()

        if self._operation_stack:
            op_type, description, _ = self._operation_stack.pop()
        else:
            op_type, description = OperationType.EXECUTE, "unknown"

        log_data = {"operation": op_type.value, "success": success, **kwargs}

        if metrics is not None:
            log_data.update(metrics.to_dict())

        if success:
            self.info(f"Completed {op_type.value}: {description}", **log_data)
        else:
            self.warning(f"Failed {op_type.value}: {description}", **log_data)

        self._current_metrics = None

    def log_exception(
        self,
        exception: Exception,
        operation: Optional[str] = None,
        include_traceback: bool = False,
        **kwargs,
    ) -> None:
        """Log exception with context.

        Args:
            exception: The exception to log
            operation: Operation that failed
            include_traceback: Whether to include full traceback
            **kwargs: Additional context
        """
        error_data = {
            "error_type": type(exception).__name__,
            "error_message": str(exception),
            **kwargs,
        }

        if operation:
            error_data["operation"] = operation

        if include_traceback:
            error_data["traceback"] = traceback.format_exc()

        self.error(f"Exception: {type(exception).__name__}: {exception}", **error_data)

    def log_schema_change(
        self,
        before: Dict[str, str],
        after: Dict[str, str],
        operation: str = "transform",
    ) -> None:
        """Log schema changes between transformations.

        Args:
            before: Schema before transformation
            after: Schema after transformation
            operation: Name of the transformation
        """
        cols_before = set(before.keys())
        cols_after = set(after.keys())

        added = cols_after - cols_before
        removed = cols_before - cols_after

        type_changes = {}
        for col in cols_before & cols_after:
            if before[col] != after[col]:
                type_changes[col] = f"{before[col]} -> {after[col]}"

        self.debug(
            f"Schema change in {operation}",
            operation=operation,
            columns_before=len(cols_before),
            columns_after=len(cols_after),
            columns_added=list(added) if added else None,
            columns_removed=list(removed) if removed else None,
            type_changes=type_changes if type_changes else None,
        )

    def log_row_count_change(
        self,
        before: int,
        after: int,
        operation: str = "transform",
    ) -> None:
        """Log row count changes.

        Args:
            before: Row count before
            after: Row count after
            operation: Name of the transformation
        """
        delta = after - before
        pct_change = ((after - before) / before * 100) if before > 0 else 0

        msg = (
            f"Row count change in {operation}: {before} -> {after} ({delta:+d}, {pct_change:+.1f}%)"
        )
        self.debug(msg, operation=operation, rows_before=before, rows_after=after)

    def log_spark_metrics(
        self,
        partition_count: Optional[int] = None,
        shuffle_partitions: Optional[int] = None,
        broadcast_size_mb: Optional[float] = None,
        cached: bool = False,
        **kwargs,
    ) -> None:
        """Log Spark-specific metrics.

        Args:
            partition_count: Number of partitions
            shuffle_partitions: Shuffle partition count
            broadcast_size_mb: Broadcast variable size
            cached: Whether data is cached
            **kwargs: Additional metrics
        """
        metrics = {}
        if partition_count is not None:
            metrics["partitions"] = partition_count
        if shuffle_partitions is not None:
            metrics["shuffle_partitions"] = shuffle_partitions
        if broadcast_size_mb is not None:
            metrics["broadcast_size_mb"] = broadcast_size_mb
        if cached:
            metrics["cached"] = cached
        metrics.update(kwargs)

        if metrics:
            self.debug("Spark metrics", **metrics)

    def log_pandas_metrics(
        self,
        memory_mb: Optional[float] = None,
        dtypes: Optional[Dict[str, str]] = None,
        chunked: bool = False,
        chunk_size: Optional[int] = None,
        **kwargs,
    ) -> None:
        """Log Pandas-specific metrics.

        Args:
            memory_mb: Memory footprint in MB
            dtypes: Column dtypes
            chunked: Whether using chunked processing
            chunk_size: Chunk size if chunked
            **kwargs: Additional metrics
        """
        metrics = {}
        if memory_mb is not None:
            metrics["memory_mb"] = round(memory_mb, 2)
            if memory_mb > 1000:
                self.warning(
                    f"High memory usage: {memory_mb:.2f} MB",
                    memory_mb=round(memory_mb, 2),
                )
        if dtypes:
            metrics["dtype_count"] = len(dtypes)
        if chunked:
            metrics["chunked"] = True
            if chunk_size:
                metrics["chunk_size"] = chunk_size
        metrics.update(kwargs)

        if metrics:
            self.debug("Pandas metrics", **metrics)

    def log_validation_result(
        self,
        passed: bool,
        rule_name: str,
        failures: Optional[List[str]] = None,
        **kwargs,
    ) -> None:
        """Log validation result.

        Args:
            passed: Whether validation passed
            rule_name: Name of validation rule
            failures: List of failure messages
            **kwargs: Additional context
        """
        if passed:
            self.debug(f"Validation passed: {rule_name}", rule=rule_name, passed=True, **kwargs)
        else:
            self.warning(
                f"Validation failed: {rule_name}",
                rule=rule_name,
                passed=False,
                failures=failures,
                **kwargs,
            )

    def log_connection(
        self,
        connection_type: str,
        connection_name: str,
        action: str = "connect",
        **kwargs,
    ) -> None:
        """Log connection activity.

        Args:
            connection_type: Type of connection (azure_blob, sql_server, etc.)
            connection_name: Name of the connection
            action: Action being performed
            **kwargs: Additional context (excluding secrets)
        """
        self.debug(
            f"Connection {action}: {connection_name}",
            connection_type=connection_type,
            connection_name=connection_name,
            action=action,
            **kwargs,
        )

    def log_file_io(
        self,
        path: str,
        format: str,
        mode: str,
        rows: Optional[int] = None,
        size_mb: Optional[float] = None,
        partitions: Optional[List[str]] = None,
        **kwargs,
    ) -> None:
        """Log file I/O operations.

        Args:
            path: File path
            format: File format (csv, parquet, delta, etc.)
            mode: I/O mode (read, write, append, overwrite)
            rows: Row count
            size_mb: File size in MB
            partitions: Partition columns
            **kwargs: Additional context
        """
        log_data = {
            "path": path,
            "format": format,
            "mode": mode,
        }
        if rows is not None:
            log_data["rows"] = rows
        if size_mb is not None:
            log_data["size_mb"] = round(size_mb, 2)
        if partitions:
            log_data["partitions"] = partitions
        log_data.update(kwargs)

        self.info(f"File I/O: {mode} {format} at {path}", **log_data)

    def log_graph_operation(
        self,
        operation: str,
        node_count: Optional[int] = None,
        edge_count: Optional[int] = None,
        layer_count: Optional[int] = None,
        **kwargs,
    ) -> None:
        """Log dependency graph operations.

        Args:
            operation: Graph operation (load, resolve, validate, etc.)
            node_count: Number of nodes
            edge_count: Number of edges/dependencies
            layer_count: Number of execution layers
            **kwargs: Additional context
        """
        log_data = {"operation": operation}
        if node_count is not None:
            log_data["nodes"] = node_count
        if edge_count is not None:
            log_data["edges"] = edge_count
        if layer_count is not None:
            log_data["layers"] = layer_count
        log_data.update(kwargs)

        self.debug(f"Graph {operation}", **log_data)

_global_context: Optional[LoggingContext] = None


def get_logging_context() -> LoggingContext:
    """Get the global logging context."""
    global _global_context
    if _global_context is None:
        from odibi.utils.logging import logger

        _global_context = LoggingContext(logger=logger)
    return _global_context


def set_logging_context(context: LoggingContext) -> None:
    """Set the global logging context."""
    global _global_context
    _global_context = context


def create_logging_context(
    pipeline_id: Optional[str] = None,
    node_id: Optional[str] = None,
    engine: Optional[str] = None,
) -> LoggingContext:
    """Create a new logging context with the specified parameters.

    Args:
        pipeline_id: Pipeline identifier
        node_id: Node identifier
        engine: Engine type

    Returns:
        New LoggingContext instance
    """
    from odibi.utils.logging import logger

    return LoggingContext(
        logger=logger,
        pipeline_id=pipeline_id,
        node_id=node_id,
        engine=engine,
    )
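
For orientation, a minimal usage sketch of the odibi.utils.logging_context module shown above, assuming odibi 2.5.0 is installed. The pipeline, node, secret, and operation names below are illustrative, not part of the package; the call signatures follow the source in this diff.

from odibi.utils.logging_context import (
    LoggingContext,
    OperationType,
    StructuredLogger,
)

# Standalone StructuredLogger in JSON mode; registered secrets are replaced
# with [REDACTED] in both the message and any string kwargs.
logger = StructuredLogger(structured=True, level="DEBUG")
logger.register_secret("s3cr3t-token")  # illustrative secret value
logger.info("Connected to warehouse", token="s3cr3t-token")  # token is logged as [REDACTED]

# Context-aware wrapper: pipeline_id/node_id/engine are injected into every entry.
ctx = LoggingContext(
    logger=logger,
    pipeline_id="etl_daily",
    node_id="load_users",
    engine="pandas",
)

# Timed operation via the operation() context manager; elapsed_ms, rows_in,
# rows_out and row_delta are emitted automatically when the block completes.
rows = list(range(1000))
with ctx.operation(OperationType.TRANSFORM, "drop_odd_rows") as metrics:
    metrics.rows_in = len(rows)
    rows = [r for r in rows if r % 2 == 0]
    metrics.rows_out = len(rows)

# Manual start/end style, as in the LoggingContext docstring example.
metrics = ctx.log_operation_start(OperationType.READ, "read users.csv", file="users.csv")
metrics.rows_out = 1000
ctx.log_operation_end(metrics, success=True)

With structured=True each entry is printed as a single JSON object on stdout; in the default mode the same calls render through RichHandler when the optional rich dependency is installed, or plain logging otherwise.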