odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/utils/logging_context.py
@@ -0,0 +1,740 @@
+"""Enhanced logging context for structured observability.
+
+This module provides a context-based logging system that captures:
+- Pipeline and node context
+- Operation timing
+- Row counts and schema changes
+- Engine-specific metrics
+
+Design: Composition over inheritance - LoggingContext wraps the base logger.
+"""
+
+import codecs
+import json
+import logging
+import sys
+import time
+import traceback
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+try:
+    from rich.console import Console
+    from rich.logging import RichHandler
+
+    RICH_AVAILABLE = True
+except ImportError:
+    RICH_AVAILABLE = False
+
+
+class OperationType(str, Enum):
+    """Types of operations for logging categorization."""
+
+    READ = "read"
+    WRITE = "write"
+    TRANSFORM = "transform"
+    VALIDATE = "validate"
+    RESOLVE = "resolve"
+    CONNECT = "connect"
+    GRAPH = "graph"
+    CONFIG = "config"
+    EXECUTE = "execute"
+    PATTERN = "pattern"
+
+
+@dataclass
+class OperationMetrics:
+    """Metrics captured during an operation."""
+
+    start_time: float = field(default_factory=time.time)
+    end_time: Optional[float] = None
+    rows_in: Optional[int] = None
+    rows_out: Optional[int] = None
+    schema_before: Optional[Dict[str, str]] = None
+    schema_after: Optional[Dict[str, str]] = None
+    partition_count: Optional[int] = None
+    memory_bytes: Optional[int] = None
+    extra: Dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def elapsed_ms(self) -> Optional[float]:
+        """Get elapsed time in milliseconds."""
+        if self.end_time is None:
+            return None
+        return (self.end_time - self.start_time) * 1000
+
+    @property
+    def row_delta(self) -> Optional[int]:
+        """Get row count change."""
+        if self.rows_in is not None and self.rows_out is not None:
+            return self.rows_out - self.rows_in
+        return None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for logging."""
+        result = {}
+        if self.elapsed_ms is not None:
+            result["elapsed_ms"] = round(self.elapsed_ms, 2)
+        if self.rows_in is not None:
+            result["rows_in"] = self.rows_in
+        if self.rows_out is not None:
+            result["rows_out"] = self.rows_out
+        if self.row_delta is not None:
+            result["row_delta"] = self.row_delta
+        if self.schema_before:
+            result["columns_before"] = len(self.schema_before)
+        if self.schema_after:
+            result["columns_after"] = len(self.schema_after)
+        if self.partition_count is not None:
+            result["partitions"] = self.partition_count
+        if self.memory_bytes is not None:
+            result["memory_mb"] = round(self.memory_bytes / (1024 * 1024), 2)
+        result.update(self.extra)
+        return result
+
+
+class StructuredLogger:
+    """Logger that supports both human-readable and JSON output with secret redaction."""
+
+    def __init__(self, structured: bool = False, level: str = "INFO"):
+        self.structured = structured
+        self.level = getattr(logging, level.upper(), logging.INFO)
+        self._secrets: set = set()
+
+        if (
+            sys.platform == "win32"
+            and sys.stdout
+            and sys.stdout.encoding
+            and sys.stdout.encoding.lower() != "utf-8"
+        ):
+            try:
+                sys.stdout.reconfigure(encoding="utf-8")
+            except AttributeError:
+                sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+
+        if not self.structured and RICH_AVAILABLE:
+            logging.basicConfig(
+                level=self.level,
+                format="%(message)s",
+                datefmt="[%X]",
+                handlers=[
+                    RichHandler(
+                        rich_tracebacks=True,
+                        markup=True,
+                        show_path=False,
+                        console=(
+                            Console(force_terminal=True, legacy_windows=False)
+                            if sys.platform == "win32"
+                            else None
+                        ),
+                    )
+                ],
+            )
+        else:
+            logging.basicConfig(level=self.level, format="%(message)s", stream=sys.stdout)
+
+        self.logger = logging.getLogger("odibi")
+        self.logger.setLevel(self.level)
+
+        third_party_level = max(self.level, logging.WARNING)
+        for logger_name in [
+            "py4j",
+            "azure",
+            "azure.core.pipeline.policies.http_logging_policy",
+            "adlfs",
+            "urllib3",
+            "fsspec",
+        ]:
+            logging.getLogger(logger_name).setLevel(third_party_level)
+
+    def register_secret(self, secret: str) -> None:
+        """Register a secret string to be redacted from logs."""
+        if secret and isinstance(secret, str) and len(secret.strip()) > 0:
+            self._secrets.add(secret)
+
+    def _redact(self, text: str) -> str:
+        """Redact registered secrets from text."""
+        if not text or not self._secrets:
+            return text
+
+        for secret in self._secrets:
+            if secret in text:
+                text = text.replace(secret, "[REDACTED]")
+        return text
+
+    def info(self, message: str, **kwargs) -> None:
+        self._log("INFO", message, **kwargs)
+
+    def warning(self, message: str, **kwargs) -> None:
+        self._log("WARNING", message, **kwargs)
+
+    def error(self, message: str, **kwargs) -> None:
+        self._log("ERROR", message, **kwargs)
+
+    def debug(self, message: str, **kwargs) -> None:
+        self._log("DEBUG", message, **kwargs)
+
+    def _log(self, level: str, message: str, **kwargs) -> None:
+        level_val = getattr(logging, level, logging.INFO)
+        if level_val < self.level:
+            return
+
+        message = self._redact(str(message))
+
+        redacted_kwargs = {}
+        for k, v in kwargs.items():
+            if isinstance(v, str):
+                redacted_kwargs[k] = self._redact(v)
+            else:
+                redacted_kwargs[k] = v
+
+        if self.structured:
+            log_entry = {
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "level": level,
+                "message": message,
+                **redacted_kwargs,
+            }
+            print(json.dumps(log_entry))
+        else:
+            context_str = ""
+            if redacted_kwargs:
+                context_items = [f"{k}={v}" for k, v in redacted_kwargs.items()]
+                context_str = f" ({', '.join(context_items)})"
+
+            formatted_msg = f"{message}{context_str}"
+
+            if level == "INFO":
+                self.logger.info(formatted_msg)
+            elif level == "WARNING":
+                self.logger.warning(f"[WARN] {formatted_msg}")
+            elif level == "ERROR":
+                self.logger.error(f"[ERROR] {formatted_msg}")
+            elif level == "DEBUG":
+                self.logger.debug(f"[DEBUG] {formatted_msg}")
+
+
+class LoggingContext:
+    """Context-aware logging wrapper for pipeline operations.
+
+    Provides structured logging with automatic context injection and timing.
+    Uses composition pattern - wraps a StructuredLogger instance.
+
+    Example:
+        >>> with LoggingContext(pipeline_id="etl_daily", node_id="load_users") as ctx:
+        ...     ctx.log_operation_start(OperationType.READ, file="users.csv")
+        ...     # ... perform read ...
+        ...     ctx.log_operation_end(rows=1000)
+    """
+
+    def __init__(
+        self,
+        logger: Optional[StructuredLogger] = None,
+        pipeline_id: Optional[str] = None,
+        node_id: Optional[str] = None,
+        engine: Optional[str] = None,
+    ):
+        """Initialize logging context.
+
+        Args:
+            logger: StructuredLogger instance (uses global if None)
+            pipeline_id: Pipeline identifier for correlation
+            node_id: Current node identifier
+            engine: Engine type (pandas/spark/polars)
+        """
+        self._logger = logger
+        self.pipeline_id = pipeline_id
+        self.node_id = node_id
+        self.engine = engine
+        self._operation_stack: List[tuple] = []
+        self._current_metrics: Optional[OperationMetrics] = None
+
+    @property
+    def logger(self) -> StructuredLogger:
+        """Get the underlying logger."""
+        if self._logger is None:
+            from odibi.utils.logging import logger as global_logger
+
+            return global_logger
+        return self._logger
+
+    def _base_context(self) -> Dict[str, Any]:
+        """Build base context dict for all log entries."""
+        ctx = {"timestamp": datetime.now(timezone.utc).isoformat()}
+        if self.pipeline_id:
+            ctx["pipeline_id"] = self.pipeline_id
+        if self.node_id:
+            ctx["node_id"] = self.node_id
+        if self.engine:
+            ctx["engine"] = self.engine
+        return ctx
+
+    def with_context(
+        self,
+        pipeline_id: Optional[str] = None,
+        node_id: Optional[str] = None,
+        engine: Optional[str] = None,
+    ) -> "LoggingContext":
+        """Create a new LoggingContext with updated context."""
+        return LoggingContext(
+            logger=self._logger,
+            pipeline_id=pipeline_id or self.pipeline_id,
+            node_id=node_id or self.node_id,
+            engine=engine or self.engine,
+        )
+
+    def __enter__(self) -> "LoggingContext":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if exc_type is not None:
+            self.log_exception(exc_val, operation="context_exit")
+
+    def info(self, message: str, **kwargs) -> None:
+        """Log info message with context."""
+        self.logger.info(message, **{**self._base_context(), **kwargs})
+
+    def warning(self, message: str, **kwargs) -> None:
+        """Log warning message with context."""
+        self.logger.warning(message, **{**self._base_context(), **kwargs})
+
+    def error(self, message: str, **kwargs) -> None:
+        """Log error message with context."""
+        self.logger.error(message, **{**self._base_context(), **kwargs})
+
+    def debug(self, message: str, **kwargs) -> None:
+        """Log debug message with context."""
+        self.logger.debug(message, **{**self._base_context(), **kwargs})
+
+    @contextmanager
+    def operation(
+        self,
+        op_type: OperationType,
+        description: str = "",
+        **initial_context,
+    ):
+        """Context manager for timed operations with automatic logging.
+
+        Args:
+            op_type: Type of operation
+            description: Human-readable description
+            **initial_context: Additional context to include
+
+        Yields:
+            OperationMetrics object to populate with results
+
+        Example:
+            >>> with ctx.operation(OperationType.TRANSFORM, "apply_filter") as metrics:
+            ...     metrics.rows_in = len(df)
+            ...     result = df.filter(...)
+            ...     metrics.rows_out = len(result)
+        """
+        metrics = OperationMetrics()
+        self._current_metrics = metrics
+        self._operation_stack.append((op_type, description, time.time()))
+
+        self.debug(
+            f"Starting {op_type.value}: {description}",
+            operation=op_type.value,
+            **initial_context,
+        )
+
+        try:
+            yield metrics
+            metrics.end_time = time.time()
+
+            log_data = {
+                "operation": op_type.value,
+                **metrics.to_dict(),
+                **initial_context,
+            }
+
+            self.info(f"Completed {op_type.value}: {description}", **log_data)
+
+        except Exception as e:
+            metrics.end_time = time.time()
+            self.log_exception(
+                e,
+                operation=op_type.value,
+                description=description,
+                elapsed_ms=metrics.elapsed_ms,
+            )
+            raise
+        finally:
+            self._operation_stack.pop()
+            self._current_metrics = None
+
+    def log_operation_start(
+        self,
+        op_type: OperationType,
+        description: str = "",
+        **kwargs,
+    ) -> OperationMetrics:
+        """Log operation start and return metrics tracker.
+
+        Args:
+            op_type: Type of operation
+            description: Operation description
+            **kwargs: Additional context
+
+        Returns:
+            OperationMetrics to track operation details
+        """
+        metrics = OperationMetrics()
+        self._current_metrics = metrics
+        self._operation_stack.append((op_type, description, time.time()))
+
+        self.debug(
+            f"Starting {op_type.value}: {description}",
+            operation=op_type.value,
+            **kwargs,
+        )
+
+        return metrics
+
+    def log_operation_end(
+        self,
+        metrics: Optional[OperationMetrics] = None,
+        success: bool = True,
+        **kwargs,
+    ) -> None:
+        """Log operation completion.
+
+        Args:
+            metrics: Metrics from log_operation_start (uses current if None)
+            success: Whether operation succeeded
+            **kwargs: Additional context
+        """
+        if metrics is None:
+            metrics = self._current_metrics
+
+        if metrics is not None:
+            metrics.end_time = time.time()
+
+        if self._operation_stack:
+            op_type, description, _ = self._operation_stack.pop()
+        else:
+            op_type, description = OperationType.EXECUTE, "unknown"
+
+        log_data = {"operation": op_type.value, "success": success, **kwargs}
+
+        if metrics is not None:
+            log_data.update(metrics.to_dict())
+
+        if success:
+            self.info(f"Completed {op_type.value}: {description}", **log_data)
+        else:
+            self.warning(f"Failed {op_type.value}: {description}", **log_data)
+
+        self._current_metrics = None
+
+    def log_exception(
+        self,
+        exception: Exception,
+        operation: Optional[str] = None,
+        include_traceback: bool = False,
+        **kwargs,
+    ) -> None:
+        """Log exception with context.
+
+        Args:
+            exception: The exception to log
+            operation: Operation that failed
+            include_traceback: Whether to include full traceback
+            **kwargs: Additional context
+        """
+        error_data = {
+            "error_type": type(exception).__name__,
+            "error_message": str(exception),
+            **kwargs,
+        }
+
+        if operation:
+            error_data["operation"] = operation
+
+        if include_traceback:
+            error_data["traceback"] = traceback.format_exc()
+
+        self.error(f"Exception: {type(exception).__name__}: {exception}", **error_data)
+
+    def log_schema_change(
+        self,
+        before: Dict[str, str],
+        after: Dict[str, str],
+        operation: str = "transform",
+    ) -> None:
+        """Log schema changes between transformations.
+
+        Args:
+            before: Schema before transformation
+            after: Schema after transformation
+            operation: Name of the transformation
+        """
+        cols_before = set(before.keys())
+        cols_after = set(after.keys())
+
+        added = cols_after - cols_before
+        removed = cols_before - cols_after
+
+        type_changes = {}
+        for col in cols_before & cols_after:
+            if before[col] != after[col]:
+                type_changes[col] = f"{before[col]} -> {after[col]}"
+
+        self.debug(
+            f"Schema change in {operation}",
+            operation=operation,
+            columns_before=len(cols_before),
+            columns_after=len(cols_after),
+            columns_added=list(added) if added else None,
+            columns_removed=list(removed) if removed else None,
+            type_changes=type_changes if type_changes else None,
+        )
+
+    def log_row_count_change(
+        self,
+        before: int,
+        after: int,
+        operation: str = "transform",
+    ) -> None:
+        """Log row count changes.
+
+        Args:
+            before: Row count before
+            after: Row count after
+            operation: Name of the transformation
+        """
+        delta = after - before
+        pct_change = ((after - before) / before * 100) if before > 0 else 0
+
+        msg = (
+            f"Row count change in {operation}: {before} -> {after} ({delta:+d}, {pct_change:+.1f}%)"
+        )
+        self.debug(msg, operation=operation, rows_before=before, rows_after=after)
+
+    def log_spark_metrics(
+        self,
+        partition_count: Optional[int] = None,
+        shuffle_partitions: Optional[int] = None,
+        broadcast_size_mb: Optional[float] = None,
+        cached: bool = False,
+        **kwargs,
+    ) -> None:
+        """Log Spark-specific metrics.
+
+        Args:
+            partition_count: Number of partitions
+            shuffle_partitions: Shuffle partition count
+            broadcast_size_mb: Broadcast variable size
+            cached: Whether data is cached
+            **kwargs: Additional metrics
+        """
+        metrics = {}
+        if partition_count is not None:
+            metrics["partitions"] = partition_count
+        if shuffle_partitions is not None:
+            metrics["shuffle_partitions"] = shuffle_partitions
+        if broadcast_size_mb is not None:
+            metrics["broadcast_size_mb"] = broadcast_size_mb
+        if cached:
+            metrics["cached"] = cached
+        metrics.update(kwargs)
+
+        if metrics:
+            self.debug("Spark metrics", **metrics)
+
+    def log_pandas_metrics(
+        self,
+        memory_mb: Optional[float] = None,
+        dtypes: Optional[Dict[str, str]] = None,
+        chunked: bool = False,
+        chunk_size: Optional[int] = None,
+        **kwargs,
+    ) -> None:
+        """Log Pandas-specific metrics.
+
+        Args:
+            memory_mb: Memory footprint in MB
+            dtypes: Column dtypes
+            chunked: Whether using chunked processing
+            chunk_size: Chunk size if chunked
+            **kwargs: Additional metrics
+        """
+        metrics = {}
+        if memory_mb is not None:
+            metrics["memory_mb"] = round(memory_mb, 2)
+            if memory_mb > 1000:
+                self.warning(
+                    f"High memory usage: {memory_mb:.2f} MB",
+                    memory_mb=round(memory_mb, 2),
+                )
+        if dtypes:
+            metrics["dtype_count"] = len(dtypes)
+        if chunked:
+            metrics["chunked"] = True
+            if chunk_size:
+                metrics["chunk_size"] = chunk_size
+        metrics.update(kwargs)
+
+        if metrics:
+            self.debug("Pandas metrics", **metrics)
+
+    def log_validation_result(
+        self,
+        passed: bool,
+        rule_name: str,
+        failures: Optional[List[str]] = None,
+        **kwargs,
+    ) -> None:
+        """Log validation result.
+
+        Args:
+            passed: Whether validation passed
+            rule_name: Name of validation rule
+            failures: List of failure messages
+            **kwargs: Additional context
+        """
+        if passed:
+            self.debug(f"Validation passed: {rule_name}", rule=rule_name, passed=True, **kwargs)
+        else:
+            self.warning(
+                f"Validation failed: {rule_name}",
+                rule=rule_name,
+                passed=False,
+                failures=failures,
+                **kwargs,
+            )
+
+    def log_connection(
+        self,
+        connection_type: str,
+        connection_name: str,
+        action: str = "connect",
+        **kwargs,
+    ) -> None:
+        """Log connection activity.
+
+        Args:
+            connection_type: Type of connection (azure_blob, sql_server, etc.)
+            connection_name: Name of the connection
+            action: Action being performed
+            **kwargs: Additional context (excluding secrets)
+        """
+        self.debug(
+            f"Connection {action}: {connection_name}",
+            connection_type=connection_type,
+            connection_name=connection_name,
+            action=action,
+            **kwargs,
+        )

+    def log_file_io(
+        self,
+        path: str,
+        format: str,
+        mode: str,
+        rows: Optional[int] = None,
+        size_mb: Optional[float] = None,
+        partitions: Optional[List[str]] = None,
+        **kwargs,
+    ) -> None:
+        """Log file I/O operations.
+
+        Args:
+            path: File path
+            format: File format (csv, parquet, delta, etc.)
+            mode: I/O mode (read, write, append, overwrite)
+            rows: Row count
+            size_mb: File size in MB
+            partitions: Partition columns
+            **kwargs: Additional context
+        """
+        log_data = {
+            "path": path,
+            "format": format,
+            "mode": mode,
+        }
+        if rows is not None:
+            log_data["rows"] = rows
+        if size_mb is not None:
+            log_data["size_mb"] = round(size_mb, 2)
+        if partitions:
+            log_data["partitions"] = partitions
+        log_data.update(kwargs)
+
+        self.info(f"File I/O: {mode} {format} at {path}", **log_data)
+
+    def log_graph_operation(
+        self,
+        operation: str,
+        node_count: Optional[int] = None,
+        edge_count: Optional[int] = None,
+        layer_count: Optional[int] = None,
+        **kwargs,
+    ) -> None:
+        """Log dependency graph operations.
+
+        Args:
+            operation: Graph operation (load, resolve, validate, etc.)
+            node_count: Number of nodes
+            edge_count: Number of edges/dependencies
+            layer_count: Number of execution layers
+            **kwargs: Additional context
+        """
+        log_data = {"operation": operation}
+        if node_count is not None:
+            log_data["nodes"] = node_count
+        if edge_count is not None:
+            log_data["edges"] = edge_count
+        if layer_count is not None:
+            log_data["layers"] = layer_count
+        log_data.update(kwargs)
+
+        self.debug(f"Graph {operation}", **log_data)
+
+
+_global_context: Optional[LoggingContext] = None
+
+
+def get_logging_context() -> LoggingContext:
+    """Get the global logging context."""
+    global _global_context
+    if _global_context is None:
+        from odibi.utils.logging import logger
+
+        _global_context = LoggingContext(logger=logger)
+    return _global_context
+
+
+def set_logging_context(context: LoggingContext) -> None:
+    """Set the global logging context."""
+    global _global_context
+    _global_context = context
+
+
+def create_logging_context(
+    pipeline_id: Optional[str] = None,
+    node_id: Optional[str] = None,
+    engine: Optional[str] = None,
+) -> LoggingContext:
+    """Create a new logging context with the specified parameters.

+    Args:
+        pipeline_id: Pipeline identifier
+        node_id: Node identifier
+        engine: Engine type
+
+    Returns:
+        New LoggingContext instance
+    """
+    from odibi.utils.logging import logger
+
+    return LoggingContext(
+        logger=logger,
+        pipeline_id=pipeline_id,
+        node_id=node_id,
+        engine=engine,
+    )
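
For orientation, here is a minimal usage sketch assembled only from the definitions in the diff above; it is not part of the packaged file. The pandas DataFrame, the secret value, and the output path are invented for illustration.

import pandas as pd

from odibi.utils.logging_context import (
    LoggingContext,
    OperationType,
    StructuredLogger,
)

# Structured (JSON) output with a registered secret that gets redacted from log text.
logger = StructuredLogger(structured=True, level="DEBUG")
logger.register_secret("s3cr3t-token")  # illustrative value

ctx = LoggingContext(logger=logger, pipeline_id="etl_daily", engine="pandas")
node_ctx = ctx.with_context(node_id="load_users")

# Timed, auto-logged operation; the caller fills in the row counts on the yielded metrics.
with node_ctx.operation(OperationType.TRANSFORM, "drop_nulls") as metrics:
    df = pd.DataFrame({"user_id": [1, 2, None]})  # illustrative data
    metrics.rows_in = len(df)
    df = df.dropna()
    metrics.rows_out = len(df)

# Explicit helpers for I/O and validation logging.
node_ctx.log_file_io(path="out/users.parquet", format="parquet", mode="write", rows=len(df))
node_ctx.log_validation_result(passed=True, rule_name="user_id_not_null")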