odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/utils/content_hash.py ADDED
@@ -0,0 +1,202 @@
+ """Content hashing utilities for skip_if_unchanged feature.
+
+ This module provides functions to compute deterministic hashes of DataFrames
+ for change detection in snapshot ingestion patterns.
+ """
+
+ import hashlib
+ from typing import TYPE_CHECKING, List, Optional
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+
+ def compute_dataframe_hash(
+     df: "pd.DataFrame",
+     columns: Optional[List[str]] = None,
+     sort_columns: Optional[List[str]] = None,
+ ) -> str:
+     """Compute a deterministic SHA256 hash of a DataFrame's content.
+
+     Args:
+         df: Pandas DataFrame to hash
+         columns: Subset of columns to include in the hash. If None, all columns are used.
+         sort_columns: Columns to sort by for deterministic ordering.
+             If None, the DataFrame is not sorted (a consistent row order is assumed).
+
+     Returns:
+         SHA256 hex digest string (64 characters)
+
+     Example:
+         >>> df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
+         >>> hash1 = compute_dataframe_hash(df, sort_columns=["id"])
+         >>> hash2 = compute_dataframe_hash(df, sort_columns=["id"])
+         >>> assert hash1 == hash2  # Same content = same hash
+     """
+     if df.empty:
+         return hashlib.sha256(b"EMPTY_DATAFRAME").hexdigest()
+
+     work_df = df
+
+     if columns:
+         missing = set(columns) - set(df.columns)
+         if missing:
+             raise ValueError(f"Hash columns not found in DataFrame: {missing}")
+         work_df = work_df[columns]
+
+     if sort_columns:
+         missing = set(sort_columns) - set(work_df.columns)
+         if missing:
+             raise ValueError(f"Sort columns not found in DataFrame: {missing}")
+         work_df = work_df.sort_values(sort_columns).reset_index(drop=True)
+
+     csv_bytes = work_df.to_csv(index=False).encode("utf-8")
+     return hashlib.sha256(csv_bytes).hexdigest()
+
+
+ def compute_spark_dataframe_hash(
+     df,
+     columns: Optional[List[str]] = None,
+     sort_columns: Optional[List[str]] = None,
+     distributed: bool = True,
+ ) -> str:
+     """Compute a deterministic SHA256 hash of a Spark DataFrame's content.
+
+     Args:
+         df: Spark DataFrame to hash
+         columns: Subset of columns to include in the hash. If None, all columns are used.
+         sort_columns: Columns to sort by for deterministic ordering.
+             (Only used in legacy mode when distributed=False.)
+         distributed: If True (default), use distributed hash computation.
+             If False, use the legacy collect-to-driver approach.
+
+     Returns:
+         SHA256 hex digest string (64 characters)
+
+     Note:
+         The distributed mode (default) computes the hash without collecting data to
+         the driver, making it safe for large datasets. The hash is computed as:
+         1. Per-row xxhash64 of all column values
+         2. Sum of all row hashes (order-independent)
+         3. Combined with the row count for the final SHA256
+
+         Since the sum is commutative, this produces consistent hashes regardless of
+         partition ordering, without requiring a full sort operation.
+     """
+     if df.isEmpty():
+         return hashlib.sha256(b"EMPTY_DATAFRAME").hexdigest()
+
+     work_df = df
+
+     if columns:
+         work_df = work_df.select(columns)
+
+     if distributed:
+         return _compute_spark_hash_distributed(work_df)
+     else:
+         return _compute_spark_hash_legacy(work_df, sort_columns)
+
+
+ def _compute_spark_hash_distributed(df) -> str:
+     """Compute the hash in a distributed manner using Spark's xxhash64.
+
+     This approach:
+     - Never collects data to the driver (except two scalar values)
+     - Uses xxhash64 for fast row-level hashing
+     - Uses a commutative sum for order-independent aggregation
+     - Is safe for arbitrarily large DataFrames
+     """
+     from pyspark.sql import functions as F
+
+     hash_cols = [F.coalesce(F.col(c).cast("string"), F.lit("__NULL__")) for c in df.columns]
+     work_df = df.withColumn("_row_hash", F.xxhash64(*hash_cols))
+
+     result = work_df.agg(
+         F.count("*").alias("row_count"),
+         F.sum("_row_hash").alias("hash_sum"),
+     ).collect()[0]
+
+     row_count = result["row_count"] or 0
+     hash_sum = result["hash_sum"] or 0
+     combined = f"v2:{row_count}:{hash_sum}:{','.join(sorted(df.columns))}"
+     return hashlib.sha256(combined.encode()).hexdigest()
+
+
+ def _compute_spark_hash_legacy(df, sort_columns: Optional[List[str]] = None) -> str:
+     """Legacy hash computation that collects to the driver.
+
+     Warning: This can cause OOM on large datasets.
+     Use distributed=True for production workloads.
+     """
+     work_df = df
+
+     if sort_columns:
+         work_df = work_df.orderBy(sort_columns)
+
+     pandas_df = work_df.toPandas()
+     csv_bytes = pandas_df.to_csv(index=False).encode("utf-8")
+     return hashlib.sha256(csv_bytes).hexdigest()
+
+
+ def make_content_hash_key(node_name: str, table_name: str) -> str:
+     """Generate a state key for content hash storage.
+
+     Args:
+         node_name: Pipeline node name
+         table_name: Target table name
+
+     Returns:
+         State key string
+     """
+     return f"content_hash:{node_name}:{table_name}"
+
+
+ def get_content_hash_from_state(state_backend, node_name: str, table_name: str) -> Optional[str]:
+     """Retrieve a stored content hash from the state backend (catalog).
+
+     Args:
+         state_backend: CatalogStateBackend or compatible state backend
+         node_name: Pipeline node name
+         table_name: Target table name
+
+     Returns:
+         Previously stored hash string, or None if not found
+     """
+     if state_backend is None:
+         return None
+
+     try:
+         key = make_content_hash_key(node_name, table_name)
+         value = state_backend.get_hwm(key)
+         if isinstance(value, dict):
+             return value.get("hash")
+         return None
+     except Exception:
+         return None
+
+
+ def set_content_hash_in_state(
+     state_backend,
+     node_name: str,
+     table_name: str,
+     content_hash: str,
+ ) -> None:
+     """Store a content hash in the state backend (catalog).
+
+     Args:
+         state_backend: CatalogStateBackend or compatible state backend
+         node_name: Pipeline node name
+         table_name: Target table name
+         content_hash: Hash string to store
+     """
+     if state_backend is None:
+         return
+
+     from datetime import datetime, timezone
+
+     key = make_content_hash_key(node_name, table_name)
+     value = {
+         "hash": content_hash,
+         "timestamp": datetime.now(timezone.utc).isoformat(),
+     }
+     state_backend.set_hwm(key, value)
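Taken together, these helpers implement the skip_if_unchanged check: hash the incoming snapshot, compare it with the hash recorded in the catalog, and only write (and re-record) when the content changed. A minimal sketch of that flow, using a hypothetical in-memory stand-in for the state backend (anything exposing get_hwm/set_hwm as described above) and made-up node/table names:

    import pandas as pd
    from odibi.utils.content_hash import (
        compute_dataframe_hash,
        get_content_hash_from_state,
        set_content_hash_in_state,
    )

    class _MemoryState:
        """Hypothetical stand-in for CatalogStateBackend (get_hwm/set_hwm)."""
        def __init__(self):
            self._store = {}

        def get_hwm(self, key):
            return self._store.get(key)

        def set_hwm(self, key, value):
            self._store[key] = value

    state_backend = _MemoryState()
    df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})

    # Hash the incoming snapshot deterministically.
    new_hash = compute_dataframe_hash(df, sort_columns=["id"])

    # Compare against the hash stored for this node/table (None on the first run).
    old_hash = get_content_hash_from_state(state_backend, "orders_snapshot", "dbo.orders")

    if new_hash == old_hash:
        pass  # Content unchanged -> skip the write
    else:
        # ... write df to the target here, then record the new hash ...
        set_content_hash_in_state(state_backend, "orders_snapshot", "dbo.orders", new_hash)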
odibi/utils/duration.py ADDED
@@ -0,0 +1,43 @@
+ """Duration parsing utilities."""
+
+ from datetime import timedelta
+ from typing import Optional
+
+
+ def parse_duration(duration_str: str) -> Optional[timedelta]:
+     """Parse a duration string like '2h', '30m', '1d' into a timedelta.
+
+     Args:
+         duration_str: Duration string with suffix
+             (s=seconds, m=minutes, h=hours, d=days, w=weeks)
+
+     Returns:
+         timedelta object or None if parsing fails
+
+     Examples:
+         >>> parse_duration("2h")
+         datetime.timedelta(seconds=7200)
+         >>> parse_duration("30m")
+         datetime.timedelta(seconds=1800)
+         >>> parse_duration("1d")
+         datetime.timedelta(days=1)
+     """
+     if not duration_str:
+         return None
+
+     duration_str = duration_str.strip().lower()
+
+     try:
+         if duration_str.endswith("h"):
+             return timedelta(hours=int(duration_str[:-1]))
+         elif duration_str.endswith("d"):
+             return timedelta(days=int(duration_str[:-1]))
+         elif duration_str.endswith("m"):
+             return timedelta(minutes=int(duration_str[:-1]))
+         elif duration_str.endswith("s"):
+             return timedelta(seconds=int(duration_str[:-1]))
+         elif duration_str.endswith("w"):
+             return timedelta(weeks=int(duration_str[:-1]))
+         else:
+             return None
+     except ValueError:
+         return None
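Because parse_duration returns None rather than raising on bad input, callers can fall back to a default. A small illustrative sketch (the freshness_window setting and its default are made up, not odibi configuration keys):

    from datetime import timedelta
    from odibi.utils.duration import parse_duration

    settings = {"freshness_window": "6h"}  # hypothetical config snippet
    raw = settings.get("freshness_window", "")
    window = parse_duration(raw) or timedelta(hours=24)  # fall back to one day on bad/missing input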
odibi/utils/encoding.py ADDED
@@ -0,0 +1,102 @@
+ """Encoding detection utilities."""
+
+ import logging
+ from typing import Any, List, Optional
+
+ logger = logging.getLogger(__name__)
+
+ # Common encodings to try
+ CANDIDATE_ENCODINGS = ["utf-8", "utf-8-sig", "latin1", "cp1252"]
+
+
+ def detect_encoding(
+     connection: Any,
+     path: str,
+     sample_bytes: int = 65536,
+     candidates: Optional[List[str]] = None,
+ ) -> Optional[str]:
+     """Detect the text encoding of a file.
+
+     Args:
+         connection: Connection object
+         path: File path (relative to the connection base)
+         sample_bytes: Number of bytes to read for detection
+         candidates: List of encodings to try (default: common list)
+
+     Returns:
+         Detected encoding name, or None if detection failed
+     """
+     full_path = connection.get_path(path)
+     candidates = candidates or CANDIDATE_ENCODINGS
+
+     # Read sample bytes
+     sample = _read_sample_bytes(connection, full_path, sample_bytes)
+     if not sample:
+         return None
+
+     # Try each candidate encoding in order
+     for encoding in candidates:
+         if _is_valid_encoding(sample, encoding):
+             return encoding
+
+     return None
+
+
+ def _read_sample_bytes(connection: Any, path: str, size: int) -> Optional[bytes]:
+     """Read bytes from a file using whichever method is available."""
+     # 1. Try fsspec (supports local and remote paths)
+     try:
+         import fsspec
+
+         # Get storage options from the connection if available
+         storage_options = {}
+         if hasattr(connection, "pandas_storage_options"):
+             storage_options = connection.pandas_storage_options()
+
+         with fsspec.open(path, "rb", **storage_options) as f:
+             return f.read(size)
+     except ImportError:
+         pass
+     except Exception as e:
+         logger.debug(f"fsspec read failed for {path}: {e}")
+
+     # 2. Try a local open (if the path is local)
+     # Handle a 'file://' prefix or a plain path
+     local_path = path
+     if path.startswith("file://"):
+         local_path = path[7:]
+     elif "://" in path:
+         # Remote path and no fsspec -> cannot read
+         return None
+
+     try:
+         with open(local_path, "rb") as f:
+             return f.read(size)
+     except Exception as e:
+         logger.debug(f"Local open failed for {local_path}: {e}")
+
+     return None
+
+
+ def _is_valid_encoding(sample: bytes, encoding: str) -> bool:
+     """Check whether the sample decodes cleanly with the given encoding."""
+     try:
+         # Strict decoding raises on invalid byte sequences. Note that latin1
+         # accepts any byte sequence and therefore always succeeds, which is why
+         # CANDIDATE_ENCODINGS tries UTF-8 first and falls back to looser encodings.
+         sample.decode(encoding, errors="strict")
+         return True
+     except UnicodeError:
+         return False
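For illustration, detect_encoding only needs an object with a get_path method (and optionally pandas_storage_options for remote filesystems). A minimal sketch with a hypothetical local stub connection and a made-up path:

    from odibi.utils.encoding import detect_encoding

    class _LocalStub:
        """Hypothetical stand-in for a connection; resolves paths against a base dir."""
        def __init__(self, base):
            self.base = base

        def get_path(self, path):
            return f"{self.base}/{path}"

    enc = detect_encoding(_LocalStub("/data/raw"), "orders.csv")
    print(enc)  # e.g. "utf-8", or None if the file cannot be read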
odibi/utils/extensions.py ADDED
@@ -0,0 +1,28 @@
+ """Extension loading utilities."""
+
+ import importlib.util
+ import sys
+ from pathlib import Path
+
+ from odibi.utils.logging import logger
+
+
+ def load_extensions(path: Path):
+     """Load python extensions (transforms.py, plugins.py) from path."""
+     # Add path to sys.path to handle imports within the extensions
+     if str(path) not in sys.path:
+         sys.path.append(str(path))
+
+     for name in ["transforms.py", "plugins.py"]:
+         file_path = path / name
+         if file_path.exists():
+             try:
+                 module_name = file_path.stem
+                 spec = importlib.util.spec_from_file_location(module_name, file_path)
+                 if spec and spec.loader:
+                     module = importlib.util.module_from_spec(spec)
+                     sys.modules[module_name] = module
+                     spec.loader.exec_module(module)
+                     logger.info(f"Loaded extension: {file_path}")
+             except Exception as e:
+                 logger.warning(f"Failed to load {name}: {e}", exc_info=True)
odibi/utils/hashing.py ADDED
@@ -0,0 +1,61 @@
+ """Utilities for calculating configuration hashes."""
+
+ import hashlib
+ import json
+ from typing import Any, Optional, Set
+
+
+ DEFAULT_EXCLUDE_FIELDS: Set[str] = {"description", "tags", "log_level"}
+
+
+ def calculate_config_hash(
+     config: Any,
+     exclude: Optional[Set[str]] = None,
+ ) -> str:
+     """
+     Calculate the MD5 hash of a configuration object.
+
+     Args:
+         config: Pydantic model or object with a model_dump/dict method
+         exclude: Set of field names to exclude from the hash calculation
+
+     Returns:
+         MD5 hex digest of the config
+     """
+     exclude = exclude if exclude is not None else DEFAULT_EXCLUDE_FIELDS
+
+     if hasattr(config, "model_dump"):
+         dump = config.model_dump(mode="json", exclude=exclude)
+     elif hasattr(config, "dict"):
+         dump = config.dict(exclude=exclude)
+     else:
+         dump = config
+
+     dump_str = json.dumps(dump, sort_keys=True)
+     return hashlib.md5(dump_str.encode("utf-8")).hexdigest()
+
+
+ def calculate_pipeline_hash(config: Any) -> str:
+     """
+     Calculate the hash for a pipeline configuration.
+
+     Pipeline hashes include all fields (no exclusions).
+     """
+     if hasattr(config, "model_dump"):
+         dump = config.model_dump(mode="json")
+     elif hasattr(config, "dict"):
+         dump = config.dict()
+     else:
+         dump = config
+
+     dump_str = json.dumps(dump, sort_keys=True)
+     return hashlib.md5(dump_str.encode("utf-8")).hexdigest()
+
+
+ def calculate_node_hash(config: Any) -> str:
+     """
+     Calculate the hash for a node configuration.
+
+     Node hashes exclude description, tags, and log_level.
+     """
+     return calculate_config_hash(config, exclude=DEFAULT_EXCLUDE_FIELDS)
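Because description, tags, and log_level are excluded by default, editing them does not change a node's hash. A quick sketch (assuming pydantic v2 is available; NodeMeta is a made-up model, not an odibi class):

    from pydantic import BaseModel
    from odibi.utils.hashing import calculate_node_hash

    class NodeMeta(BaseModel):
        name: str
        query: str
        description: str = ""

    a = NodeMeta(name="orders", query="SELECT 1", description="first draft")
    b = NodeMeta(name="orders", query="SELECT 1", description="reworded docs")
    assert calculate_node_hash(a) == calculate_node_hash(b)  # only an excluded field differs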
odibi/utils/logging.py ADDED
@@ -0,0 +1,203 @@
+ """Structured logging for Odibi framework.
+
+ This module provides the core logging infrastructure:
+ - StructuredLogger: Base logger with JSON/human-readable output
+ - LoggingContext: Context-aware logging wrapper (imported from logging_context)
+ - Secret redaction for sensitive data
+
+ For enhanced observability features, use:
+     from odibi.utils.logging_context import LoggingContext, OperationType
+ """
+
+ import codecs
+ import json
+ import logging
+ import sys
+ from datetime import datetime, timezone
+
+ try:
+     from rich.console import Console
+     from rich.logging import RichHandler
+
+     RICH_AVAILABLE = True
+ except ImportError:
+     RICH_AVAILABLE = False
+
+
+ class StructuredLogger:
+     """Logger that supports both human-readable and JSON output with secret redaction.
+
+     This is the base logging class for Odibi. For context-aware logging with
+     automatic pipeline/node tracking, use LoggingContext instead.
+
+     Example:
+         >>> logger = StructuredLogger(structured=True, level="DEBUG")
+         >>> logger.info("Processing started", pipeline="daily_etl", rows=1000)
+     """
+
+     def __init__(self, structured: bool = False, level: str = "INFO"):
+         """Initialize structured logger.
+
+         Args:
+             structured: If True, output JSON logs; otherwise human-readable
+             level: Log level (DEBUG, INFO, WARNING, ERROR)
+         """
+         self.structured = structured
+         self.level = getattr(logging, level.upper(), logging.INFO)
+         self._secrets: set = set()
+         self._initialized = False
+
+         if (
+             sys.platform == "win32"
+             and sys.stdout
+             and sys.stdout.encoding
+             and sys.stdout.encoding.lower() != "utf-8"
+         ):
+             try:
+                 sys.stdout.reconfigure(encoding="utf-8")
+             except AttributeError:
+                 sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+
+         self._setup_handlers()
+
+     def _setup_handlers(self) -> None:
+         """Set up logging handlers."""
+         if self._initialized:
+             return
+
+         if not self.structured and RICH_AVAILABLE:
+             logging.basicConfig(
+                 level=self.level,
+                 format="%(message)s",
+                 datefmt="[%X]",
+                 handlers=[
+                     RichHandler(
+                         rich_tracebacks=True,
+                         markup=True,
+                         show_path=False,
+                         console=(
+                             Console(force_terminal=True, legacy_windows=False)
+                             if sys.platform == "win32"
+                             else None
+                         ),
+                     )
+                 ],
+             )
+         else:
+             logging.basicConfig(level=self.level, format="%(message)s", stream=sys.stdout)
+
+         self.logger = logging.getLogger("odibi")
+         self.logger.setLevel(self.level)
+
+         third_party_level = max(self.level, logging.WARNING)
+         for logger_name in [
+             "py4j",
+             "azure",
+             "azure.core.pipeline.policies.http_logging_policy",
+             "adlfs",
+             "urllib3",
+             "fsspec",
+         ]:
+             logging.getLogger(logger_name).setLevel(third_party_level)
+
+         self._initialized = True
+
+     def register_secret(self, secret: str) -> None:
+         """Register a secret string to be redacted from logs.
+
+         Args:
+             secret: Secret value to redact (passwords, keys, tokens)
+         """
+         if secret and isinstance(secret, str) and len(secret.strip()) > 0:
+             self._secrets.add(secret)
+
+     def _redact(self, text: str) -> str:
+         """Redact registered secrets from text.
+
+         Args:
+             text: Text to redact
+
+         Returns:
+             Text with secrets replaced by [REDACTED]
+         """
+         if not text or not self._secrets:
+             return text
+
+         for secret in self._secrets:
+             if secret in text:
+                 text = text.replace(secret, "[REDACTED]")
+         return text
+
+     def info(self, message: str, **kwargs) -> None:
+         """Log info message."""
+         self._log("INFO", message, **kwargs)
+
+     def warning(self, message: str, **kwargs) -> None:
+         """Log warning message."""
+         self._log("WARNING", message, **kwargs)
+
+     def error(self, message: str, **kwargs) -> None:
+         """Log error message."""
+         self._log("ERROR", message, **kwargs)
+
+     def debug(self, message: str, **kwargs) -> None:
+         """Log debug message."""
+         self._log("DEBUG", message, **kwargs)
+
+     def _log(self, level: str, message: str, **kwargs) -> None:
+         """Internal log method with redaction and formatting.
+
+         Args:
+             level: Log level
+             message: Log message
+             **kwargs: Additional context to include
+         """
+         level_val = getattr(logging, level, logging.INFO)
+         if level_val < self.level:
+             return
+
+         message = self._redact(str(message))
+
+         redacted_kwargs = {}
+         for k, v in kwargs.items():
+             if isinstance(v, str):
+                 redacted_kwargs[k] = self._redact(v)
+             elif v is None:
+                 continue
+             else:
+                 redacted_kwargs[k] = v
+
+         if self.structured:
+             log_entry = {
+                 "timestamp": datetime.now(timezone.utc).isoformat(),
+                 "level": level,
+                 "message": message,
+                 **redacted_kwargs,
+             }
+             print(json.dumps(log_entry, default=str))
+         else:
+             context_str = ""
+             if redacted_kwargs:
+                 context_items = [f"{k}={v}" for k, v in redacted_kwargs.items()]
+                 context_str = f" ({', '.join(context_items)})"
+
+             formatted_msg = f"{message}{context_str}"
+
+             if level == "INFO":
+                 self.logger.info(formatted_msg)
+             elif level == "WARNING":
+                 self.logger.warning(f"[WARN] {formatted_msg}")
+             elif level == "ERROR":
+                 self.logger.error(f"[ERROR] {formatted_msg}")
+             elif level == "DEBUG":
+                 self.logger.debug(f"[DEBUG] {formatted_msg}")
+
+
+ # Global logger instance; reconfigured via configure_logging()
+ logger = StructuredLogger()
+
+
+ def configure_logging(structured: bool, level: str):
+     """Configure the global logger."""
+     global logger
+     logger = StructuredLogger(structured=structured, level=level)
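A brief usage sketch of the secret redaction and structured output behavior described above (the server name and connection string are made up):

    from odibi.utils.logging import StructuredLogger

    log = StructuredLogger(structured=True, level="INFO")
    log.register_secret("P@ssw0rd!")
    log.info("Connected", server="sql01", conn="Server=sql01;Pwd=P@ssw0rd!")
    # Emits one JSON line with the registered secret masked, roughly:
    # {"timestamp": "...", "level": "INFO", "message": "Connected",
    #  "server": "sql01", "conn": "Server=sql01;Pwd=[REDACTED]"}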