odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/utils/content_hash.py
ADDED
@@ -0,0 +1,202 @@
"""Content hashing utilities for skip_if_unchanged feature.

This module provides functions to compute deterministic hashes of DataFrames
for change detection in snapshot ingestion patterns.
"""

import hashlib
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
    import pandas as pd


def compute_dataframe_hash(
    df: "pd.DataFrame",
    columns: Optional[List[str]] = None,
    sort_columns: Optional[List[str]] = None,
) -> str:
    """Compute a deterministic SHA256 hash of a DataFrame's content.

    Args:
        df: Pandas DataFrame to hash
        columns: Subset of columns to include in hash. If None, all columns.
        sort_columns: Columns to sort by for deterministic ordering.
            If None, DataFrame is not sorted (assumes consistent order).

    Returns:
        SHA256 hex digest string (64 characters)

    Example:
        >>> df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
        >>> hash1 = compute_dataframe_hash(df, sort_columns=["id"])
        >>> hash2 = compute_dataframe_hash(df, sort_columns=["id"])
        >>> assert hash1 == hash2  # Same content = same hash
    """
    if df.empty:
        return hashlib.sha256(b"EMPTY_DATAFRAME").hexdigest()

    work_df = df

    if columns:
        missing = set(columns) - set(df.columns)
        if missing:
            raise ValueError(f"Hash columns not found in DataFrame: {missing}")
        work_df = work_df[columns]

    if sort_columns:
        missing = set(sort_columns) - set(df.columns)
        if missing:
            raise ValueError(f"Sort columns not found in DataFrame: {missing}")
        work_df = work_df.sort_values(sort_columns).reset_index(drop=True)

    csv_bytes = work_df.to_csv(index=False).encode("utf-8")
    return hashlib.sha256(csv_bytes).hexdigest()


def compute_spark_dataframe_hash(
    df,
    columns: Optional[List[str]] = None,
    sort_columns: Optional[List[str]] = None,
    distributed: bool = True,
) -> str:
    """Compute a deterministic SHA256 hash of a Spark DataFrame's content.

    Args:
        df: Spark DataFrame to hash
        columns: Subset of columns to include in hash. If None, all columns are used.
        sort_columns: Columns to sort by for deterministic ordering.
            (Only used in legacy mode when distributed=False)
        distributed: If True (default), use distributed hash computation.
            If False, use legacy collect-to-driver approach.

    Returns:
        SHA256 hex digest string (64 characters)

    Note:
        The distributed mode (default) computes hash without collecting data to driver,
        making it safe for large datasets. The hash is computed as:
        1. Per-row xxhash64 of all column values
        2. Sum of all row hashes (order-independent)
        3. Combined with row count for final SHA256

        Since the sum is commutative, this produces consistent hashes regardless of
        partition ordering, without requiring a full sort operation.
    """
    if df.isEmpty():
        return hashlib.sha256(b"EMPTY_DATAFRAME").hexdigest()

    work_df = df

    if columns:
        work_df = work_df.select(columns)

    if distributed:
        return _compute_spark_hash_distributed(work_df)
    else:
        return _compute_spark_hash_legacy(work_df, sort_columns)


def _compute_spark_hash_distributed(df) -> str:
    """Compute hash distributedly using Spark's xxhash64.

    This approach:
    - Never collects data to driver (except 2 scalar values)
    - Uses xxhash64 for fast row-level hashing
    - Uses commutative sum for order-independent aggregation
    - Is safe for arbitrarily large DataFrames
    """
    from pyspark.sql import functions as F

    hash_cols = [F.coalesce(F.col(c).cast("string"), F.lit("__NULL__")) for c in df.columns]
    work_df = df.withColumn("_row_hash", F.xxhash64(*hash_cols))

    result = work_df.agg(
        F.count("*").alias("row_count"),
        F.sum("_row_hash").alias("hash_sum"),
    ).collect()[0]

    row_count = result["row_count"] or 0
    hash_sum = result["hash_sum"] or 0
    combined = f"v2:{row_count}:{hash_sum}:{','.join(sorted(df.columns))}"
    return hashlib.sha256(combined.encode()).hexdigest()


def _compute_spark_hash_legacy(df, sort_columns: Optional[List[str]] = None) -> str:
    """Legacy hash computation that collects to driver.

    Warning: This can cause OOM on large datasets.
    Use distributed=True for production workloads.
    """
    work_df = df

    if sort_columns:
        work_df = work_df.orderBy(sort_columns)

    pandas_df = work_df.toPandas()
    csv_bytes = pandas_df.to_csv(index=False).encode("utf-8")
    return hashlib.sha256(csv_bytes).hexdigest()


def make_content_hash_key(node_name: str, table_name: str) -> str:
    """Generate a state key for content hash storage.

    Args:
        node_name: Pipeline node name
        table_name: Target table name

    Returns:
        State key string
    """
    return f"content_hash:{node_name}:{table_name}"


def get_content_hash_from_state(state_backend, node_name: str, table_name: str) -> Optional[str]:
    """Retrieve stored content hash from state backend (catalog).

    Args:
        state_backend: CatalogStateBackend or compatible state backend
        node_name: Pipeline node name
        table_name: Target table name

    Returns:
        Previously stored hash string, or None if not found
    """
    if state_backend is None:
        return None

    try:
        key = make_content_hash_key(node_name, table_name)
        value = state_backend.get_hwm(key)
        if isinstance(value, dict):
            return value.get("hash")
        return None
    except Exception:
        return None


def set_content_hash_in_state(
    state_backend,
    node_name: str,
    table_name: str,
    content_hash: str,
) -> None:
    """Store content hash in state backend (catalog).

    Args:
        state_backend: CatalogStateBackend or compatible state backend
        node_name: Pipeline node name
        table_name: Target table name
        content_hash: Hash string to store
    """
    if state_backend is None:
        return

    from datetime import datetime, timezone

    key = make_content_hash_key(node_name, table_name)
    value = {
        "hash": content_hash,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    state_backend.set_hwm(key, value)
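
Taken together, these functions implement the skip_if_unchanged flow: hash the incoming DataFrame, compare it against the hash stored under the node/table key, and only write (and re-store the hash) when the content changed. A minimal sketch of that flow, assuming any object exposing get_hwm/set_hwm can act as the state backend; the in-memory backend and the node/table names below are illustrative, not part of the package:

import pandas as pd

from odibi.utils.content_hash import (
    compute_dataframe_hash,
    get_content_hash_from_state,
    set_content_hash_in_state,
)


class InMemoryStateBackend:
    """Hypothetical stand-in exposing the get_hwm/set_hwm interface used above."""

    def __init__(self):
        self._store = {}

    def get_hwm(self, key):
        return self._store.get(key)

    def set_hwm(self, key, value):
        self._store[key] = value


state = InMemoryStateBackend()
df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})

new_hash = compute_dataframe_hash(df, sort_columns=["id"])
old_hash = get_content_hash_from_state(state, "ingest_orders", "orders_snapshot")

if new_hash == old_hash:
    print("Content unchanged - skipping write")
else:
    # ... write df to the target table here ...
    set_content_hash_in_state(state, "ingest_orders", "orders_snapshot", new_hash)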
odibi/utils/duration.py
ADDED
@@ -0,0 +1,43 @@
"""Duration parsing utilities."""

from datetime import timedelta
from typing import Optional


def parse_duration(duration_str: str) -> Optional[timedelta]:
    """Parse a duration string like '2h', '30m', '1d' into a timedelta.

    Args:
        duration_str: Duration string with suffix (h=hours, m=minutes, d=days, s=seconds, w=weeks)

    Returns:
        timedelta object or None if parsing fails

    Examples:
        >>> parse_duration("2h")
        datetime.timedelta(seconds=7200)
        >>> parse_duration("30m")
        datetime.timedelta(seconds=1800)
        >>> parse_duration("1d")
        datetime.timedelta(days=1)
    """
    if not duration_str:
        return None

    duration_str = duration_str.strip().lower()

    try:
        if duration_str.endswith("h"):
            return timedelta(hours=int(duration_str[:-1]))
        elif duration_str.endswith("d"):
            return timedelta(days=int(duration_str[:-1]))
        elif duration_str.endswith("m"):
            return timedelta(minutes=int(duration_str[:-1]))
        elif duration_str.endswith("s"):
            return timedelta(seconds=int(duration_str[:-1]))
        elif duration_str.endswith("w"):
            return timedelta(weeks=int(duration_str[:-1]))
        else:
            return None
    except ValueError:
        return None
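
A small usage sketch for parse_duration; the freshness check is an illustrative scenario, not taken from the package:

from datetime import datetime, timezone

from odibi.utils.duration import parse_duration

max_age = parse_duration("2h")          # timedelta(hours=2)
assert parse_duration("90x") is None    # unknown suffix -> None
assert parse_duration("") is None       # empty string -> None

last_run = datetime(2024, 1, 1, 6, 0, tzinfo=timezone.utc)
if max_age and datetime.now(timezone.utc) - last_run > max_age:
    print("data is stale, refresh needed")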
odibi/utils/encoding.py
ADDED
@@ -0,0 +1,102 @@
"""Encoding detection utilities."""

import logging
from typing import Any, List, Optional

logger = logging.getLogger(__name__)

# Common encodings to try
CANDIDATE_ENCODINGS = ["utf-8", "utf-8-sig", "latin1", "cp1252"]


def detect_encoding(
    connection: Any,
    path: str,
    sample_bytes: int = 65536,
    candidates: Optional[List[str]] = None,
) -> Optional[str]:
    """Detect text encoding of a file.

    Args:
        connection: Connection object
        path: File path (relative to connection base)
        sample_bytes: Number of bytes to read for detection
        candidates: List of encodings to try (default: common list)

    Returns:
        Detected encoding name or None if detection failed
    """
    full_path = connection.get_path(path)
    candidates = candidates or CANDIDATE_ENCODINGS

    # Read sample bytes
    sample = _read_sample_bytes(connection, full_path, sample_bytes)
    if not sample:
        return None

    # Try decoding
    for encoding in candidates:
        if _is_valid_encoding(sample, encoding):
            return encoding

    return None


def _read_sample_bytes(connection: Any, path: str, size: int) -> Optional[bytes]:
    """Read bytes from file using available methods."""
    # 1. Try fsspec (supports local and remote)
    try:
        import fsspec

        # Get storage options from connection if available
        storage_options = {}
        if hasattr(connection, "pandas_storage_options"):
            storage_options = connection.pandas_storage_options()

        with fsspec.open(path, "rb", **storage_options) as f:
            return f.read(size)
    except ImportError:
        pass
    except Exception as e:
        logger.debug(f"fsspec read failed for {path}: {e}")

    # 2. Try local open (if path is local)
    # Handle 'file://' prefix or plain path
    local_path = path
    if path.startswith("file://"):
        local_path = path[7:]
    elif "://" in path:
        # Remote path and no fsspec -> cannot read
        return None

    try:
        with open(local_path, "rb") as f:
            return f.read(size)
    except Exception as e:
        logger.debug(f"Local open failed for {local_path}: {e}")

    return None


def _is_valid_encoding(sample: bytes, encoding: str) -> bool:
    """Check if bytes can be decoded with encoding and look reasonable."""
    try:
        sample.decode(encoding)

        # Check for replacement characters (if decoder doesn't fail but inserts them)
        # Note: strict errors would raise exception, but some encodings might be loose.
        # We use strict check first.
        sample.decode(encoding, errors="strict")

        # Heuristics:
        # 1. Check for excessive non-printable characters?
        # For now, strict decoding is a strong signal.
        # Latin1 accepts everything, so it always succeeds.
        # So we prioritize UTF-8. If UTF-8 works, use it.
        # If not, Latin1 will work but might show garbage.
        # "Looks right" is hard.
        # Maybe check for common delimiters if it's CSV?

        return True
    except UnicodeError:
        return False
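
A minimal sketch of detect_encoding against a local file; LocalConn is a hypothetical stand-in for a connection object, since only get_path() (and optionally pandas_storage_options()) is used by this module:

import os
import tempfile

from odibi.utils.encoding import detect_encoding


class LocalConn:
    """Hypothetical stand-in: only get_path() is required by detect_encoding."""

    def __init__(self, base: str):
        self.base = base

    def get_path(self, path: str) -> str:
        return os.path.join(self.base, path)


tmp_dir = tempfile.mkdtemp()
with open(os.path.join(tmp_dir, "sample.csv"), "w", encoding="cp1252") as f:
    f.write("id;name\n1;Müller\n")

print(detect_encoding(LocalConn(tmp_dir), "sample.csv"))
# -> "latin1" here: the 0xFC byte is invalid UTF-8, and latin1 precedes cp1252
#    in CANDIDATE_ENCODINGS, so it wins even though the file was written as cp1252.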
odibi/utils/extensions.py
ADDED
@@ -0,0 +1,28 @@
"""Extension loading utilities."""

import importlib.util
import sys
from pathlib import Path

from odibi.utils.logging import logger


def load_extensions(path: Path):
    """Load python extensions (transforms.py, plugins.py) from path."""
    # Add path to sys.path to handle imports within the extensions
    if str(path) not in sys.path:
        sys.path.append(str(path))

    for name in ["transforms.py", "plugins.py"]:
        file_path = path / name
        if file_path.exists():
            try:
                module_name = file_path.stem
                spec = importlib.util.spec_from_file_location(module_name, file_path)
                if spec and spec.loader:
                    module = importlib.util.module_from_spec(spec)
                    sys.modules[module_name] = module
                    spec.loader.exec_module(module)
                    logger.info(f"Loaded extension: {file_path}")
            except Exception as e:
                logger.warning(f"Failed to load {name}: {e}", exc_info=True)
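
A minimal usage sketch, assuming a project folder that keeps transforms.py and/or plugins.py alongside the pipeline config; the path is illustrative:

from pathlib import Path

from odibi.utils.extensions import load_extensions

project_dir = Path("./my_project")  # expects my_project/transforms.py and/or my_project/plugins.py
load_extensions(project_dir)        # missing files are skipped; import errors are logged as warnings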
odibi/utils/hashing.py
ADDED
@@ -0,0 +1,61 @@
"""Utilities for calculating configuration hashes."""

import hashlib
import json
from typing import Any, Optional, Set


DEFAULT_EXCLUDE_FIELDS: Set[str] = {"description", "tags", "log_level"}


def calculate_config_hash(
    config: Any,
    exclude: Optional[Set[str]] = None,
) -> str:
    """
    Calculate MD5 hash of a configuration object.

    Args:
        config: Pydantic model or object with model_dump/dict method
        exclude: Set of field names to exclude from hash calculation

    Returns:
        MD5 hex digest of the config
    """
    exclude = exclude if exclude is not None else DEFAULT_EXCLUDE_FIELDS

    if hasattr(config, "model_dump"):
        dump = config.model_dump(mode="json", exclude=exclude)
    elif hasattr(config, "dict"):
        dump = config.dict(exclude=exclude)
    else:
        dump = config

    dump_str = json.dumps(dump, sort_keys=True)
    return hashlib.md5(dump_str.encode("utf-8")).hexdigest()


def calculate_pipeline_hash(config: Any) -> str:
    """
    Calculate hash for a pipeline configuration.

    Pipeline hashes include all fields (no exclusions).
    """
    if hasattr(config, "model_dump"):
        dump = config.model_dump(mode="json")
    elif hasattr(config, "dict"):
        dump = config.dict()
    else:
        dump = config

    dump_str = json.dumps(dump, sort_keys=True)
    return hashlib.md5(dump_str.encode("utf-8")).hexdigest()


def calculate_node_hash(config: Any) -> str:
    """
    Calculate hash for a node configuration.

    Node hashes exclude description, tags, and log_level.
    """
    return calculate_config_hash(config, exclude=DEFAULT_EXCLUDE_FIELDS)
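
A minimal sketch with a Pydantic v2 model (anything exposing model_dump or dict works); NodeConfig is a hypothetical stand-in, not one of the package's real config classes:

from typing import List

from pydantic import BaseModel

from odibi.utils.hashing import calculate_node_hash


class NodeConfig(BaseModel):
    """Hypothetical stand-in for a node config model."""

    name: str
    sql: str
    description: str = ""
    tags: List[str] = []
    log_level: str = "INFO"


a = NodeConfig(name="orders", sql="SELECT 1", description="first draft")
b = NodeConfig(name="orders", sql="SELECT 1", description="reworded docs only")

# description, tags, and log_level are excluded by default, so cosmetic edits
# do not change the hash; changing `sql` would.
assert calculate_node_hash(a) == calculate_node_hash(b)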
odibi/utils/logging.py
ADDED
@@ -0,0 +1,203 @@
"""Structured logging for Odibi framework.

This module provides the core logging infrastructure:
- StructuredLogger: Base logger with JSON/human-readable output
- LoggingContext: Context-aware logging wrapper (imported from logging_context)
- Secret redaction for sensitive data

For enhanced observability features, use:
    from odibi.utils.logging_context import LoggingContext, OperationType
"""

import codecs
import json
import logging
import sys
from datetime import datetime, timezone

try:
    from rich.console import Console
    from rich.logging import RichHandler

    RICH_AVAILABLE = True
except ImportError:
    RICH_AVAILABLE = False


class StructuredLogger:
    """Logger that supports both human-readable and JSON output with secret redaction.

    This is the base logging class for Odibi. For context-aware logging with
    automatic pipeline/node tracking, use LoggingContext instead.

    Example:
        >>> logger = StructuredLogger(structured=True, level="DEBUG")
        >>> logger.info("Processing started", pipeline="daily_etl", rows=1000)
    """

    def __init__(self, structured: bool = False, level: str = "INFO"):
        """Initialize structured logger.

        Args:
            structured: If True, output JSON logs; otherwise human-readable
            level: Log level (DEBUG, INFO, WARNING, ERROR)
        """
        self.structured = structured
        self.level = getattr(logging, level.upper(), logging.INFO)
        self._secrets: set = set()
        self._initialized = False

        if (
            sys.platform == "win32"
            and sys.stdout
            and sys.stdout.encoding
            and sys.stdout.encoding.lower() != "utf-8"
        ):
            try:
                sys.stdout.reconfigure(encoding="utf-8")
            except AttributeError:
                sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())

        self._setup_handlers()

    def _setup_handlers(self) -> None:
        """Set up logging handlers."""
        if self._initialized:
            return

        if not self.structured and RICH_AVAILABLE:
            logging.basicConfig(
                level=self.level,
                format="%(message)s",
                datefmt="[%X]",
                handlers=[
                    RichHandler(
                        rich_tracebacks=True,
                        markup=True,
                        show_path=False,
                        console=(
                            Console(force_terminal=True, legacy_windows=False)
                            if sys.platform == "win32"
                            else None
                        ),
                    )
                ],
            )
        else:
            logging.basicConfig(level=self.level, format="%(message)s", stream=sys.stdout)

        self.logger = logging.getLogger("odibi")
        self.logger.setLevel(self.level)

        third_party_level = max(self.level, logging.WARNING)
        for logger_name in [
            "py4j",
            "azure",
            "azure.core.pipeline.policies.http_logging_policy",
            "adlfs",
            "urllib3",
            "fsspec",
        ]:
            logging.getLogger(logger_name).setLevel(third_party_level)

        self._initialized = True

    def register_secret(self, secret: str) -> None:
        """Register a secret string to be redacted from logs.

        Args:
            secret: Secret value to redact (passwords, keys, tokens)
        """
        if secret and isinstance(secret, str) and len(secret.strip()) > 0:
            self._secrets.add(secret)

    def _redact(self, text: str) -> str:
        """Redact registered secrets from text.

        Args:
            text: Text to redact

        Returns:
            Text with secrets replaced by [REDACTED]
        """
        if not text or not self._secrets:
            return text

        for secret in self._secrets:
            if secret in text:
                text = text.replace(secret, "[REDACTED]")
        return text

    def info(self, message: str, **kwargs) -> None:
        """Log info message."""
        self._log("INFO", message, **kwargs)

    def warning(self, message: str, **kwargs) -> None:
        """Log warning message."""
        self._log("WARNING", message, **kwargs)

    def error(self, message: str, **kwargs) -> None:
        """Log error message."""
        self._log("ERROR", message, **kwargs)

    def debug(self, message: str, **kwargs) -> None:
        """Log debug message."""
        self._log("DEBUG", message, **kwargs)

    def _log(self, level: str, message: str, **kwargs) -> None:
        """Internal log method with redaction and formatting.

        Args:
            level: Log level
            message: Log message
            **kwargs: Additional context to include
        """
        level_val = getattr(logging, level, logging.INFO)
        if level_val < self.level:
            return

        message = self._redact(str(message))

        redacted_kwargs = {}
        for k, v in kwargs.items():
            if isinstance(v, str):
                redacted_kwargs[k] = self._redact(v)
            elif v is None:
                continue
            else:
                redacted_kwargs[k] = v

        if self.structured:
            log_entry = {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "level": level,
                "message": message,
                **redacted_kwargs,
            }
            print(json.dumps(log_entry, default=str))
        else:
            context_str = ""
            if redacted_kwargs:
                context_items = [f"{k}={v}" for k, v in redacted_kwargs.items()]
                context_str = f" ({', '.join(context_items)})"

            formatted_msg = f"{message}{context_str}"

            if level == "INFO":
                self.logger.info(formatted_msg)
            elif level == "WARNING":
                self.logger.warning(f"[WARN] {formatted_msg}")
            elif level == "ERROR":
                self.logger.error(f"[ERROR] {formatted_msg}")
            elif level == "DEBUG":
                self.logger.debug(f"[DEBUG] {formatted_msg}")


# Global instance to be initialized
logger = StructuredLogger()


def configure_logging(structured: bool, level: str):
    """Configure the global logger."""
    global logger
    logger = StructuredLogger(structured=structured, level=level)
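
A minimal sketch of the global logger with secret redaction in structured (JSON) mode; the token value is made up:

from odibi.utils.logging import configure_logging

configure_logging(structured=True, level="INFO")

# configure_logging rebinds the module-level `logger`, so import it afterwards
from odibi.utils.logging import logger

logger.register_secret("s3cr3t-token")
logger.info("Connecting", url="https://api.example.com?token=s3cr3t-token")
# prints something like:
# {"timestamp": "...", "level": "INFO", "message": "Connecting",
#  "url": "https://api.example.com?token=[REDACTED]"}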