odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/diagnostics/manager.py
ADDED

@@ -0,0 +1,171 @@
"""
Diagnostics Manager
===================

Handles loading and managing run history for diagnostics.
"""

import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from odibi.story.metadata import DeltaWriteInfo, NodeExecutionMetadata, PipelineStoryMetadata


class HistoryManager:
    """Manages access to pipeline run history."""

    def __init__(self, history_path: str = "stories/"):
        """
        Initialize history manager.

        Args:
            history_path: Path where stories are stored
        """
        self.history_path = Path(history_path)
        self.is_remote = "://" in history_path

    def list_runs(self, pipeline_name: str) -> List[Dict[str, str]]:
        """
        List available runs for a pipeline.

        Returns:
            List of dicts with keys: run_id, timestamp, path
        """
        runs = []

        if self.is_remote:
            # Remote listing not implemented yet
            return []

        if not self.history_path.exists():
            return []

        # Look for .json files
        # Pattern: {pipeline_name}_{timestamp}.json
        pattern = f"{pipeline_name}_*.json"

        for path in self.history_path.glob(pattern):
            try:
                # Parse timestamp from filename
                # Filename: name_YYYYMMDD_HHMMSS.json
                parts = path.stem.split("_")
                if len(parts) >= 3:
                    ts_str = f"{parts[-2]}_{parts[-1]}"
                    # Validate format
                    datetime.strptime(ts_str, "%Y%m%d_%H%M%S")

                    runs.append({"run_id": ts_str, "timestamp": ts_str, "path": str(path)})
            except (ValueError, IndexError):
                continue

        # Sort by timestamp descending (newest first)
        runs.sort(key=lambda x: x["timestamp"], reverse=True)
        return runs

    def get_latest_run(self, pipeline_name: str) -> Optional[PipelineStoryMetadata]:
        """Get the most recent run metadata."""
        runs = self.list_runs(pipeline_name)
        if not runs:
            return None

        return self.load_run(runs[0]["path"])

    def get_run_by_id(self, pipeline_name: str, run_id: str) -> Optional[PipelineStoryMetadata]:
        """Get specific run metadata."""
        runs = self.list_runs(pipeline_name)
        for run in runs:
            if run["run_id"] == run_id:
                return self.load_run(run["path"])
        return None

    def get_previous_run(
        self, pipeline_name: str, current_run_id: str
    ) -> Optional[PipelineStoryMetadata]:
        """Get the run immediately preceding the specified one."""
        runs = self.list_runs(pipeline_name)

        found_current = False
        for run in runs:
            if found_current:
                return self.load_run(run["path"])

            if run["run_id"] == current_run_id:
                found_current = True

        return None

    def load_run(self, path: str) -> PipelineStoryMetadata:
        """Load run metadata from JSON file."""
        if self.is_remote:
            raise NotImplementedError("Remote history loading not supported yet")

        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        return self._dict_to_metadata(data)

    def _dict_to_metadata(self, data: Dict) -> PipelineStoryMetadata:
        """Convert dictionary to PipelineStoryMetadata object."""
        nodes = []
        for n in data.get("nodes", []):
            # Reconstruct Delta info
            delta_info = None
            if n.get("delta_info"):
                d = n["delta_info"]
                delta_info = DeltaWriteInfo(
                    version=d.get("version"),
                    timestamp=(
                        datetime.fromisoformat(d.get("timestamp")) if d.get("timestamp") else None
                    ),
                    operation=d.get("operation"),
                    operation_metrics=d.get("operation_metrics"),
                    read_version=d.get("read_version"),
                )

            node = NodeExecutionMetadata(
                node_name=n["node_name"],
                operation=n.get("operation", "unknown"),
                status=n.get("status", "unknown"),
                duration=n.get("duration", 0.0),
                rows_in=n.get("rows_in"),
                rows_out=n.get("rows_out"),
                rows_change=n.get("rows_change"),
                rows_change_pct=n.get("rows_change_pct"),
                sample_data=n.get("sample_data"),
                schema_in=n.get("schema_in"),
                schema_out=n.get("schema_out"),
                columns_added=n.get("columns_added", []),
                columns_removed=n.get("columns_removed", []),
                columns_renamed=n.get("columns_renamed", []),
                executed_sql=n.get("executed_sql", []),
                sql_hash=n.get("sql_hash"),
                transformation_stack=n.get("transformation_stack", []),
                config_snapshot=n.get("config_snapshot"),
                delta_info=delta_info,
                data_diff=n.get("data_diff"),
                error_message=n.get("error_message"),
                error_type=n.get("error_type"),
                started_at=n.get("started_at"),
                completed_at=n.get("completed_at"),
            )
            nodes.append(node)

        return PipelineStoryMetadata(
            pipeline_name=data["pipeline_name"],
            pipeline_layer=data.get("pipeline_layer"),
            started_at=data.get("started_at"),
            completed_at=data.get("completed_at"),
            duration=data.get("duration", 0.0),
            total_nodes=data.get("total_nodes", 0),
            completed_nodes=data.get("completed_nodes", 0),
            failed_nodes=data.get("failed_nodes", 0),
            skipped_nodes=data.get("skipped_nodes", 0),
            nodes=nodes,
            project=data.get("project"),
            plant=data.get("plant"),
            asset=data.get("asset"),
            business_unit=data.get("business_unit"),
            theme=data.get("theme", "default"),
        )
odibi/engine/__init__.py
ADDED

@@ -0,0 +1,20 @@
"""Engine implementations for ODIBI."""

from odibi.engine.base import Engine
from odibi.engine.pandas_engine import PandasEngine

# Try to import SparkEngine (optional dependency)
try:
    from odibi.engine.spark_engine import SparkEngine

    __all__ = ["Engine", "PandasEngine", "SparkEngine"]
except ImportError:
    # PySpark not available
    __all__ = ["Engine", "PandasEngine"]


# Lazy import helper for Spark (backward compatibility)
def get_spark_engine():
    from .spark_engine import SparkEngine

    return SparkEngine
odibi/engine/base.py
ADDED

@@ -0,0 +1,334 @@
"""Base engine interface."""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

from odibi.context import Context


class Engine(ABC):
    """Abstract base class for execution engines."""

    # Custom format registry
    _custom_readers: Dict[str, Any] = {}
    _custom_writers: Dict[str, Any] = {}

    @classmethod
    def register_format(cls, fmt: str, reader: Optional[Any] = None, writer: Optional[Any] = None):
        """Register custom format reader/writer.

        Args:
            fmt: Format name (e.g. 'netcdf')
            reader: Function(path, **options) -> DataFrame
            writer: Function(df, path, **options) -> None
        """
        if reader:
            cls._custom_readers[fmt] = reader
        if writer:
            cls._custom_writers[fmt] = writer

    @abstractmethod
    def read(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        options: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Read data from source.

        Args:
            connection: Connection object
            format: Data format (csv, parquet, delta, etc.)
            table: Table name (for SQL/Delta)
            path: File path (for file-based sources)
            options: Format-specific options

        Returns:
            DataFrame (engine-specific type)
        """
        pass

    def materialize(self, df: Any) -> Any:
        """Materialize lazy dataset into memory (DataFrame).

        Args:
            df: DataFrame or LazyDataset

        Returns:
            Materialized DataFrame
        """
        return df

    @abstractmethod
    def write(
        self,
        df: Any,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        mode: str = "overwrite",
        options: Optional[Dict[str, Any]] = None,
        streaming_config: Optional[Any] = None,
    ) -> None:
        """Write data to destination.

        Args:
            df: DataFrame to write
            connection: Connection object
            format: Output format
            table: Table name (for SQL/Delta)
            path: File path (for file-based outputs)
            mode: Write mode (overwrite/append)
            options: Format-specific options
        """
        pass

    @abstractmethod
    def execute_sql(self, sql: str, context: Context) -> Any:
        """Execute SQL query.

        Args:
            sql: SQL query string
            context: Execution context with registered DataFrames

        Returns:
            Result DataFrame
        """
        pass

    @abstractmethod
    def execute_operation(self, operation: str, params: Dict[str, Any], df: Any) -> Any:
        """Execute built-in operation (pivot, etc.).

        Args:
            operation: Operation name
            params: Operation parameters
            df: Input DataFrame

        Returns:
            Result DataFrame
        """
        pass

    @abstractmethod
    def get_schema(self, df: Any) -> Any:
        """Get DataFrame schema.

        Args:
            df: DataFrame

        Returns:
            Dict[str, str] mapping column names to types, or List[str] of names (deprecated)
        """
        pass

    @abstractmethod
    def get_shape(self, df: Any) -> tuple:
        """Get DataFrame shape.

        Args:
            df: DataFrame

        Returns:
            (rows, columns)
        """
        pass

    @abstractmethod
    def count_rows(self, df: Any) -> int:
        """Count rows in DataFrame.

        Args:
            df: DataFrame

        Returns:
            Row count
        """
        pass

    @abstractmethod
    def count_nulls(self, df: Any, columns: List[str]) -> Dict[str, int]:
        """Count nulls in specified columns.

        Args:
            df: DataFrame
            columns: Columns to check

        Returns:
            Dictionary of column -> null count
        """
        pass

    @abstractmethod
    def validate_schema(self, df: Any, schema_rules: Dict[str, Any]) -> List[str]:
        """Validate DataFrame schema.

        Args:
            df: DataFrame
            schema_rules: Validation rules

        Returns:
            List of validation failures (empty if valid)
        """
        pass

    @abstractmethod
    def validate_data(self, df: Any, validation_config: Any) -> List[str]:
        """Validate data against rules.

        Args:
            df: DataFrame to validate
            validation_config: ValidationConfig object

        Returns:
            List of validation failure messages (empty if valid)
        """
        pass

    @abstractmethod
    def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
        """Get sample rows as list of dictionaries.

        Args:
            df: DataFrame
            n: Number of rows to return

        Returns:
            List of row dictionaries
        """
        pass

    def get_source_files(self, df: Any) -> List[str]:
        """Get list of source files that generated this DataFrame.

        Args:
            df: DataFrame

        Returns:
            List of file paths (or empty list if not applicable/supported)
        """
        return []

    def profile_nulls(self, df: Any) -> Dict[str, float]:
        """Calculate null percentage for each column.

        Args:
            df: DataFrame

        Returns:
            Dictionary of {column_name: null_percentage} (0.0 to 1.0)
        """
        return {}

    @abstractmethod
    def table_exists(
        self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
    ) -> bool:
        """Check if table or location exists.

        Args:
            connection: Connection object
            table: Table name (for catalog tables)
            path: File path (for path-based tables)

        Returns:
            True if table/location exists, False otherwise
        """
        pass

    @abstractmethod
    def harmonize_schema(self, df: Any, target_schema: Dict[str, str], policy: Any) -> Any:
        """Harmonize DataFrame schema with target schema according to policy.

        Args:
            df: Input DataFrame
            target_schema: Target schema (column name -> type)
            policy: SchemaPolicyConfig object

        Returns:
            Harmonized DataFrame
        """
        pass

    @abstractmethod
    def anonymize(
        self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
    ) -> Any:
        """Anonymize specified columns.

        Args:
            df: DataFrame to anonymize
            columns: List of columns to anonymize
            method: Method ('hash', 'mask', 'redact')
            salt: Optional salt for hashing

        Returns:
            Anonymized DataFrame
        """
        pass

    def get_table_schema(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
        format: Optional[str] = None,
    ) -> Optional[Dict[str, str]]:
        """Get schema of an existing table/file.

        Args:
            connection: Connection object
            table: Table name
            path: File path
            format: Data format (optional, helps with file-based sources)

        Returns:
            Schema dict or None if table doesn't exist or schema fetch fails.
        """
        return None

    def maintain_table(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        config: Optional[Any] = None,
    ) -> None:
        """Run table maintenance operations (optimize, vacuum).

        Args:
            connection: Connection object
            format: Table format
            table: Table name
            path: Table path
            config: AutoOptimizeConfig object
        """
        pass

    def add_write_metadata(
        self,
        df: Any,
        metadata_config: Any,
        source_connection: Optional[str] = None,
        source_table: Optional[str] = None,
        source_path: Optional[str] = None,
        is_file_source: bool = False,
    ) -> Any:
        """Add metadata columns to DataFrame before writing (Bronze layer lineage).

        Args:
            df: DataFrame
            metadata_config: WriteMetadataConfig or True (for all defaults)
            source_connection: Name of the source connection
            source_table: Name of the source table (SQL sources)
            source_path: Path of the source file (file sources)
            is_file_source: True if source is a file-based read

        Returns:
            DataFrame with metadata columns added (or unchanged if metadata_config is None/False)
        """
        return df  # Default: no-op