odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/themes.py
ADDED
@@ -0,0 +1,216 @@
"""
Theme System
============

Customizable themes for story rendering with branding support.
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional

import yaml


@dataclass
class StoryTheme:
    """
    Story theme configuration.

    Defines colors, typography, branding, and layout options for
    rendered stories (HTML).
    """

    name: str

    # Colors
    primary_color: str = "#0066cc"
    success_color: str = "#28a745"
    error_color: str = "#dc3545"
    warning_color: str = "#ffc107"
    bg_color: str = "#ffffff"
    text_color: str = "#333333"
    border_color: str = "#dddddd"
    code_bg: str = "#f5f5f5"

    # Typography
    font_family: str = (
        "system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif"
    )
    heading_font: str = "inherit"
    code_font: str = "Consolas, Monaco, 'Courier New', monospace"
    font_size: str = "16px"

    # Branding
    logo_url: Optional[str] = None
    company_name: Optional[str] = None
    footer_text: Optional[str] = None

    # Layout
    max_width: str = "1200px"
    sidebar: bool = False

    # Custom CSS
    custom_css: Optional[str] = None

    def to_css_vars(self) -> Dict[str, str]:
        """
        Convert theme to CSS variables.

        Returns:
            Dictionary of CSS variable names and values
        """
        return {
            "--primary-color": self.primary_color,
            "--success-color": self.success_color,
            "--error-color": self.error_color,
            "--warning-color": self.warning_color,
            "--bg-color": self.bg_color,
            "--text-color": self.text_color,
            "--border-color": self.border_color,
            "--code-bg": self.code_bg,
            "--font-family": self.font_family,
            "--heading-font": self.heading_font,
            "--code-font": self.code_font,
            "--font-size": self.font_size,
            "--max-width": self.max_width,
        }

    def to_css_string(self) -> str:
        """
        Generate CSS string from theme.

        Returns:
            CSS string with :root variables
        """
        lines = [":root {"]
        for var_name, var_value in self.to_css_vars().items():
            lines.append(f"  {var_name}: {var_value};")
        lines.append("}")

        if self.custom_css:
            lines.append("")
            lines.append(self.custom_css)

        return "\n".join(lines)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StoryTheme":
        """
        Create theme from dictionary.

        Args:
            data: Theme configuration dictionary

        Returns:
            StoryTheme instance
        """
        return cls(**data)

    @classmethod
    def from_yaml(cls, path: str) -> "StoryTheme":
        """
        Load theme from YAML file.

        Args:
            path: Path to YAML theme file

        Returns:
            StoryTheme instance
        """
        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return cls.from_dict(data)


# Built-in Themes
# ===============

DEFAULT_THEME = StoryTheme(
    name="default",
)

CORPORATE_THEME = StoryTheme(
    name="corporate",
    primary_color="#003366",
    success_color="#2e7d32",
    error_color="#c62828",
    font_family="Arial, Helvetica, sans-serif",
    heading_font="Georgia, 'Times New Roman', serif",
    font_size="15px",
)

DARK_THEME = StoryTheme(
    name="dark",
    primary_color="#00bfff",
    success_color="#4caf50",
    error_color="#f44336",
    warning_color="#ffb300",
    bg_color="#1e1e1e",
    text_color="#e0e0e0",
    border_color="#444444",
    code_bg="#2d2d2d",
    custom_css="""
body { background: #121212; }
.container { background: #1e1e1e; }
.node-header { background: #2d2d2d; }
.summary { background: #2d2d2d; }
""",
)

MINIMAL_THEME = StoryTheme(
    name="minimal",
    primary_color="#000000",
    success_color="#006600",
    error_color="#cc0000",
    warning_color="#ff9900",
    font_family="'Helvetica Neue', Helvetica, Arial, sans-serif",
    heading_font="'Helvetica Neue', Helvetica, Arial, sans-serif",
    font_size="14px",
    max_width="900px",
)

# Theme registry
BUILTIN_THEMES = {
    "default": DEFAULT_THEME,
    "corporate": CORPORATE_THEME,
    "dark": DARK_THEME,
    "minimal": MINIMAL_THEME,
}


def get_theme(name: str) -> StoryTheme:
    """
    Get theme by name.

    Args:
        name: Theme name or path to YAML theme file

    Returns:
        StoryTheme instance

    Raises:
        ValueError: If theme not found
    """
    # Check if it's a file path
    if Path(name).exists():
        return StoryTheme.from_yaml(name)

    # Check built-in themes
    if name.lower() in BUILTIN_THEMES:
        return BUILTIN_THEMES[name.lower()]

    raise ValueError(
        f"Theme '{name}' not found. Available themes: {', '.join(BUILTIN_THEMES.keys())}"
    )


def list_themes() -> Dict[str, StoryTheme]:
    """
    List all available built-in themes.

    Returns:
        Dictionary of theme name -> StoryTheme
    """
    return BUILTIN_THEMES.copy()
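
For orientation, a minimal usage sketch of the module above; it is not taken from the package itself, and the brand colors and company name are invented for the example.

# Illustrative only -- uses StoryTheme, get_theme, and list_themes as defined above.
from odibi.story.themes import StoryTheme, get_theme, list_themes

dark = get_theme("dark")          # built-in name (case-insensitive) or a path to a YAML theme file
print(dark.to_css_string())       # ":root { ... }" variable block plus the theme's custom_css

brand = StoryTheme.from_dict(
    {
        "name": "acme",           # hypothetical branding values
        "primary_color": "#ff6600",
        "company_name": "ACME Co.",
    }
)
print(brand.to_css_vars()["--primary-color"])  # "#ff6600"
print(sorted(list_themes()))                   # ['corporate', 'dark', 'default', 'minimal']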

odibi/testing/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""Testing utilities for Odibi."""

from .assertions import assert_frame_equal, assert_schema_equal
from .fixtures import generate_sample_data, temp_directory

__version__ = "1.3.0-alpha.1"

__all__ = [
    "temp_directory",
    "generate_sample_data",
    "assert_frame_equal",
    "assert_schema_equal",
]

odibi/testing/assertions.py
ADDED
@@ -0,0 +1,75 @@
"""
Testing Assertions
==================

Helpers for asserting DataFrame equality and properties.
"""

from typing import Any, List

import pandas as pd


def assert_frame_equal(
    left: Any,
    right: Any,
    check_dtype: bool = True,
    check_exact: bool = False,
    atol: float = 1e-8,
    rtol: float = 1e-5,
) -> None:
    """
    Assert that two DataFrames are equal.
    Supports both Pandas and Spark DataFrames.

    Args:
        left: First DataFrame
        right: Second DataFrame
        check_dtype: Whether to check data types
        check_exact: Whether to compare numbers exactly
        atol: Absolute tolerance
        rtol: Relative tolerance
    """
    # Convert Spark to Pandas for comparison if needed
    left_pdf = _to_pandas(left)
    right_pdf = _to_pandas(right)

    # Sort by first column to ensure order doesn't matter (Spark is unordered)
    if not left_pdf.empty and not right_pdf.empty:
        sort_col = left_pdf.columns[0]
        left_pdf = left_pdf.sort_values(sort_col).reset_index(drop=True)
        right_pdf = right_pdf.sort_values(sort_col).reset_index(drop=True)

    pd.testing.assert_frame_equal(
        left_pdf, right_pdf, check_dtype=check_dtype, check_exact=check_exact, atol=atol, rtol=rtol
    )


def assert_schema_equal(left: Any, right: Any) -> None:
    """
    Assert that two DataFrames have the same schema (column names and types).
    """
    # Simplified check for column names
    left_cols = sorted(_get_columns(left))
    right_cols = sorted(_get_columns(right))

    assert left_cols == right_cols, f"Schema mismatch: {left_cols} != {right_cols}"


def _to_pandas(df: Any) -> pd.DataFrame:
    """Convert to Pandas DataFrame if not already."""
    if isinstance(df, pd.DataFrame):
        return df

    # Assume Spark DataFrame
    try:
        return df.toPandas()
    except AttributeError:
        raise TypeError(f"Expected DataFrame, got {type(df)}")


def _get_columns(df: Any) -> List[str]:
    """Get column names."""
    if isinstance(df, pd.DataFrame):
        return list(df.columns)
    return list(df.columns)
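
A short pytest-style sketch of how these helpers compose; the test name and DataFrames are invented for illustration and are not part of the package.

# Illustrative pytest-style test using the assertion helpers defined above.
import pandas as pd

from odibi.testing import assert_frame_equal, assert_schema_equal


def test_transform_output_matches_expected():
    expected = pd.DataFrame({"id": [1, 2], "value": [10.0, 20.0]})
    # Rows arrive in a different order with tiny float noise; assert_frame_equal
    # sorts by the first column and compares within atol/rtol tolerances.
    actual = pd.DataFrame({"id": [2, 1], "value": [20.000001, 10.0]})

    assert_schema_equal(actual, expected)
    assert_frame_equal(actual, expected, rtol=1e-3)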

odibi/testing/fixtures.py
ADDED
@@ -0,0 +1,85 @@
"""
Testing Fixtures
================

Reusable fixtures for testing pipelines and transformations.
"""

import shutil
import tempfile
from contextlib import contextmanager
from typing import Any, Dict, Generator, Optional

import numpy as np
import pandas as pd


@contextmanager
def temp_directory() -> Generator[str, None, None]:
    """
    Create a temporary directory for test artifacts.

    Yields:
        Path to the temporary directory.

    Example:
        with temp_directory() as temp_dir:
            path = os.path.join(temp_dir, "test.csv")
            df.to_csv(path)
    """
    temp_dir = tempfile.mkdtemp()
    try:
        yield temp_dir
    finally:
        shutil.rmtree(temp_dir)


def generate_sample_data(
    rows: int = 10, engine_type: str = "pandas", schema: Optional[Dict[str, str]] = None
) -> Any:
    """
    Generate a sample DataFrame (Pandas or Spark).

    Args:
        rows: Number of rows to generate
        engine_type: "pandas" or "spark"
        schema: Optional dictionary of {column_name: type}
            Supported types: "int", "float", "str", "date"

    Returns:
        DataFrame (pd.DataFrame or pyspark.sql.DataFrame)
    """
    from datetime import datetime, timedelta

    # Default schema if none provided
    if not schema:
        schema = {"id": "int", "value": "float", "category": "str", "timestamp": "date"}

    data = {}
    for col, dtype in schema.items():
        if dtype == "int":
            data[col] = np.random.randint(0, 1000, rows)
        elif dtype == "float":
            data[col] = np.random.rand(rows) * 100
        elif dtype == "str":
            data[col] = [f"val_{i}" for i in range(rows)]
        elif dtype == "date":
            base_date = datetime.now()
            data[col] = [base_date - timedelta(days=i) for i in range(rows)]

    pdf = pd.DataFrame(data)

    if engine_type == "pandas":
        return pdf

    if engine_type == "spark":
        try:
            from pyspark.sql import SparkSession

            # Try to get existing session or create new one
            spark = SparkSession.builder.master("local[*]").appName("odibi-test").getOrCreate()
            return spark.createDataFrame(pdf)
        except ImportError:
            raise ImportError("Spark not installed. Run 'pip install odibi[spark]'")

    raise ValueError(f"Unknown engine type: {engine_type}")
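
A small sketch combining the two fixtures above in a pandas round-trip; it is not part of the package, and the file name is arbitrary.

# Illustrative use of temp_directory and generate_sample_data from above.
import os

from odibi.testing import generate_sample_data, temp_directory

df = generate_sample_data(rows=5)            # default schema: id, value, category, timestamp
assert len(df) == 5

with temp_directory() as tmp:
    path = os.path.join(tmp, "sample.csv")   # arbitrary file name for the example
    df.to_csv(path, index=False)
    assert os.path.exists(path)
# The directory and everything in it is removed when the context exits.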

odibi/testing/source_pool.py
ADDED
@@ -0,0 +1,277 @@
"""
SourcePool: Deterministic, frozen test data sources for Odibi testing.

Phase 7.B.1 - Preparation only, NO runtime logic.

This module defines the schema and metadata structures for deterministic,
replayable data sources that exercise all supported Odibi data types and
ingestion paths.
"""

from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field, field_validator, model_validator


# ============================================
# Enums for SourcePool Configuration
# ============================================


class FileFormat(str, Enum):
    """Supported file formats for source pools."""

    CSV = "csv"
    JSON = "json"
    PARQUET = "parquet"
    AVRO = "avro"
    DELTA = "delta"


class SourceType(str, Enum):
    """Supported source types for ingestion testing."""

    LOCAL = "local"
    ADLS_EMULATED = "adls_emulated"
    AZURE_BLOB_EMULATED = "azure_blob_emulated"
    SQL_JDBC_LOCAL = "sql_jdbc_local"
    CLOUDFILES = "cloudfiles"


class DataQuality(str, Enum):
    """Classification of data cleanliness."""

    CLEAN = "clean"  # No nulls, no duplicates, valid types
    MESSY = "messy"  # Contains nulls, edge cases, type issues
    MIXED = "mixed"  # Combination of clean and messy partitions


class PoolStatus(str, Enum):
    """Lifecycle status of a source pool."""

    DRAFT = "draft"  # Schema defined, data not yet prepared
    FROZEN = "frozen"  # Data prepared and hash-verified
    DEPRECATED = "deprecated"  # Marked for removal


# ============================================
# Schema Definitions (explicit, no inference)
# ============================================


class ColumnSchema(BaseModel):
    """Explicit column definition - NO runtime inference."""

    name: str = Field(description="Column name")
    dtype: str = Field(description="Data type (string, int64, float64, bool, datetime, etc.)")
    nullable: bool = Field(default=False, description="Whether column allows nulls")
    primary_key: bool = Field(default=False, description="Part of primary key")
    description: Optional[str] = Field(default=None, description="Column documentation")
    sample_values: Optional[List[Any]] = Field(
        default=None,
        description="Example values for documentation (not used at runtime)",
    )


class TableSchema(BaseModel):
    """Complete table schema definition."""

    columns: List[ColumnSchema] = Field(description="Ordered list of columns")
    primary_keys: Optional[List[str]] = Field(
        default=None, description="List of primary key column names"
    )
    partition_columns: Optional[List[str]] = Field(
        default=None, description="Partition columns (for Delta/Parquet)"
    )

    @model_validator(mode="after")
    def validate_pk_columns_exist(self):
        """Ensure all primary key columns exist in schema."""
        if self.primary_keys:
            col_names = {c.name for c in self.columns}
            for pk in self.primary_keys:
                if pk not in col_names:
                    raise ValueError(f"Primary key column '{pk}' not in schema")
        return self


# ============================================
# Data Characteristics Metadata
# ============================================


class DataCharacteristics(BaseModel):
    """Metadata about data characteristics for test coverage."""

    row_count: int = Field(ge=0, description="Exact row count (deterministic)")
    has_nulls: bool = Field(default=False, description="Contains null values")
    has_duplicates: bool = Field(default=False, description="Contains duplicate keys")
    has_unicode: bool = Field(default=False, description="Contains non-ASCII characters")
    has_special_chars: bool = Field(default=False, description="Contains newlines, quotes, etc.")
    has_empty_strings: bool = Field(default=False, description="Contains empty string values")
    has_whitespace_issues: bool = Field(
        default=False, description="Leading/trailing whitespace in strings"
    )
    has_type_coercion_cases: bool = Field(
        default=False, description="Values that may coerce unexpectedly"
    )
    date_range: Optional[Dict[str, str]] = Field(
        default=None,
        description="Date range {min: ISO date, max: ISO date}",
    )
    numeric_ranges: Optional[Dict[str, Dict[str, float]]] = Field(
        default=None,
        description="Numeric column ranges {column: {min: v, max: v}}",
    )


# ============================================
# Integrity & Hashing
# ============================================


class IntegrityManifest(BaseModel):
    """Cryptographic integrity manifest for frozen source pools."""

    algorithm: Literal["sha256"] = "sha256"
    file_hashes: Dict[str, str] = Field(description="Map of relative file path -> SHA256 hash")
    manifest_hash: str = Field(
        description="SHA256 hash of sorted file_hashes for quick verification"
    )
    frozen_at: datetime = Field(description="Timestamp when pool was frozen")
    frozen_by: str = Field(default="system", description="User/system that froze the pool")


# ============================================
# Source Pool Definition (Main Schema)
# ============================================


class SourcePoolConfig(BaseModel):
    """
    Complete SourcePool definition.

    This is the primary schema for defining deterministic test data sources.

    Invariants:
    - All data is disk-backed and hashable
    - Schemas are explicit (no runtime inference)
    - Metadata is complete and machine-readable
    - Sources are immutable once frozen
    """

    # === Identification ===
    pool_id: str = Field(
        description="Unique identifier (e.g., 'nyc_taxi_csv_clean')",
        pattern=r"^[a-z][a-z0-9_]*$",
    )
    version: str = Field(
        default="1.0.0",
        description="Semantic version for tracking pool evolution",
    )
    name: str = Field(description="Human-readable name")
    description: str = Field(description="Detailed description of the dataset")

    # === Source Configuration ===
    file_format: FileFormat = Field(description="File format")
    source_type: SourceType = Field(description="Source/ingestion type to test")
    data_quality: DataQuality = Field(description="Clean/messy/mixed classification")

    # === Schema (Explicit, No Inference) ===
    schema: TableSchema = Field(description="Explicit schema definition")

    # === Disk Location (relative to .odibi/source_cache/) ===
    cache_path: str = Field(
        description="Relative path under .odibi/source_cache/ (e.g., 'nyc_taxi/csv/clean/')"
    )

    # === Data Characteristics ===
    characteristics: DataCharacteristics = Field(
        description="Metadata about data properties for test coverage"
    )

    # === Status & Integrity ===
    status: PoolStatus = Field(
        default=PoolStatus.DRAFT,
        description="Current lifecycle status",
    )
    integrity: Optional[IntegrityManifest] = Field(
        default=None,
        description="Integrity manifest (required when status=frozen)",
    )

    # === Provenance ===
    original_source: Optional[str] = Field(
        default=None,
        description="URL or reference to original public dataset",
    )
    license: Optional[str] = Field(
        default=None,
        description="Data license (e.g., 'CC0', 'MIT', 'Public Domain')",
    )
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="When this pool definition was created",
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="Last modification timestamp",
    )

    # === Test Coverage Hints ===
    tests_coverage: List[str] = Field(
        default_factory=list,
        description="List of test scenarios this pool covers (e.g., 'null_handling', 'unicode_support')",
    )
    compatible_pipelines: List[str] = Field(
        default_factory=list,
        description="Pipeline patterns this pool is designed for (e.g., 'bronze_ingestion', 'silver_dedup')",
    )

    @model_validator(mode="after")
    def validate_frozen_has_integrity(self):
        """Frozen pools must have integrity manifest."""
        if self.status == PoolStatus.FROZEN and not self.integrity:
            raise ValueError(f"Pool '{self.pool_id}': frozen status requires integrity manifest")
        return self

    @field_validator("cache_path")
    @classmethod
    def validate_cache_path(cls, v: str) -> str:
        """Ensure cache_path is relative and safe."""
        if v.startswith("/") or v.startswith("\\") or ".." in v:
            raise ValueError(f"cache_path must be relative without '..': {v}")
        return v.replace("\\", "/")


# ============================================
# Source Pool Index (Registry)
# ============================================


class SourcePoolIndex(BaseModel):
    """
    Index of all registered source pools.

    Stored at: .odibi/source_metadata/pool_index.yaml
    """

    version: str = Field(default="1.0.0", description="Index schema version")
    updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    pools: Dict[str, str] = Field(
        default_factory=dict,
        description="Map of pool_id -> metadata file path (relative to source_metadata/)",
    )

    def add_pool(self, pool_id: str, metadata_path: str) -> None:
        """Register a pool in the index."""
        self.pools[pool_id] = metadata_path
        self.updated_at = datetime.now(timezone.utc)

    def remove_pool(self, pool_id: str) -> None:
        """Remove a pool from the index."""
        if pool_id in self.pools:
            del self.pools[pool_id]
            self.updated_at = datetime.now(timezone.utc)
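
A sketch of declaring a draft pool against these models; it is not part of the package, the pool id, schema, paths, row counts, and coverage tags are invented examples, and freezing (attaching an IntegrityManifest) is outside this preparation-only module.

# Illustrative draft pool definition; all concrete values are made up.
from odibi.testing.source_pool import (
    ColumnSchema,
    DataCharacteristics,
    DataQuality,
    FileFormat,
    PoolStatus,
    SourcePoolConfig,
    SourceType,
    TableSchema,
)

pool = SourcePoolConfig(
    pool_id="orders_csv_messy",
    name="Orders (messy CSV)",
    description="Small CSV with nulls and duplicate keys for ingestion tests.",
    file_format=FileFormat.CSV,
    source_type=SourceType.LOCAL,
    data_quality=DataQuality.MESSY,
    schema=TableSchema(
        columns=[
            ColumnSchema(name="order_id", dtype="int64", primary_key=True),
            ColumnSchema(name="amount", dtype="float64", nullable=True),
        ],
        primary_keys=["order_id"],
    ),
    cache_path="orders/csv/messy/",
    characteristics=DataCharacteristics(row_count=120, has_nulls=True, has_duplicates=True),
    tests_coverage=["null_handling", "duplicate_keys"],
)

# Pools start as DRAFT; promoting status to FROZEN without an IntegrityManifest
# trips validate_frozen_has_integrity, and an absolute or '..' cache_path is rejected.
assert pool.status == PoolStatus.DRAFT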