odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/themes.py ADDED
@@ -0,0 +1,216 @@
+ """
+ Theme System
+ ============
+
+ Customizable themes for story rendering with branding support.
+ """
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ import yaml
+
+
+ @dataclass
+ class StoryTheme:
+     """
+     Story theme configuration.
+
+     Defines colors, typography, branding, and layout options for
+     rendered stories (HTML).
+     """
+
+     name: str
+
+     # Colors
+     primary_color: str = "#0066cc"
+     success_color: str = "#28a745"
+     error_color: str = "#dc3545"
+     warning_color: str = "#ffc107"
+     bg_color: str = "#ffffff"
+     text_color: str = "#333333"
+     border_color: str = "#dddddd"
+     code_bg: str = "#f5f5f5"
+
+     # Typography
+     font_family: str = (
+         "system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif"
+     )
+     heading_font: str = "inherit"
+     code_font: str = "Consolas, Monaco, 'Courier New', monospace"
+     font_size: str = "16px"
+
+     # Branding
+     logo_url: Optional[str] = None
+     company_name: Optional[str] = None
+     footer_text: Optional[str] = None
+
+     # Layout
+     max_width: str = "1200px"
+     sidebar: bool = False
+
+     # Custom CSS
+     custom_css: Optional[str] = None
+
+     def to_css_vars(self) -> Dict[str, str]:
+         """
+         Convert theme to CSS variables.
+
+         Returns:
+             Dictionary of CSS variable names and values
+         """
+         return {
+             "--primary-color": self.primary_color,
+             "--success-color": self.success_color,
+             "--error-color": self.error_color,
+             "--warning-color": self.warning_color,
+             "--bg-color": self.bg_color,
+             "--text-color": self.text_color,
+             "--border-color": self.border_color,
+             "--code-bg": self.code_bg,
+             "--font-family": self.font_family,
+             "--heading-font": self.heading_font,
+             "--code-font": self.code_font,
+             "--font-size": self.font_size,
+             "--max-width": self.max_width,
+         }
+
+     def to_css_string(self) -> str:
+         """
+         Generate CSS string from theme.
+
+         Returns:
+             CSS string with :root variables
+         """
+         lines = [":root {"]
+         for var_name, var_value in self.to_css_vars().items():
+             lines.append(f"  {var_name}: {var_value};")
+         lines.append("}")
+
+         if self.custom_css:
+             lines.append("")
+             lines.append(self.custom_css)
+
+         return "\n".join(lines)
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "StoryTheme":
+         """
+         Create theme from dictionary.
+
+         Args:
+             data: Theme configuration dictionary
+
+         Returns:
+             StoryTheme instance
+         """
+         return cls(**data)
+
+     @classmethod
+     def from_yaml(cls, path: str) -> "StoryTheme":
+         """
+         Load theme from YAML file.
+
+         Args:
+             path: Path to YAML theme file
+
+         Returns:
+             StoryTheme instance
+         """
+         with open(path, "r") as f:
+             data = yaml.safe_load(f)
+
+         return cls.from_dict(data)
+
+
+ # Built-in Themes
+ # ===============
+
+ DEFAULT_THEME = StoryTheme(
+     name="default",
+ )
+
+ CORPORATE_THEME = StoryTheme(
+     name="corporate",
+     primary_color="#003366",
+     success_color="#2e7d32",
+     error_color="#c62828",
+     font_family="Arial, Helvetica, sans-serif",
+     heading_font="Georgia, 'Times New Roman', serif",
+     font_size="15px",
+ )
+
+ DARK_THEME = StoryTheme(
+     name="dark",
+     primary_color="#00bfff",
+     success_color="#4caf50",
+     error_color="#f44336",
+     warning_color="#ffb300",
+     bg_color="#1e1e1e",
+     text_color="#e0e0e0",
+     border_color="#444444",
+     code_bg="#2d2d2d",
+     custom_css="""
+ body { background: #121212; }
+ .container { background: #1e1e1e; }
+ .node-header { background: #2d2d2d; }
+ .summary { background: #2d2d2d; }
+ """,
+ )
+
+ MINIMAL_THEME = StoryTheme(
+     name="minimal",
+     primary_color="#000000",
+     success_color="#006600",
+     error_color="#cc0000",
+     warning_color="#ff9900",
+     font_family="'Helvetica Neue', Helvetica, Arial, sans-serif",
+     heading_font="'Helvetica Neue', Helvetica, Arial, sans-serif",
+     font_size="14px",
+     max_width="900px",
+ )
+
+ # Theme registry
+ BUILTIN_THEMES = {
+     "default": DEFAULT_THEME,
+     "corporate": CORPORATE_THEME,
+     "dark": DARK_THEME,
+     "minimal": MINIMAL_THEME,
+ }
+
+
+ def get_theme(name: str) -> StoryTheme:
+     """
+     Get theme by name.
+
+     Args:
+         name: Theme name or path to YAML theme file
+
+     Returns:
+         StoryTheme instance
+
+     Raises:
+         ValueError: If theme not found
+     """
+     # Check if it's a file path
+     if Path(name).exists():
+         return StoryTheme.from_yaml(name)
+
+     # Check built-in themes
+     if name.lower() in BUILTIN_THEMES:
+         return BUILTIN_THEMES[name.lower()]
+
+     raise ValueError(
+         f"Theme '{name}' not found. Available themes: {', '.join(BUILTIN_THEMES.keys())}"
+     )
+
+
+ def list_themes() -> Dict[str, StoryTheme]:
+     """
+     List all available built-in themes.
+
+     Returns:
+         Dictionary of theme name -> StoryTheme
+     """
+     return BUILTIN_THEMES.copy()
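Usage sketch (editor's note): the calls below are taken from the code above; the YAML file name is a hypothetical example, not shipped with the package.

from odibi.story.themes import get_theme, list_themes

# Built-in lookup is case-insensitive; unknown names raise ValueError
dark = get_theme("dark")
print(dark.to_css_string())  # ":root { --primary-color: #00bfff; ... }" plus the custom CSS block

# A file path is tried before the registry, so a YAML file whose keys
# match StoryTheme fields also works. "brand.yaml" is hypothetical:
#   name: acme
#   primary_color: "#ff6600"
#   company_name: "ACME Corp"
# theme = get_theme("brand.yaml")

print(sorted(list_themes()))  # ['corporate', 'dark', 'default', 'minimal']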
odibi/testing/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """Testing utilities for Odibi."""
+
+ from .assertions import assert_frame_equal, assert_schema_equal
+ from .fixtures import generate_sample_data, temp_directory
+
+ __version__ = "1.3.0-alpha.1"
+
+ __all__ = [
+     "temp_directory",
+     "generate_sample_data",
+     "assert_frame_equal",
+     "assert_schema_equal",
+ ]
odibi/testing/assertions.py ADDED
@@ -0,0 +1,75 @@
+ """
+ Testing Assertions
+ ==================
+
+ Helpers for asserting DataFrame equality and properties.
+ """
+
+ from typing import Any, List
+
+ import pandas as pd
+
+
+ def assert_frame_equal(
+     left: Any,
+     right: Any,
+     check_dtype: bool = True,
+     check_exact: bool = False,
+     atol: float = 1e-8,
+     rtol: float = 1e-5,
+ ) -> None:
+     """
+     Assert that two DataFrames are equal.
+     Supports both Pandas and Spark DataFrames.
+
+     Args:
+         left: First DataFrame
+         right: Second DataFrame
+         check_dtype: Whether to check data types
+         check_exact: Whether to compare numbers exactly
+         atol: Absolute tolerance
+         rtol: Relative tolerance
+     """
+     # Convert Spark to Pandas for comparison if needed
+     left_pdf = _to_pandas(left)
+     right_pdf = _to_pandas(right)
+
+     # Sort by first column so row order doesn't matter (Spark is unordered)
+     if not left_pdf.empty and not right_pdf.empty:
+         sort_col = left_pdf.columns[0]
+         left_pdf = left_pdf.sort_values(sort_col).reset_index(drop=True)
+         right_pdf = right_pdf.sort_values(sort_col).reset_index(drop=True)
+
+     pd.testing.assert_frame_equal(
+         left_pdf, right_pdf, check_dtype=check_dtype, check_exact=check_exact, atol=atol, rtol=rtol
+     )
+
+
+ def assert_schema_equal(left: Any, right: Any) -> None:
+     """
+     Assert that two DataFrames have the same schema.
+     Currently compares column names only (types are not checked).
+     """
+     # Simplified check for column names
+     left_cols = sorted(_get_columns(left))
+     right_cols = sorted(_get_columns(right))
+
+     assert left_cols == right_cols, f"Schema mismatch: {left_cols} != {right_cols}"
+
+
+ def _to_pandas(df: Any) -> pd.DataFrame:
+     """Convert to Pandas DataFrame if not already."""
+     if isinstance(df, pd.DataFrame):
+         return df
+
+     # Assume Spark DataFrame
+     try:
+         return df.toPandas()
+     except AttributeError:
+         raise TypeError(f"Expected DataFrame, got {type(df)}")
+
+
+ def _get_columns(df: Any) -> List[str]:
+     """Get column names; both pandas and Spark DataFrames expose `.columns`."""
+     return list(df.columns)
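Usage sketch (editor's note): a minimal example of the intended use, assuming only pandas is installed; Spark inputs take the same path via toPandas().

import pandas as pd
from odibi.testing import assert_frame_equal, assert_schema_equal

left = pd.DataFrame({"id": [2, 1], "value": [20.0, 10.0]})
right = pd.DataFrame({"id": [1, 2], "value": [10.0, 20.000000001]})

# Passes: both frames are sorted by their first column before comparison,
# and the 1e-9 difference is within atol=1e-8
assert_frame_equal(left, right)

# Column names only, order-insensitive
assert_schema_equal(left, right[["value", "id"]])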
odibi/testing/fixtures.py ADDED
@@ -0,0 +1,85 @@
+ """
+ Testing Fixtures
+ ================
+
+ Reusable fixtures for testing pipelines and transformations.
+ """
+
+ import shutil
+ import tempfile
+ from contextlib import contextmanager
+ from typing import Any, Dict, Generator, Optional
+
+ import numpy as np
+ import pandas as pd
+
+
+ @contextmanager
+ def temp_directory() -> Generator[str, None, None]:
+     """
+     Create a temporary directory for test artifacts.
+
+     Yields:
+         Path to the temporary directory.
+
+     Example:
+         with temp_directory() as temp_dir:
+             path = os.path.join(temp_dir, "test.csv")
+             df.to_csv(path)
+     """
+     temp_dir = tempfile.mkdtemp()
+     try:
+         yield temp_dir
+     finally:
+         shutil.rmtree(temp_dir)
+
+
+ def generate_sample_data(
+     rows: int = 10, engine_type: str = "pandas", schema: Optional[Dict[str, str]] = None
+ ) -> Any:
+     """
+     Generate a sample DataFrame (Pandas or Spark).
+
+     Args:
+         rows: Number of rows to generate
+         engine_type: "pandas" or "spark"
+         schema: Optional dictionary of {column_name: type}
+             Supported types: "int", "float", "str", "date"
+
+     Returns:
+         DataFrame (pd.DataFrame or pyspark.sql.DataFrame)
+     """
+     from datetime import datetime, timedelta
+
+     # Default schema if none provided
+     if not schema:
+         schema = {"id": "int", "value": "float", "category": "str", "timestamp": "date"}
+
+     data = {}
+     for col, dtype in schema.items():
+         if dtype == "int":
+             data[col] = np.random.randint(0, 1000, rows)
+         elif dtype == "float":
+             data[col] = np.random.rand(rows) * 100
+         elif dtype == "str":
+             data[col] = [f"val_{i}" for i in range(rows)]
+         elif dtype == "date":
+             base_date = datetime.now()
+             data[col] = [base_date - timedelta(days=i) for i in range(rows)]
+
+     pdf = pd.DataFrame(data)
+
+     if engine_type == "pandas":
+         return pdf
+
+     if engine_type == "spark":
+         try:
+             from pyspark.sql import SparkSession
+
+             # Try to get existing session or create new one
+             spark = SparkSession.builder.master("local[*]").appName("odibi-test").getOrCreate()
+             return spark.createDataFrame(pdf)
+         except ImportError:
+             raise ImportError("Spark not installed. Run 'pip install odibi[spark]'")
+
+     raise ValueError(f"Unknown engine type: {engine_type}")
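Usage sketch (editor's note): the two fixtures combined; the column names and file name below are arbitrary examples.

import os
from odibi.testing import generate_sample_data, temp_directory

# Explicit schema; omitting it yields id/value/category/timestamp
df = generate_sample_data(rows=20, schema={"id": "int", "name": "str"})

with temp_directory() as tmp:
    path = os.path.join(tmp, "sample.csv")
    df.to_csv(path, index=False)
    assert os.path.exists(path)
# the directory and everything in it are removed on exit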
odibi/testing/source_pool.py ADDED
@@ -0,0 +1,277 @@
+ """
+ SourcePool: Deterministic, frozen test data sources for Odibi testing.
+
+ Phase 7.B.1 - Preparation only, NO runtime logic.
+
+ This module defines the schema and metadata structures for deterministic,
+ replayable data sources that exercise all supported Odibi data types and
+ ingestion paths.
+ """
+
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any, Dict, List, Literal, Optional
+
+ from pydantic import BaseModel, Field, field_validator, model_validator
+
+
+ # ============================================
+ # Enums for SourcePool Configuration
+ # ============================================
+
+
+ class FileFormat(str, Enum):
+     """Supported file formats for source pools."""
+
+     CSV = "csv"
+     JSON = "json"
+     PARQUET = "parquet"
+     AVRO = "avro"
+     DELTA = "delta"
+
+
+ class SourceType(str, Enum):
+     """Supported source types for ingestion testing."""
+
+     LOCAL = "local"
+     ADLS_EMULATED = "adls_emulated"
+     AZURE_BLOB_EMULATED = "azure_blob_emulated"
+     SQL_JDBC_LOCAL = "sql_jdbc_local"
+     CLOUDFILES = "cloudfiles"
+
+
+ class DataQuality(str, Enum):
+     """Classification of data cleanliness."""
+
+     CLEAN = "clean"  # No nulls, no duplicates, valid types
+     MESSY = "messy"  # Contains nulls, edge cases, type issues
+     MIXED = "mixed"  # Combination of clean and messy partitions
+
+
+ class PoolStatus(str, Enum):
+     """Lifecycle status of a source pool."""
+
+     DRAFT = "draft"  # Schema defined, data not yet prepared
+     FROZEN = "frozen"  # Data prepared and hash-verified
+     DEPRECATED = "deprecated"  # Marked for removal
+
+
+ # ============================================
+ # Schema Definitions (explicit, no inference)
+ # ============================================
+
+
+ class ColumnSchema(BaseModel):
+     """Explicit column definition - NO runtime inference."""
+
+     name: str = Field(description="Column name")
+     dtype: str = Field(description="Data type (string, int64, float64, bool, datetime, etc.)")
+     nullable: bool = Field(default=False, description="Whether column allows nulls")
+     primary_key: bool = Field(default=False, description="Part of primary key")
+     description: Optional[str] = Field(default=None, description="Column documentation")
+     sample_values: Optional[List[Any]] = Field(
+         default=None,
+         description="Example values for documentation (not used at runtime)",
+     )
+
+
+ class TableSchema(BaseModel):
+     """Complete table schema definition."""
+
+     columns: List[ColumnSchema] = Field(description="Ordered list of columns")
+     primary_keys: Optional[List[str]] = Field(
+         default=None, description="List of primary key column names"
+     )
+     partition_columns: Optional[List[str]] = Field(
+         default=None, description="Partition columns (for Delta/Parquet)"
+     )
+
+     @model_validator(mode="after")
+     def validate_pk_columns_exist(self):
+         """Ensure all primary key columns exist in schema."""
+         if self.primary_keys:
+             col_names = {c.name for c in self.columns}
+             for pk in self.primary_keys:
+                 if pk not in col_names:
+                     raise ValueError(f"Primary key column '{pk}' not in schema")
+         return self
+
+
+ # ============================================
+ # Data Characteristics Metadata
+ # ============================================
+
+
+ class DataCharacteristics(BaseModel):
+     """Metadata about data characteristics for test coverage."""
+
+     row_count: int = Field(ge=0, description="Exact row count (deterministic)")
+     has_nulls: bool = Field(default=False, description="Contains null values")
+     has_duplicates: bool = Field(default=False, description="Contains duplicate keys")
+     has_unicode: bool = Field(default=False, description="Contains non-ASCII characters")
+     has_special_chars: bool = Field(default=False, description="Contains newlines, quotes, etc.")
+     has_empty_strings: bool = Field(default=False, description="Contains empty string values")
+     has_whitespace_issues: bool = Field(
+         default=False, description="Leading/trailing whitespace in strings"
+     )
+     has_type_coercion_cases: bool = Field(
+         default=False, description="Values that may coerce unexpectedly"
+     )
+     date_range: Optional[Dict[str, str]] = Field(
+         default=None,
+         description="Date range {min: ISO date, max: ISO date}",
+     )
+     numeric_ranges: Optional[Dict[str, Dict[str, float]]] = Field(
+         default=None,
+         description="Numeric column ranges {column: {min: v, max: v}}",
+     )
+
+
+ # ============================================
+ # Integrity & Hashing
+ # ============================================
+
+
+ class IntegrityManifest(BaseModel):
+     """Cryptographic integrity manifest for frozen source pools."""
+
+     algorithm: Literal["sha256"] = "sha256"
+     file_hashes: Dict[str, str] = Field(description="Map of relative file path -> SHA256 hash")
+     manifest_hash: str = Field(
+         description="SHA256 hash of sorted file_hashes for quick verification"
+     )
+     frozen_at: datetime = Field(description="Timestamp when pool was frozen")
+     frozen_by: str = Field(default="system", description="User/system that froze the pool")
+
+
+ # ============================================
+ # Source Pool Definition (Main Schema)
+ # ============================================
+
+
+ class SourcePoolConfig(BaseModel):
+     """
+     Complete SourcePool definition.
+
+     This is the primary schema for defining deterministic test data sources.
+
+     Invariants:
+     - All data is disk-backed and hashable
+     - Schemas are explicit (no runtime inference)
+     - Metadata is complete and machine-readable
+     - Sources are immutable once frozen
+     """
+
+     # === Identification ===
+     pool_id: str = Field(
+         description="Unique identifier (e.g., 'nyc_taxi_csv_clean')",
+         pattern=r"^[a-z][a-z0-9_]*$",
+     )
+     version: str = Field(
+         default="1.0.0",
+         description="Semantic version for tracking pool evolution",
+     )
+     name: str = Field(description="Human-readable name")
+     description: str = Field(description="Detailed description of the dataset")
+
+     # === Source Configuration ===
+     file_format: FileFormat = Field(description="File format")
+     source_type: SourceType = Field(description="Source/ingestion type to test")
+     data_quality: DataQuality = Field(description="Clean/messy/mixed classification")
+
+     # === Schema (Explicit, No Inference) ===
+     schema: TableSchema = Field(description="Explicit schema definition")
+
+     # === Disk Location (relative to .odibi/source_cache/) ===
+     cache_path: str = Field(
+         description="Relative path under .odibi/source_cache/ (e.g., 'nyc_taxi/csv/clean/')"
+     )
+
+     # === Data Characteristics ===
+     characteristics: DataCharacteristics = Field(
+         description="Metadata about data properties for test coverage"
+     )
+
+     # === Status & Integrity ===
+     status: PoolStatus = Field(
+         default=PoolStatus.DRAFT,
+         description="Current lifecycle status",
+     )
+     integrity: Optional[IntegrityManifest] = Field(
+         default=None,
+         description="Integrity manifest (required when status=frozen)",
+     )
+
+     # === Provenance ===
+     original_source: Optional[str] = Field(
+         default=None,
+         description="URL or reference to original public dataset",
+     )
+     license: Optional[str] = Field(
+         default=None,
+         description="Data license (e.g., 'CC0', 'MIT', 'Public Domain')",
+     )
+     created_at: datetime = Field(
+         default_factory=lambda: datetime.now(timezone.utc),
+         description="When this pool definition was created",
+     )
+     updated_at: datetime = Field(
+         default_factory=lambda: datetime.now(timezone.utc),
+         description="Last modification timestamp",
+     )
+
+     # === Test Coverage Hints ===
+     tests_coverage: List[str] = Field(
+         default_factory=list,
+         description="List of test scenarios this pool covers (e.g., 'null_handling', 'unicode_support')",
+     )
+     compatible_pipelines: List[str] = Field(
+         default_factory=list,
+         description="Pipeline patterns this pool is designed for (e.g., 'bronze_ingestion', 'silver_dedup')",
+     )
+
+     @model_validator(mode="after")
+     def validate_frozen_has_integrity(self):
+         """Frozen pools must have integrity manifest."""
+         if self.status == PoolStatus.FROZEN and not self.integrity:
+             raise ValueError(f"Pool '{self.pool_id}': frozen status requires integrity manifest")
+         return self
+
+     @field_validator("cache_path")
+     @classmethod
+     def validate_cache_path(cls, v: str) -> str:
+         """Ensure cache_path is relative and safe."""
+         if v.startswith("/") or v.startswith("\\") or ".." in v:
+             raise ValueError(f"cache_path must be relative without '..': {v}")
+         return v.replace("\\", "/")
+
+
+ # ============================================
+ # Source Pool Index (Registry)
+ # ============================================
+
+
+ class SourcePoolIndex(BaseModel):
+     """
+     Index of all registered source pools.
+
+     Stored at: .odibi/source_metadata/pool_index.yaml
+     """
+
+     version: str = Field(default="1.0.0", description="Index schema version")
+     updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+     pools: Dict[str, str] = Field(
+         default_factory=dict,
+         description="Map of pool_id -> metadata file path (relative to source_metadata/)",
+     )
+
+     def add_pool(self, pool_id: str, metadata_path: str) -> None:
+         """Register a pool in the index."""
+         self.pools[pool_id] = metadata_path
+         self.updated_at = datetime.now(timezone.utc)
+
+     def remove_pool(self, pool_id: str) -> None:
+         """Remove a pool from the index."""
+         if pool_id in self.pools:
+             del self.pools[pool_id]
+             self.updated_at = datetime.now(timezone.utc)
+
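Usage sketch (editor's note): a minimal draft pool showing how the models compose; every value below is a made-up example, not shipped metadata.

from odibi.testing.source_pool import (
    ColumnSchema,
    DataCharacteristics,
    DataQuality,
    FileFormat,
    SourcePoolConfig,
    SourceType,
    TableSchema,
)

pool = SourcePoolConfig(
    pool_id="nyc_taxi_csv_clean",  # must match ^[a-z][a-z0-9_]*$
    name="NYC Taxi (CSV, clean)",
    description="Deterministic clean CSV slice for bronze ingestion tests.",
    file_format=FileFormat.CSV,
    source_type=SourceType.LOCAL,
    data_quality=DataQuality.CLEAN,
    schema=TableSchema(
        columns=[
            ColumnSchema(name="trip_id", dtype="int64", primary_key=True),
            ColumnSchema(name="fare", dtype="float64", nullable=True),
        ],
        primary_keys=["trip_id"],  # validated against the column names
    ),
    cache_path="nyc_taxi/csv/clean/",  # relative; absolute or '..' paths are rejected
    characteristics=DataCharacteristics(row_count=1000),
)

# status defaults to DRAFT; setting status=FROZEN without an
# IntegrityManifest fails validate_frozen_has_integrity with a ValidationError.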