powerbi-ontology-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,369 @@
+ """
+ Schema Mapper
+ 
+ Maps logical ontology entities to physical data sources and detects schema drift.
+ This prevents the $4.6M mistake by validating schema bindings.
+ """
+ 
+ import logging
+ import re
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional
+ 
+ from powerbi_ontology.ontology_generator import Ontology, OntologyEntity
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ @dataclass
+ class SchemaBinding:
+     """Maps a logical entity to a physical data source."""
+     entity: str
+     physical_source: str  # e.g., "SQL.dbo.customers"
+     property_mappings: Dict[str, str] = field(default_factory=dict)  # logical -> physical
+     source_type: str = "sql"  # "sql", "azure_sql", "fabric", etc.
+ 
+ 
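+ # An illustrative binding (editor-added sketch; the entity, table, and column
+ # names below are hypothetical):
+ #
+ #     SchemaBinding(
+ #         entity="Customer",
+ #         physical_source="dbo.customers",
+ #         property_mappings={"email": "email_address"},
+ #     )
+ 
+ 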
+ @dataclass
+ class ValidationResult:
+     """Result of schema binding validation."""
+     is_valid: bool
+     errors: List[str] = field(default_factory=list)
+     warnings: List[str] = field(default_factory=list)
+     message: str = ""
+ 
+ 
+ @dataclass
+ class DriftReport:
+     """Report of schema drift detection."""
+     entity: str
+     missing_columns: List[str] = field(default_factory=list)
+     new_columns: List[str] = field(default_factory=list)
+     type_changes: Dict[str, str] = field(default_factory=dict)  # column -> "old_type -> new_type"
+     renamed_columns: Dict[str, str] = field(default_factory=dict)  # old_name -> new_name
+     severity: str = "INFO"  # "INFO", "WARNING", "CRITICAL"
+     message: str = ""
+ 
+ 
+ @dataclass
+ class Fix:
+     """Suggested fix for schema drift."""
+     type: str  # "update_mapping", "add_column", "remove_column"
+     description: str
+     action: str  # SQL or mapping update
+     entity: str = ""
+     property: str = ""
+ 
+ 
+ class SchemaMapper:
+     """
+     Maps ontology entities to physical data sources and detects schema drift.
+ 
+     This is the critical piece that prevents the $4.6M mistake: it detects
+     columns that have been renamed or deleted before AI agents rely on them.
+     """
+ 
+     def __init__(self, ontology: Ontology, data_source: Optional[str] = None):
+         """
+         Initialize the schema mapper.
+ 
+         Args:
+             ontology: The ontology to map
+             data_source: Optional default data source
+         """
+         self.ontology = ontology
+         self.data_source = data_source
+         self.bindings: Dict[str, SchemaBinding] = {}
+ 
+     def create_binding(
+         self,
+         entity_name: str,
+         physical_table: str,
+         property_mappings: Optional[Dict[str, str]] = None
+     ) -> SchemaBinding:
+         """
+         Create a schema binding between a logical entity and a physical table.
+ 
+         Args:
+             entity_name: Name of the ontology entity
+             physical_table: Physical table name (e.g., "dbo.customers")
+             property_mappings: Optional explicit property mappings
+ 
+         Returns:
+             SchemaBinding object
+         """
+         entity = next((e for e in self.ontology.entities if e.name == entity_name), None)
+         if not entity:
+             raise ValueError(f"Entity not found: {entity_name}")
+ 
+         # Auto-generate mappings if not provided
+         if not property_mappings:
+             property_mappings = {}
+             for prop in entity.properties:
+                 # Default: use the property name as the column name (snake_case conversion)
+                 physical_name = self._to_snake_case(prop.name)
+                 property_mappings[prop.name] = physical_name
+ 
+         binding = SchemaBinding(
+             entity=entity_name,
+             physical_source=physical_table,
+             property_mappings=property_mappings,
+             source_type=self._detect_source_type(physical_table)
+         )
+ 
+         self.bindings[entity_name] = binding
+         logger.info(f"Created binding: {entity_name} -> {physical_table}")
+ 
+         return binding
+ 
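+     # Illustrative usage (editor-added sketch; "Customer" and the table name
+     # are hypothetical):
+     #
+     #     binding = mapper.create_binding("Customer", "dbo.customers")
+     #     binding.property_mappings   # auto-generated, e.g. {"email": "email"}
+ 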
+     def validate_binding(self, binding: SchemaBinding) -> ValidationResult:
+         """
+         Validate a schema binding.
+ 
+         Args:
+             binding: SchemaBinding to validate
+ 
+         Returns:
+             ValidationResult
+         """
+         errors = []
+         warnings = []
+ 
+         # Check that the entity exists in the ontology
+         entity = next((e for e in self.ontology.entities if e.name == binding.entity), None)
+         if not entity:
+             errors.append(f"Entity '{binding.entity}' not found in ontology")
+             return ValidationResult(
+                 is_valid=False,
+                 errors=errors,
+                 message=f"Entity '{binding.entity}' not found"
+             )
+ 
+         # Check that all mapped properties exist on the entity
+         for logical_prop in binding.property_mappings:
+             prop_exists = any(p.name == logical_prop for p in entity.properties)
+             if not prop_exists:
+                 warnings.append(
+                     f"Property '{logical_prop}' mapped but not found in entity '{binding.entity}'"
+                 )
+ 
+         # Note: this is a basic structural check. Full validation would query
+         # the actual data source to confirm the physical columns exist.
+ 
+         is_valid = len(errors) == 0
+         message = "Binding is valid" if is_valid else f"Found {len(errors)} errors"
+ 
+         return ValidationResult(
+             is_valid=is_valid,
+             errors=errors,
+             warnings=warnings,
+             message=message
+         )
+ 
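+     # Illustrative usage (editor-added sketch): a binding that maps a property
+     # the entity does not define produces a warning, not an error:
+     #
+     #     result = mapper.validate_binding(binding)
+     #     result.is_valid    # True, with result.warnings listing the mismatch
+ 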
+     def detect_drift(
+         self,
+         binding: SchemaBinding,
+         current_schema: Dict[str, Any]
+     ) -> DriftReport:
+         """
+         Detect schema drift between a binding and the current physical schema.
+ 
+         This is the check that prevents the $4.6M mistake.
+ 
+         Args:
+             binding: SchemaBinding to check
+             current_schema: Current physical schema (dict of column_name -> type)
+ 
+         Returns:
+             DriftReport
+         """
+         type_changes = {}
+         renamed_columns = {}
+ 
+         # Compare expected columns (from the binding) against actual columns
+         expected_columns = set(binding.property_mappings.values())
+         actual_columns = set(current_schema.keys())
+ 
+         # Missing columns are the $4.6M failure mode
+         missing_columns = list(expected_columns - actual_columns)
+         new_columns = list(actual_columns - expected_columns)
+ 
+         # Check for type changes against the ontology's expected types
+         entity = next((e for e in self.ontology.entities if e.name == binding.entity), None)
+         if entity:
+             for logical_prop, physical_col in binding.property_mappings.items():
+                 if physical_col not in current_schema:
+                     continue
+                 prop = next((p for p in entity.properties if p.name == logical_prop), None)
+                 if prop and prop.data_type != current_schema[physical_col]:
+                     type_changes[physical_col] = f"{prop.data_type} -> {current_schema[physical_col]}"
+ 
+         # Heuristic: a missing column paired with a similar-looking new column
+         # was probably renamed. Iterate over copies so removing matches is safe.
+         for missing in list(missing_columns):
+             for new_col in list(new_columns):
+                 if self._similar_names(missing, new_col):
+                     renamed_columns[missing] = new_col
+                     missing_columns.remove(missing)
+                     new_columns.remove(new_col)
+                     break
+ 
+         # Determine severity
+         severity = "INFO"
+         if missing_columns:
+             severity = "CRITICAL"  # the $4.6M mistake scenario
+         elif type_changes or renamed_columns:
+             severity = "WARNING"
+ 
+         # Build message
+         message_parts = []
+         if missing_columns:
+             message_parts.append(
+                 f"CRITICAL: Missing columns: {', '.join(missing_columns)}. "
+                 f"This could cause the $4.6M mistake!"
+             )
+         if renamed_columns:
+             message_parts.append(
+                 f"Columns may have been renamed: {', '.join(f'{k} -> {v}' for k, v in renamed_columns.items())}"
+             )
+         if type_changes:
+             message_parts.append(f"Type changes detected: {', '.join(f'{k}: {v}' for k, v in type_changes.items())}")
+         if new_columns:
+             message_parts.append(f"New columns found: {', '.join(new_columns)}")
+ 
+         message = " | ".join(message_parts) if message_parts else "No drift detected"
+ 
+         return DriftReport(
+             entity=binding.entity,
+             missing_columns=missing_columns,
+             new_columns=new_columns,
+             type_changes=type_changes,
+             renamed_columns=renamed_columns,
+             severity=severity,
+             message=message
+         )
+ 
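+     # Illustrative outcome (editor-added sketch; column names hypothetical).
+     # If the binding expects an "email" column the warehouse no longer has:
+     #
+     #     report = mapper.detect_drift(binding, {"id": "int"})
+     #     report.missing_columns   # ["email"]
+     #     report.severity          # "CRITICAL"
+ 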
+     def suggest_fix(self, drift_report: DriftReport) -> List[Fix]:
+         """
+         Suggest fixes for detected drift.
+ 
+         Args:
+             drift_report: DriftReport with detected issues
+ 
+         Returns:
+             List of Fix suggestions
+         """
+         fixes = []
+         binding = self.bindings.get(drift_report.entity)
+ 
+         if not binding:
+             return fixes
+ 
+         # Fixes for renamed columns
+         for old_name, new_name in drift_report.renamed_columns.items():
+             fixes.append(Fix(
+                 type="update_mapping",
+                 description=f"Update mapping: {old_name} -> {new_name}",
+                 action=f"Update binding.property_mappings['{old_name}'] = '{new_name}'",
+                 entity=drift_report.entity,
+                 property=old_name
+             ))
+ 
+         # Fixes for missing columns
+         for missing_col in drift_report.missing_columns:
+             fixes.append(Fix(
+                 type="update_mapping",
+                 description=f"Column '{missing_col}' not found. Check if renamed or deleted.",
+                 action=f"Verify column exists: SELECT * FROM {binding.physical_source} WHERE 1=0",
+                 entity=drift_report.entity,
+                 property=missing_col
+             ))
+ 
+         # Fixes for new columns
+         for new_col in drift_report.new_columns:
+             fixes.append(Fix(
+                 type="add_column",
+                 description=f"New column '{new_col}' found. Consider adding to ontology.",
+                 action=f"Review and potentially add '{new_col}' to entity '{drift_report.entity}'",
+                 entity=drift_report.entity,
+                 property=new_col
+             ))
+ 
+         return fixes
+ 
+     def generate_binding_yaml(self, ontology: Ontology) -> str:
+         """
+         Generate YAML configuration for schema bindings.
+ 
+         Args:
+             ontology: The ontology
+ 
+         Returns:
+             YAML string
+         """
+         try:
+             import yaml
+         except ImportError:
+             logger.warning("PyYAML not installed. Cannot generate YAML.")
+             return ""
+ 
+         yaml_data = {
+             "ontology": {
+                 "name": ontology.name,
+                 "version": ontology.version
+             },
+             "entities": {}
+         }
+ 
+         for entity_name, binding in self.bindings.items():
+             yaml_data["entities"][entity_name] = {
+                 "source": binding.physical_source,
+                 "source_type": binding.source_type,
+                 "mappings": binding.property_mappings
+             }
+ 
+         return yaml.dump(yaml_data, default_flow_style=False, sort_keys=False)
+ 
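+     # The emitted YAML looks roughly like this (editor-added sketch with
+     # illustrative names):
+     #
+     #     ontology:
+     #       name: demo
+     #       version: '0.1'
+     #     entities:
+     #       Customer:
+     #         source: dbo.customers
+     #         source_type: sql
+     #         mappings:
+     #           email: email
+ 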
+     def _to_snake_case(self, name: str) -> str:
+         """Convert a property name to snake_case."""
+         # Insert an underscore before each uppercase run, then lowercase
+         s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+         return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+ 
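+     # Example (editor-added): _to_snake_case("CustomerEmail") -> "customer_email"
+ 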
+     def _detect_source_type(self, physical_table: str) -> str:
+         """Detect the source type from the table name (keyword heuristic)."""
+         name = physical_table.lower()
+         if "azure" in name or "sql" in name:
+             return "azure_sql"
+         elif "fabric" in name or "onelake" in name:
+             return "fabric"
+         else:
+             return "sql"
+ 
+     def _similar_names(self, name1: str, name2: str) -> bool:
+         """Check whether two names are similar (heuristic for rename detection)."""
+         name1_lower = name1.lower().replace("_", "").replace("-", "")
+         name2_lower = name2.lower().replace("_", "").replace("-", "")
+ 
+         # Check whether one name contains the other
+         if name1_lower in name2_lower or name2_lower in name1_lower:
+             return True
+ 
+         # Fall back to a rough character-overlap score (a cheap stand-in for a
+         # real edit-distance check), only for names of comparable length
+         if abs(len(name1_lower) - len(name2_lower)) <= 3:
+             common_chars = sum(1 for c in name1_lower if c in name2_lower)
+             similarity = common_chars / max(len(name1_lower), len(name2_lower))
+             return similarity > 0.7
+ 
+         return False
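+ 
+ 
+ if __name__ == "__main__":
+     # Editor-added sketch, not part of the package's documented behavior.
+     # Ontology's constructor is not shown in this diff, so a stub namespace
+     # stands in for it; the attribute names (entities, name, properties,
+     # data_type, version) mirror how this module reads them above.
+     from types import SimpleNamespace
+ 
+     email = SimpleNamespace(name="email", data_type="string")
+     customer = SimpleNamespace(name="Customer", properties=[email])
+     stub_ontology = SimpleNamespace(name="demo", version="0.1", entities=[customer])
+ 
+     mapper = SchemaMapper(stub_ontology)  # type: ignore[arg-type]
+     binding = mapper.create_binding("Customer", "dbo.customers")
+ 
+     # Simulate a warehouse rename: "email" became "email_address"
+     report = mapper.detect_drift(binding, {"email_address": "string"})
+     print(report.severity, "-", report.message)   # WARNING - Columns may have been renamed: email -> email_address
+     for fix in mapper.suggest_fix(report):
+         print(fix.description)                    # Update mapping: email -> email_address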