powerbi-ontology-extractor 0.1.0__py3-none-any.whl
- cli/__init__.py +1 -0
- cli/pbi_ontology_cli.py +286 -0
- powerbi_ontology/__init__.py +38 -0
- powerbi_ontology/analyzer.py +420 -0
- powerbi_ontology/chat.py +303 -0
- powerbi_ontology/cli.py +530 -0
- powerbi_ontology/contract_builder.py +269 -0
- powerbi_ontology/dax_parser.py +305 -0
- powerbi_ontology/export/__init__.py +17 -0
- powerbi_ontology/export/contract_to_owl.py +408 -0
- powerbi_ontology/export/fabric_iq.py +243 -0
- powerbi_ontology/export/fabric_iq_to_owl.py +463 -0
- powerbi_ontology/export/json_schema.py +110 -0
- powerbi_ontology/export/ontoguard.py +177 -0
- powerbi_ontology/export/owl.py +522 -0
- powerbi_ontology/extractor.py +368 -0
- powerbi_ontology/mcp_config.py +237 -0
- powerbi_ontology/mcp_models.py +166 -0
- powerbi_ontology/mcp_server.py +1106 -0
- powerbi_ontology/ontology_diff.py +776 -0
- powerbi_ontology/ontology_generator.py +406 -0
- powerbi_ontology/review.py +556 -0
- powerbi_ontology/schema_mapper.py +369 -0
- powerbi_ontology/semantic_debt.py +584 -0
- powerbi_ontology/utils/__init__.py +13 -0
- powerbi_ontology/utils/pbix_reader.py +558 -0
- powerbi_ontology/utils/visualizer.py +332 -0
- powerbi_ontology_extractor-0.1.0.dist-info/METADATA +507 -0
- powerbi_ontology_extractor-0.1.0.dist-info/RECORD +33 -0
- powerbi_ontology_extractor-0.1.0.dist-info/WHEEL +5 -0
- powerbi_ontology_extractor-0.1.0.dist-info/entry_points.txt +4 -0
- powerbi_ontology_extractor-0.1.0.dist-info/licenses/LICENSE +21 -0
- powerbi_ontology_extractor-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,369 @@
"""
Schema Mapper

Maps logical ontology entities to physical data sources and detects schema drift.
This prevents the $4.6M mistake by validating schema bindings.
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from powerbi_ontology.ontology_generator import Ontology, OntologyEntity

logger = logging.getLogger(__name__)


@dataclass
class SchemaBinding:
    """Maps a logical entity to a physical data source."""
    entity: str
    physical_source: str  # e.g., "SQL.dbo.customers"
    property_mappings: Dict[str, str] = field(default_factory=dict)  # logical -> physical
    source_type: str = "sql"  # "sql", "azure_sql", "fabric", etc.


@dataclass
class ValidationResult:
    """Result of schema binding validation."""
    is_valid: bool
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    message: str = ""


@dataclass
class DriftReport:
    """Report of schema drift detection."""
    entity: str
    missing_columns: List[str] = field(default_factory=list)
    new_columns: List[str] = field(default_factory=list)
    type_changes: Dict[str, str] = field(default_factory=dict)  # column -> "old_type -> new_type"
    renamed_columns: Dict[str, str] = field(default_factory=dict)  # old_name -> new_name
    severity: str = "INFO"  # "INFO", "WARNING", "CRITICAL"
    message: str = ""


@dataclass
class Fix:
    """Suggested fix for schema drift."""
    type: str  # "update_mapping", "add_column", "remove_column"
    description: str
    action: str  # SQL or mapping update
    entity: str = ""
    property: str = ""


class SchemaMapper:
    """
    Maps ontology entities to physical data sources and detects schema drift.

    This is the CRITICAL piece that prevents the $4.6M mistake by detecting
    when columns are renamed or deleted before AI agents use them.
    """

    def __init__(self, ontology: Ontology, data_source: Optional[str] = None):
        """
        Initialize the schema mapper.

        Args:
            ontology: The ontology to map
            data_source: Optional default data source
        """
        self.ontology = ontology
        self.data_source = data_source
        self.bindings: Dict[str, SchemaBinding] = {}

    def create_binding(
        self,
        entity_name: str,
        physical_table: str,
        property_mappings: Optional[Dict[str, str]] = None
    ) -> SchemaBinding:
        """
        Create a schema binding between a logical entity and a physical table.

        Args:
            entity_name: Name of the ontology entity
            physical_table: Physical table name (e.g., "dbo.customers")
            property_mappings: Optional explicit property mappings

        Returns:
            SchemaBinding object
        """
        entity = next((e for e in self.ontology.entities if e.name == entity_name), None)
        if not entity:
            raise ValueError(f"Entity not found: {entity_name}")

        # Auto-generate mappings if not provided
        if not property_mappings:
            property_mappings = {}
            for prop in entity.properties:
                # Default: the property name converted to snake_case is the column name
                physical_name = self._to_snake_case(prop.name)
                property_mappings[prop.name] = physical_name

        binding = SchemaBinding(
            entity=entity_name,
            physical_source=physical_table,
            property_mappings=property_mappings,
            source_type=self._detect_source_type(physical_table)
        )

        self.bindings[entity_name] = binding
        logger.info(f"Created binding: {entity_name} -> {physical_table}")

        return binding

    def validate_binding(self, binding: SchemaBinding) -> ValidationResult:
        """
        Validate a schema binding.

        Args:
            binding: SchemaBinding to validate

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        # Check that the entity exists in the ontology
        entity = next((e for e in self.ontology.entities if e.name == binding.entity), None)
        if not entity:
            errors.append(f"Entity '{binding.entity}' not found in ontology")
            return ValidationResult(
                is_valid=False,
                errors=errors,
                message=f"Entity '{binding.entity}' not found"
            )

        # Check that every mapped property exists on the entity
        for logical_prop in binding.property_mappings:
            prop_exists = any(p.name == logical_prop for p in entity.properties)
            if not prop_exists:
                warnings.append(
                    f"Property '{logical_prop}' mapped but not found in entity '{binding.entity}'"
                )

        # Note: full validation would require a connection to the actual data
        # source; this basic check only validates the logical side of the mapping.

        is_valid = len(errors) == 0
        message = "Binding is valid" if is_valid else f"Found {len(errors)} errors"

        return ValidationResult(
            is_valid=is_valid,
            errors=errors,
            warnings=warnings,
            message=message
        )

    def detect_drift(
        self,
        binding: SchemaBinding,
        current_schema: Dict[str, Any]
    ) -> DriftReport:
        """
        Detect schema drift between a binding and the current physical schema.

        This is the function that prevents the $4.6M mistake!

        Args:
            binding: SchemaBinding to check
            current_schema: Current physical schema (dict of column_name -> type)

        Returns:
            DriftReport
        """
        type_changes: Dict[str, str] = {}
        renamed_columns: Dict[str, str] = {}

        # Get expected columns from the binding
        expected_columns = set(binding.property_mappings.values())
        actual_columns = set(current_schema.keys())

        # Find missing columns (the $4.6M problem!)
        missing_columns = list(expected_columns - actual_columns)

        # Find new columns
        new_columns = list(actual_columns - expected_columns)

        # Check for type changes
        entity = next((e for e in self.ontology.entities if e.name == binding.entity), None)
        for logical_prop, physical_col in binding.property_mappings.items():
            if entity and physical_col in current_schema:
                prop = next((p for p in entity.properties if p.name == logical_prop), None)
                if prop:
                    expected_type = prop.data_type
                    actual_type = current_schema[physical_col]
                    if expected_type != actual_type:
                        type_changes[physical_col] = f"{expected_type} -> {actual_type}"

        # Heuristic: a missing column with a similar-looking new column was
        # probably renamed. Iterate over copies so the lists can be mutated
        # safely inside the loops.
        if missing_columns and new_columns:
            for missing in list(missing_columns):
                for new_col in list(new_columns):
                    if self._similar_names(missing, new_col):
                        renamed_columns[missing] = new_col
                        # Remove from missing/new once a match is found
                        if missing in missing_columns:
                            missing_columns.remove(missing)
                        if new_col in new_columns:
                            new_columns.remove(new_col)

        # Determine severity
        severity = "INFO"
        if missing_columns:
            severity = "CRITICAL"  # This is the $4.6M mistake scenario!
        elif type_changes or renamed_columns:
            severity = "WARNING"

        # Build message
        message_parts = []
        if missing_columns:
            message_parts.append(
                f"CRITICAL: Missing columns: {', '.join(missing_columns)}. "
                f"This could cause the $4.6M mistake!"
            )
        if renamed_columns:
            message_parts.append(
                f"Columns may have been renamed: {', '.join(f'{k} -> {v}' for k, v in renamed_columns.items())}"
            )
        if type_changes:
            message_parts.append(f"Type changes detected: {', '.join(f'{k}: {v}' for k, v in type_changes.items())}")
        if new_columns:
            message_parts.append(f"New columns found: {', '.join(new_columns)}")

        message = " | ".join(message_parts) if message_parts else "No drift detected"

        return DriftReport(
            entity=binding.entity,
            missing_columns=missing_columns,
            new_columns=new_columns,
            type_changes=type_changes,
            renamed_columns=renamed_columns,
            severity=severity,
            message=message
        )

    def suggest_fix(self, drift_report: DriftReport) -> List[Fix]:
        """
        Suggest fixes for detected drift.

        Args:
            drift_report: DriftReport with detected issues

        Returns:
            List of Fix suggestions
        """
        fixes = []
        binding = self.bindings.get(drift_report.entity)

        if not binding:
            return fixes

        # Fix for renamed columns
        for old_name, new_name in drift_report.renamed_columns.items():
            fixes.append(Fix(
                type="update_mapping",
                description=f"Update mapping: {old_name} -> {new_name}",
                action=f"Update binding.property_mappings['{old_name}'] = '{new_name}'",
                entity=drift_report.entity,
                property=old_name
            ))

        # Fix for missing columns
        for missing_col in drift_report.missing_columns:
            fixes.append(Fix(
                type="update_mapping",
                description=f"Column '{missing_col}' not found. Check if renamed or deleted.",
                action=f"Verify column exists: SELECT * FROM {binding.physical_source} WHERE 1=0",
                entity=drift_report.entity,
                property=missing_col
            ))

        # Fix for new columns
        for new_col in drift_report.new_columns:
            fixes.append(Fix(
                type="add_column",
                description=f"New column '{new_col}' found. Consider adding to ontology.",
                action=f"Review and potentially add '{new_col}' to entity '{drift_report.entity}'",
                entity=drift_report.entity,
                property=new_col
            ))

        return fixes

    def generate_binding_yaml(self, ontology: Ontology) -> str:
        """
        Generate a YAML configuration for the current schema bindings.

        Args:
            ontology: The ontology

        Returns:
            YAML string (empty if PyYAML is not installed)
        """
        try:
            import yaml
        except ImportError:
            logger.warning("PyYAML not installed. Cannot generate YAML.")
            return ""

        yaml_data = {
            "ontology": {
                "name": ontology.name,
                "version": ontology.version
            },
            "entities": {}
        }

        for entity_name, binding in self.bindings.items():
            yaml_data["entities"][entity_name] = {
                "source": binding.physical_source,
                "source_type": binding.source_type,
                "mappings": binding.property_mappings
            }

        return yaml.dump(yaml_data, default_flow_style=False, sort_keys=False)

    def _to_snake_case(self, name: str) -> str:
        """Convert a property name to snake_case."""
        # Insert an underscore before each uppercase run, then lowercase
        s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

    def _detect_source_type(self, physical_table: str) -> str:
        """Detect the source type from the table name."""
        name = physical_table.lower()
        if "azure" in name or "sql" in name:
            return "azure_sql"
        elif "fabric" in name or "onelake" in name:
            return "fabric"
        else:
            return "sql"

    def _similar_names(self, name1: str, name2: str) -> bool:
        """Check whether two names are similar (heuristic for rename detection)."""
        name1_lower = name1.lower().replace("_", "").replace("-", "")
        name2_lower = name2.lower().replace("_", "").replace("-", "")

        # Check if one name contains the other
        if name1_lower in name2_lower or name2_lower in name1_lower:
            return True

        # Fallback: rough character-overlap similarity (not a true edit
        # distance) for names of comparable length
        if abs(len(name1_lower) - len(name2_lower)) <= 3:
            common_chars = sum(1 for c in name1_lower if c in name2_lower)
            similarity = common_chars / max(len(name1_lower), len(name2_lower))
            return similarity > 0.7

        return False
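
To make the binding/drift/fix flow concrete, here is a minimal usage sketch; it is not part of the package. The "Customer" entity, its properties, and the dbo.customers table are hypothetical, and SimpleNamespace stand-ins mimic only the attributes SchemaMapper actually reads (entities, name, properties, data_type), since the real Ontology type is defined in ontology_generator.py rather than in this file. Building current_schema from the live source (e.g., from INFORMATION_SCHEMA.COLUMNS) is left to the caller, as the module's own comments note.

from types import SimpleNamespace

from powerbi_ontology.schema_mapper import SchemaMapper

# Hypothetical entity: three properties with SQL-style data types.
customer = SimpleNamespace(
    name="Customer",
    properties=[
        SimpleNamespace(name="CustomerId", data_type="int"),
        SimpleNamespace(name="CustomerName", data_type="varchar"),
        SimpleNamespace(name="Email", data_type="varchar"),
    ],
)
ontology = SimpleNamespace(name="Sales", version="1.0.0", entities=[customer])

mapper = SchemaMapper(ontology)

# Auto-generated mappings snake_case the property names:
# {"CustomerId": "customer_id", "CustomerName": "customer_name", "Email": "email"}
binding = mapper.create_binding("Customer", "dbo.customers")

# Simulated live schema after someone renamed email -> email_address.
current_schema = {
    "customer_id": "int",
    "customer_name": "varchar",
    "email_address": "varchar",
}

report = mapper.detect_drift(binding, current_schema)
print(report.severity)         # WARNING (the rename heuristic matched)
print(report.renamed_columns)  # {'email': 'email_address'}

for fix in mapper.suggest_fix(report):
    print(f"[{fix.type}] {fix.description}")
    # [update_mapping] Update mapping: email -> email_address

Note the design choice the sketch relies on: detect_drift never connects to a database itself. The caller supplies the live schema as a plain dict, so the same check can run against SQL Server, Fabric/OneLake, or a cached snapshot of either.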