powerbi-ontology-extractor 0.1.0__py3-none-any.whl

@@ -0,0 +1,420 @@
"""
Semantic Model Analyzer

Analyzes multiple Power BI semantic models to detect conflicts and calculate semantic debt.
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List

from powerbi_ontology.extractor import SemanticModel, Measure
from powerbi_ontology.dax_parser import DAXParser

logger = logging.getLogger(__name__)


@dataclass
class Conflict:
    """Represents a semantic conflict between dashboards."""
    concept: str  # e.g., "HighRiskCustomer"
    dashboard1: str
    definition1: str
    dashboard2: str
    definition2: str
    severity: str = "MEDIUM"  # "LOW", "MEDIUM", "HIGH", or "CRITICAL"
    description: str = ""


@dataclass
class Duplication:
    """Represents duplicated logic across dashboards."""
    measure_name: str
    dashboards: List[str]
    dax_formula: str
    description: str = ""


@dataclass
class CanonicalEntity:
    """Suggested canonical definition for an entity."""
    name: str
    suggested_definition: str
    dashboards_using: List[str]
    alternative_definitions: Dict[str, str] = field(default_factory=dict)
    confidence: float = 0.0


@dataclass
class SemanticDebtReport:
    """Report of semantic debt calculation."""
    total_conflicts: int
    total_duplications: int
    cost_per_conflict: float = 50000.0  # $50K per conflict
    total_cost: float = 0.0
    conflicts_by_severity: Dict[str, int] = field(default_factory=dict)
    message: str = ""


class SemanticAnalyzer:
    """
    Analyzes multiple Power BI semantic models to:
    - Detect semantic conflicts
    - Identify duplicate logic
    - Calculate semantic debt
    - Suggest canonical definitions
    """

    def __init__(self, semantic_models: List[SemanticModel]):
        """
        Initialize analyzer.

        Args:
            semantic_models: List of semantic models to analyze
        """
        self.semantic_models = semantic_models
        self.dax_parser = DAXParser()
        # Maps source file -> model; assumes source files are unique per model.
        self._model_map = {model.source_file: model for model in semantic_models}

    def detect_conflicts(self) -> List[Conflict]:
        """
        Detect conflicting definitions across dashboards.

        Example:
            - Dashboard A: HighRiskCustomer = RiskScore > 80
            - Dashboard B: HighRiskCustomer = ChurnProbability > 0.7
            - Conflict: same concept, different definitions!

        Returns:
            List of Conflict objects
        """
        conflicts = []

        # Group measures by name (case-insensitive)
        measures_by_name: Dict[str, List[tuple]] = {}  # name -> [(model, measure), ...]

        for model in self.semantic_models:
            for measure in model.measures:
                measure_key = measure.name.lower()
                if measure_key not in measures_by_name:
                    measures_by_name[measure_key] = []
                measures_by_name[measure_key].append((model, measure))

        # Find conflicts: same measure name, different definitions
        for measure_name, measure_list in measures_by_name.items():
            if len(measure_list) > 1:
                # Compare all pairs
                for i, (model1, measure1) in enumerate(measure_list):
                    for model2, measure2 in measure_list[i + 1:]:
                        if measure1.dax_formula != measure2.dax_formula:
                            # Conflict detected!
                            conflict = Conflict(
                                concept=measure_name,
                                dashboard1=model1.source_file,
                                definition1=measure1.dax_formula,
                                dashboard2=model2.source_file,
                                definition2=measure2.dax_formula,
                                severity=self._determine_severity(measure1.dax_formula, measure2.dax_formula),
                                description=f"'{measure_name}' defined differently in {model1.source_file} vs {model2.source_file}"
                            )
                            conflicts.append(conflict)
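        # Note: pairwise comparison can report up to n*(n-1)/2 conflicts for a
        # measure that appears with n distinct definitions across dashboards.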

        # Also check for entity definition conflicts
        entities_by_name: Dict[str, List[tuple]] = {}
        for model in self.semantic_models:
            for entity in model.entities:
                entity_key = entity.name.lower()
                if entity_key not in entities_by_name:
                    entities_by_name[entity_key] = []
                entities_by_name[entity_key].append((model, entity))

        for entity_name, entity_list in entities_by_name.items():
            if len(entity_list) > 1:
                # Check for property differences
                for i, (model1, entity1) in enumerate(entity_list):
                    for model2, entity2 in entity_list[i + 1:]:
                        props1 = {p.name: p.data_type for p in entity1.properties}
                        props2 = {p.name: p.data_type for p in entity2.properties}

                        if props1 != props2:
                            conflict = Conflict(
                                concept=entity_name,
                                dashboard1=model1.source_file,
                                definition1=f"{len(entity1.properties)} properties",
                                dashboard2=model2.source_file,
                                definition2=f"{len(entity2.properties)} properties",
                                severity="MEDIUM",
                                description=f"Entity '{entity_name}' has different properties across dashboards"
                            )
                            conflicts.append(conflict)

        logger.info(f"Detected {len(conflicts)} conflicts")
        return conflicts

    def identify_duplicate_logic(self) -> List[Duplication]:
        """
        Identify duplicated DAX measures across dashboards.

        Returns:
            List of Duplication objects
        """
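        # e.g. (hypothetical names): "Total Sales" defined as SUM(Sales[Amount])
        # in three dashboards is a true duplication; "Total Sales" and "Revenue"
        # sharing that same formula is a consolidation candidate.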
        duplications = []

        # Group measures by formula (normalized)
        measures_by_formula: Dict[str, List[tuple]] = {}

        for model in self.semantic_models:
            for measure in model.measures:
                # Normalize formula (remove whitespace, case-insensitive)
                normalized = self._normalize_formula(measure.dax_formula)
                if normalized not in measures_by_formula:
                    measures_by_formula[normalized] = []
                measures_by_formula[normalized].append((model, measure))

        # Find duplications (same formula, different names or dashboards)
        for formula, measure_list in measures_by_formula.items():
            if len(measure_list) > 1:
                dashboards = [model.source_file for model, _ in measure_list]
                measure_names = [measure.name for _, measure in measure_list]

                # Check if same name or different names
                if len(set(measure_names)) == 1:
                    # Same name, same formula - true duplication
                    duplication = Duplication(
                        measure_name=measure_names[0],
                        dashboards=dashboards,
                        dax_formula=measure_list[0][1].dax_formula,
                        description=f"Same measure '{measure_names[0]}' duplicated across {len(dashboards)} dashboards"
                    )
                else:
                    # Different names, same formula - opportunity for consolidation
                    duplication = Duplication(
                        measure_name=f"{measure_names[0]} (and {len(measure_names) - 1} others)",
                        dashboards=dashboards,
                        dax_formula=measure_list[0][1].dax_formula,
                        description=f"Same logic with different names: {', '.join(measure_names)}"
                    )
                duplications.append(duplication)

        logger.info(f"Identified {len(duplications)} duplications")
        return duplications

    def calculate_semantic_debt(self) -> SemanticDebtReport:
        """
        Calculate semantic debt from conflicts.

        From the article: $50K per conflict to reconcile.

        Returns:
            SemanticDebtReport
        """
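        # Worked example: 3 conflicts and 2 duplications
        # -> 3 × $50,000 + 2 × $10,000 = $170,000 of semantic debt.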
        conflicts = self.detect_conflicts()
        duplications = self.identify_duplicate_logic()

        # Count by severity
        conflicts_by_severity = {}
        for conflict in conflicts:
            severity = conflict.severity
            conflicts_by_severity[severity] = conflicts_by_severity.get(severity, 0) + 1

        # Calculate cost
        cost_per_conflict = 50000.0  # $50K per conflict
        total_cost = len(conflicts) * cost_per_conflict

        # Add cost for duplications (lower cost, but still significant)
        cost_per_duplication = 10000.0  # $10K per duplication
        duplication_cost = len(duplications) * cost_per_duplication
        total_cost += duplication_cost

        report = SemanticDebtReport(
            total_conflicts=len(conflicts),
            total_duplications=len(duplications),
            cost_per_conflict=cost_per_conflict,
            total_cost=total_cost,
            conflicts_by_severity=conflicts_by_severity,
            message=(
                f"Total semantic debt: ${total_cost:,.0f} "
                f"({len(conflicts)} conflicts × ${cost_per_conflict:,.0f} "
                f"+ {len(duplications)} duplications × ${cost_per_duplication:,.0f})"
            )
        )

        logger.info(f"Semantic debt calculated: ${total_cost:,.0f}")
        return report

    def suggest_canonical_definitions(self) -> List[CanonicalEntity]:
        """
        Suggest canonical definitions for entities/concepts.

        Returns:
            List of CanonicalEntity suggestions
        """
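        # The most common normalized formula wins: e.g. if 3 of 4 dashboards
        # agree on one definition, it is suggested with confidence 3/4 = 0.75.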
        canonical_entities = []

        # Group measures by name
        measures_by_name: Dict[str, List[tuple]] = {}
        for model in self.semantic_models:
            for measure in model.measures:
                key = measure.name.lower()
                if key not in measures_by_name:
                    measures_by_name[key] = []
                measures_by_name[key].append((model, measure))

        # For each measure with multiple definitions, suggest canonical
        for measure_name, measure_list in measures_by_name.items():
            if len(measure_list) > 1:
                # Find most common definition
                formula_counts: Dict[str, int] = {}
                for model, measure in measure_list:
                    normalized = self._normalize_formula(measure.dax_formula)
                    formula_counts[normalized] = formula_counts.get(normalized, 0) + 1

                # Most common is the suggested canonical
                most_common_formula, count = max(formula_counts.items(), key=lambda x: x[1])
                confidence = count / len(measure_list)

                # Suggest an original (un-normalized) formula, not the mangled
                # normalized form used for comparison
                suggested_def = next(
                    measure.dax_formula for _, measure in measure_list
                    if self._normalize_formula(measure.dax_formula) == most_common_formula
                )

                # Get dashboards using this definition
                dashboards_using = [
                    model.source_file for model, measure in measure_list
                    if self._normalize_formula(measure.dax_formula) == most_common_formula
                ]

                # Get alternative definitions
                alternative_defs = {}
                for model, measure in measure_list:
                    normalized = self._normalize_formula(measure.dax_formula)
                    if normalized != most_common_formula:
                        alternative_defs[model.source_file] = measure.dax_formula

                canonical = CanonicalEntity(
                    name=measure_name,
                    suggested_definition=suggested_def,
                    dashboards_using=dashboards_using,
                    alternative_definitions=alternative_defs,
                    confidence=confidence
                )
                canonical_entities.append(canonical)

        logger.info(f"Suggested {len(canonical_entities)} canonical definitions")
        return canonical_entities

    def generate_consolidation_report(self, output_path: str):
        """
        Generate an HTML report showing analysis results.

        Args:
            output_path: Path to save report
        """
        conflicts = self.detect_conflicts()
        duplications = self.identify_duplicate_logic()
        # Note: calculate_semantic_debt() re-runs conflict/duplication detection internally.
        debt_report = self.calculate_semantic_debt()
        canonical_defs = self.suggest_canonical_definitions()

        # Generate HTML report
        html = self._generate_html_report(
            conflicts, duplications, debt_report, canonical_defs
        )

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)

        logger.info(f"Generated consolidation report: {output_path}")

    def _determine_severity(self, formula1: str, formula2: str) -> str:
        """Determine conflict severity based on formula differences."""
        # Simple heuristic: formatting-only differences are low severity
        if self._normalize_formula(formula1) == self._normalize_formula(formula2):
            return "LOW"

        # Check if they're similar threshold-based formulas
        if ">" in formula1 and ">" in formula2:
            # Extract numeric thresholds (integers or decimals)
            thresholds1 = re.findall(r'[><=]+\s*(\d+(?:\.\d+)?)', formula1)
            thresholds2 = re.findall(r'[><=]+\s*(\d+(?:\.\d+)?)', formula2)
            if thresholds1 and thresholds2:
                if abs(float(thresholds1[0]) - float(thresholds2[0])) > 20:
                    return "HIGH"

        return "MEDIUM"
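        # e.g. "RiskScore > 80" vs "RiskScore > 85" -> MEDIUM (threshold gap 5);
        # "RiskScore > 80" vs "RiskScore > 50" -> HIGH (threshold gap 30 > 20).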

    def _normalize_formula(self, formula: str) -> str:
        """Normalize DAX formula for comparison."""
        # Remove whitespace, convert to lowercase
        normalized = formula.replace(" ", "").replace("\n", "").replace("\t", "").lower()
        return normalized
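        # e.g. 'SUM ( Sales[Amount] )' -> 'sum(sales[amount])'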

    def _generate_html_report(
        self,
        conflicts: List[Conflict],
        duplications: List[Duplication],
        debt_report: SemanticDebtReport,
        canonical_defs: List[CanonicalEntity]
    ) -> str:
        """Generate HTML report."""
        # Note: values are interpolated without HTML escaping; formulas
        # containing '<' or '&' would need html.escape() to render safely.
        html = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Semantic Debt Analysis Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ color: #333; }}
        h2 {{ color: #666; margin-top: 30px; }}
        .conflict {{ border-left: 4px solid #f44336; padding: 10px; margin: 10px 0; background: #ffebee; }}
        .duplication {{ border-left: 4px solid #ff9800; padding: 10px; margin: 10px 0; background: #fff3e0; }}
        .debt {{ border: 2px solid #f44336; padding: 20px; margin: 20px 0; background: #ffebee; }}
        table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #4CAF50; color: white; }}
    </style>
</head>
<body>
    <h1>Semantic Debt Analysis Report</h1>

    <div class="debt">
        <h2>Total Semantic Debt</h2>
        <p><strong>${debt_report.total_cost:,.0f}</strong></p>
        <p>{debt_report.message}</p>
        <p>Conflicts: {debt_report.total_conflicts}</p>
        <p>Duplications: {debt_report.total_duplications}</p>
    </div>

    <h2>Conflicts Detected ({len(conflicts)})</h2>
    {"".join(f'''
    <div class="conflict">
        <h3>{conflict.concept}</h3>
        <p><strong>Severity:</strong> {conflict.severity}</p>
        <p><strong>{conflict.dashboard1}:</strong> {conflict.definition1}</p>
        <p><strong>{conflict.dashboard2}:</strong> {conflict.definition2}</p>
        <p>{conflict.description}</p>
    </div>
    ''' for conflict in conflicts)}

    <h2>Duplications Identified ({len(duplications)})</h2>
    {"".join(f'''
    <div class="duplication">
        <h3>{dup.measure_name}</h3>
        <p><strong>Dashboards:</strong> {', '.join(dup.dashboards)}</p>
        <p><strong>Formula:</strong> <code>{dup.dax_formula}</code></p>
        <p>{dup.description}</p>
    </div>
    ''' for dup in duplications)}

    <h2>Canonical Definition Suggestions ({len(canonical_defs)})</h2>
    <table>
        <tr>
            <th>Concept</th>
            <th>Suggested Definition</th>
            <th>Confidence</th>
            <th>Dashboards Using</th>
        </tr>
        {"".join(f'''
        <tr>
            <td>{canon.name}</td>
            <td><code>{canon.suggested_definition[:100]}{"..." if len(canon.suggested_definition) > 100 else ""}</code></td>
            <td>{canon.confidence:.0%}</td>
            <td>{len(canon.dashboards_using)}</td>
        </tr>
        ''' for canon in canonical_defs)}
    </table>
</body>
</html>
"""
        return html
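
For reference, a minimal usage sketch (illustrative only, not part of the package): it assumes SemanticModel instances have already been produced by powerbi_ontology.extractor, and the module path powerbi_ontology.analyzer and the output file name are placeholders.

from powerbi_ontology.analyzer import SemanticAnalyzer  # module path assumed

models = ...  # List[SemanticModel] from powerbi_ontology.extractor (extraction API not shown in this diff)
analyzer = SemanticAnalyzer(models)

# Print each conflicting definition pair with its severity
for conflict in analyzer.detect_conflicts():
    print(f"[{conflict.severity}] {conflict.concept}: "
          f"{conflict.dashboard1} vs {conflict.dashboard2}")

# Summarize the estimated cost of reconciling the conflicts
debt = analyzer.calculate_semantic_debt()
print(debt.message)  # e.g. "Total semantic debt: $170,000 (...)"

# Write the full HTML report to disk
analyzer.generate_consolidation_report("semantic_debt_report.html")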