powerbi-ontology-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,558 @@
1
+ """
2
+ PBIX Reader Utility
3
+
4
+ Reads Power BI .pbix files using PBIXRay library to parse binary DataModel.
5
+ Supports both modern .pbix files (binary DataModel) and legacy files (model.bim JSON).
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import tempfile
11
+ import zipfile
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Try to import pbixray for modern .pbix parsing
18
+ try:
19
+ from pbixray import PBIXRay
20
+ PBIXRAY_AVAILABLE = True
21
+ except ImportError:
22
+ PBIXRAY_AVAILABLE = False
23
+ logger.warning("pbixray not installed. Install with: pip install pbixray")
24
+
25
+
26
class PBIXReader:
    """
    Reads Power BI .pbix files and extracts semantic model information.

    Uses PBIXRay library to parse binary DataModel (XPress9 compressed).
    Falls back to JSON model.bim for legacy/export files.

    .pbix files are ZIP archives containing:
    - DataModel (binary, XPress9 compressed) - modern files
    - DataModel/model.bim (JSON) - legacy/exported files
    - Report/Layout (JSON, UTF-16) - report visualizations
    - [DataMashup] - Power Query M code

    The reader initializes itself lazily: every public accessor makes sure
    the archive has been extracted (and PBIXRay probed) before reading, so
    the class works both as a context manager and through direct calls.
    """

    # PBIXRay cardinality codes -> (fromCardinality, toCardinality).
    _CARDINALITY_MAP = {
        "M:1": ("many", "one"),
        "1:M": ("one", "many"),
        "1:1": ("one", "one"),
        "M:M": ("many", "many"),
    }
    _DEFAULT_CARDINALITY = ("many", "one")

    # pandas dtype name -> Power BI data type name.
    _PANDAS_TYPE_MAP = {
        "string": "string",
        "object": "string",
        "int64": "int64",
        "Int64": "int64",
        "float64": "double",
        "Float64": "double",
        "bool": "boolean",
        "datetime64[ns]": "datetime",
        "datetime64": "datetime",
    }

    def __init__(self, pbix_path: str):
        """
        Initialize PBIX reader.

        Args:
            pbix_path: Path to the .pbix file

        Raises:
            FileNotFoundError: If ``pbix_path`` does not exist.
        """
        self.pbix_path = Path(pbix_path)
        if not self.pbix_path.exists():
            raise FileNotFoundError(f"Power BI file not found: {pbix_path}")

        self.temp_dir: Optional[Path] = None
        self._pbixray: Optional[Any] = None
        self._model_data: Optional[Dict] = None
        self._use_pbixray: bool = False
        self._tables_cache: Optional[List[Dict]] = None
        self._relationships_cache: Optional[List[Dict]] = None
        self._measures_cache: Optional[List[Dict]] = None

    def __enter__(self):
        """Context manager entry: extract the archive and return the reader."""
        self.extract_to_temp()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - cleanup temp files."""
        self.cleanup()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _ensure_extracted(self) -> None:
        """Extract the archive (and probe PBIXRay) if not done yet."""
        if not self.temp_dir:
            self.extract_to_temp()

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True for None, NaN/NaT and pandas NA-like values."""
        if value is None:
            return True
        try:
            # NaN and NaT are the only common values unequal to themselves.
            return bool(value != value)
        except (TypeError, ValueError):
            # e.g. pandas.NA: the comparison yields NA, whose truth value
            # is ambiguous and raises.
            return True

    @classmethod
    def _text_or_empty(cls, value: Any) -> Any:
        """Return ``value`` unchanged, or "" when it is missing or falsy."""
        if cls._is_missing(value):
            return ""
        return value if value else ""

    @staticmethod
    def _decode_json_bytes(raw: bytes, default: str = "utf-8") -> str:
        """
        Decode JSON bytes, honouring a BOM and BOM-less UTF-16-LE.

        Args:
            raw: File content as bytes.
            default: Encoding used when no BOM / UTF-16 pattern is detected.

        Returns:
            Decoded text without a leading byte-order mark.
        """
        if raw.startswith(b"\xef\xbb\xbf"):
            return raw[3:].decode("utf-8")
        if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
            # The "utf-16" codec consumes the BOM and picks the byte order.
            return raw.decode("utf-16")
        # JSON starts with an ASCII character; in UTF-16-LE that character
        # is followed by a NUL byte.
        if len(raw) >= 2 and raw[0] != 0 and raw[1] == 0:
            return raw.decode("utf-16-le")
        return raw.decode(default)

    def _unwrapped_model(self) -> Dict:
        """Return the model dict, unwrapping a top-level "model" key."""
        model = self.read_model()
        if not isinstance(model, dict):
            return {}
        inner = model.get("model")
        return inner if isinstance(inner, dict) else model

    def _measure_from_row(self, row: Any, include_table: bool = False) -> Dict:
        """Convert one PBIXRay ``dax_measures`` row to a measure dict."""
        measure = {
            "name": row['Name'],
            "expression": self._text_or_empty(row['Expression']),
            "displayFolder": self._text_or_empty(row['DisplayFolder']),
            "description": self._text_or_empty(row['Description']),
        }
        if include_table:
            measure["table"] = row['TableName']
        return measure

    # ------------------------------------------------------------------
    # Extraction / model loading
    # ------------------------------------------------------------------

    def extract_to_temp(self) -> Path:
        """
        Extract .pbix file to temporary directory (for legacy support).
        Also initializes PBIXRay if available.

        Returns:
            Path to temporary extraction directory

        Raises:
            ValueError: If the file is not a valid ZIP archive.
            RuntimeError: If extraction fails for any other reason.
        """
        if self.temp_dir:
            return self.temp_dir

        # Try PBIXRay first for modern .pbix files
        if PBIXRAY_AVAILABLE:
            try:
                self._pbixray = PBIXRay(str(self.pbix_path))
                # Test if we can read tables
                _ = self._pbixray.tables
                self._use_pbixray = True
                logger.info("Using PBIXRay for %s", self.pbix_path)
            except Exception as e:
                logger.warning("PBIXRay failed, falling back to JSON: %s", e)
                self._use_pbixray = False
                self._pbixray = None

        # Extract to temp for fallback/additional data. The directory is
        # only published on self.temp_dir once extraction succeeded, so a
        # failed attempt neither leaks it nor looks "already extracted".
        import shutil

        extract_dir: Optional[Path] = None
        try:
            extract_dir = Path(tempfile.mkdtemp(prefix="pbix_extract_"))
            with zipfile.ZipFile(self.pbix_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
        except zipfile.BadZipFile as e:
            if extract_dir is not None:
                shutil.rmtree(extract_dir, ignore_errors=True)
            raise ValueError(f"Invalid .pbix file format: {self.pbix_path}") from e
        except Exception as e:
            if extract_dir is not None:
                shutil.rmtree(extract_dir, ignore_errors=True)
            raise RuntimeError(f"Failed to extract .pbix file: {e}") from e

        self.temp_dir = extract_dir
        logger.info("Extracted .pbix file to %s", self.temp_dir)
        return self.temp_dir

    def read_model(self) -> Dict:
        """
        Read and parse the semantic model data.
        Uses PBIXRay for binary DataModel, falls back to JSON.

        Returns:
            Parsed model data as dict
        """
        if self._model_data is not None:
            return self._model_data

        self._ensure_extracted()

        if self._use_pbixray:
            # Build model dict from PBIXRay data
            self._model_data = self._build_model_from_pbixray()
            return self._model_data

        # Fallback to JSON model.bim
        return self._read_model_json()

    def _build_model_from_pbixray(self) -> Dict:
        """Build model dict from PBIXRay data."""
        model = {
            "name": self.pbix_path.stem,
            "tables": self.get_tables(),
            "relationships": self.get_relationships(),
        }
        return {"model": model}

    def _read_model_json(self) -> Dict:
        """
        Read model from JSON file (legacy support).

        Raises:
            FileNotFoundError: If no model file exists in the archive.
            ValueError: If the model file is not valid JSON.
            RuntimeError: If the file cannot be read or decoded.
        """
        # Try different possible paths for model.bim
        candidates = (
            self.temp_dir / "DataModel" / "model.bim",
            self.temp_dir / "model.bim",
            self.temp_dir / "DataModelSchema",
        )
        model_path = next((p for p in candidates if p.exists()), None)

        if model_path is None:
            # Try to find any .bim file
            model_path = next(iter(self.temp_dir.rglob("*.bim")), None)
            if model_path is None:
                raise FileNotFoundError(
                    f"model.bim not found and PBIXRay not available for: {self.pbix_path}"
                )

        try:
            # Read bytes and sniff the encoding: DataModelSchema files are
            # typically UTF-16-LE while model.bim is UTF-8.
            text = self._decode_json_bytes(model_path.read_bytes())
            self._model_data = json.loads(text)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in model.bim: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Failed to read model.bim: {e}") from e

        logger.info("Successfully read model.bim from %s", model_path)
        return self._model_data

    # ------------------------------------------------------------------
    # Tables
    # ------------------------------------------------------------------

    def get_tables(self) -> List[Dict]:
        """
        Extract table definitions from model.

        Returns:
            List of table definitions with columns
        """
        if self._tables_cache is not None:
            return self._tables_cache

        self._ensure_extracted()

        if self._use_pbixray:
            self._tables_cache = self._get_tables_pbixray()
            return self._tables_cache

        # Fallback to JSON
        return self._get_tables_json()

    def _get_tables_pbixray(self) -> List[Dict]:
        """Get tables (columns and per-table measures) from PBIXRay."""
        pbix = self._pbixray

        # Read the frames once instead of on every loop iteration.
        schema_df = pbix.schema
        measures_df = pbix.dax_measures
        has_measures = measures_df is not None and not measures_df.empty

        tables = []
        for table_name in pbix.tables:
            # Get column info from schema
            table_schema = schema_df[schema_df['TableName'] == table_name]
            columns = [
                {
                    "name": row['ColumnName'],
                    "dataType": self._map_pandas_type(row['PandasDataType']),
                    "isNullable": True,
                    "isKey": False,
                    "isUnique": False,
                }
                for _, row in table_schema.iterrows()
            ]

            # Get measures for this table
            measures = []
            if has_measures:
                table_measures = measures_df[measures_df['TableName'] == table_name]
                measures = [
                    self._measure_from_row(row)
                    for _, row in table_measures.iterrows()
                ]

            tables.append({
                "name": table_name,
                "description": "",
                "columns": columns,
                "measures": measures,
                # Hierarchies are not extracted from PBIXRay data.
                "hierarchies": [],
            })

        return tables

    def _get_tables_json(self) -> List[Dict]:
        """Get tables from JSON model."""
        # Handles both {"model": {"tables": [...]}} and {"tables": [...]}.
        return self._unwrapped_model().get("tables") or []

    # ------------------------------------------------------------------
    # Relationships
    # ------------------------------------------------------------------

    def get_relationships(self) -> List[Dict]:
        """
        Extract relationship definitions from model.

        Returns:
            List of relationship definitions
        """
        if self._relationships_cache is not None:
            return self._relationships_cache

        self._ensure_extracted()

        if self._use_pbixray:
            self._relationships_cache = self._get_relationships_pbixray()
            return self._relationships_cache

        # Fallback to JSON
        return self._get_relationships_json()

    def _get_relationships_pbixray(self) -> List[Dict]:
        """Get relationships from PBIXRay."""
        rel_df = self._pbixray.relationships
        if rel_df is None or rel_df.empty:
            return []

        relationships = []
        for _, row in rel_df.iterrows():
            # Map cardinality (missing/unknown codes default to many-to-one)
            card = self._text_or_empty(row.get('Cardinality', 'M:1'))
            from_card, to_card = self._CARDINALITY_MAP.get(
                card, self._DEFAULT_CARDINALITY
            )

            # Map cross filter behavior
            cross_filter = self._text_or_empty(
                row.get('CrossFilteringBehavior', 'Single')
            )
            cross_filter_behavior = (
                "bothDirections" if cross_filter == "Both" else "singleDirection"
            )

            # Missing targets arrive as None/NaN; normalise them to "".
            to_table = self._text_or_empty(row['ToTableName'])
            to_column = self._text_or_empty(row['ToColumnName'])

            relationships.append({
                "fromTable": row['FromTableName'],
                "fromColumn": row['FromColumnName'],
                "toTable": to_table,
                "toColumn": to_column,
                "fromCardinality": from_card,
                "toCardinality": to_card,
                "crossFilteringBehavior": cross_filter_behavior,
                "isActive": bool(row.get('IsActive', True)),
                "name": f"{row['FromTableName']}_{to_table or 'Unknown'}",
            })

        return relationships

    def _get_relationships_json(self) -> List[Dict]:
        """Get relationships from JSON model."""
        return self._unwrapped_model().get("relationships") or []

    # ------------------------------------------------------------------
    # Measures
    # ------------------------------------------------------------------

    def get_measures(self) -> List[Dict]:
        """
        Extract DAX measures from all tables.

        Returns:
            List of measure definitions
        """
        if self._measures_cache is not None:
            return self._measures_cache

        self._ensure_extracted()

        if self._use_pbixray:
            self._measures_cache = self._get_measures_pbixray()
            return self._measures_cache

        # Fallback to JSON
        return self._get_measures_json()

    def _get_measures_pbixray(self) -> List[Dict]:
        """Get measures from PBIXRay, each tagged with its table name."""
        measures_df = self._pbixray.dax_measures
        if measures_df is None or measures_df.empty:
            return []

        return [
            self._measure_from_row(row, include_table=True)
            for _, row in measures_df.iterrows()
        ]

    def _get_measures_json(self) -> List[Dict]:
        """Get measures from JSON model, each tagged with its table name."""
        measures = []
        for table in self._get_tables_json():
            table_name = table.get("name", "Unknown")
            for measure in table.get("measures") or []:
                # Copy so the cached model data is not mutated.
                measures.append({**measure, "table": table_name})
        return measures

    # ------------------------------------------------------------------
    # PBIXRay-only extras
    # ------------------------------------------------------------------

    def get_power_query(self) -> List[Dict]:
        """
        Extract Power Query (M) expressions.

        Returns:
            List of Power Query expressions per table (empty without PBIXRay)
        """
        self._ensure_extracted()
        if not self._use_pbixray:
            return []

        pq_df = self._pbixray.power_query
        if pq_df is None or pq_df.empty:
            return []

        return [
            {"table": row['TableName'], "expression": row['Expression']}
            for _, row in pq_df.iterrows()
        ]

    def get_dax_columns(self) -> List[Dict]:
        """
        Extract calculated columns (DAX expressions).

        Returns:
            List of DAX column definitions (empty without PBIXRay)
        """
        self._ensure_extracted()
        if not self._use_pbixray:
            return []

        dax_df = self._pbixray.dax_columns
        if dax_df is None or dax_df.empty:
            return []

        return [
            {
                "table": row['TableName'],
                "name": row['ColumnName'],
                "expression": row['Expression'],
            }
            for _, row in dax_df.iterrows()
        ]

    def get_rls_rules(self) -> List[Dict]:
        """
        Extract Row-Level Security (RLS) rules.

        Returns:
            List of RLS rule definitions
        """
        # Decide between PBIXRay and JSON only after initialization;
        # otherwise a fresh reader would always take the JSON path.
        self._ensure_extracted()
        if not self._use_pbixray:
            # Try JSON fallback
            return self._get_rls_json()

        rls_df = self._pbixray.rls
        if rls_df is None or rls_df.empty:
            return []

        return [
            {
                "role": row.get('RoleName', ''),
                "table": row.get('TableName', ''),
                "filter_expression": row.get('FilterExpression', ''),
            }
            for _, row in rls_df.iterrows()
        ]

    def _get_rls_json(self) -> List[Dict]:
        """Get RLS rules from the "roles" section of the JSON model."""
        rules = []
        for role in self._unwrapped_model().get("roles") or []:
            role_name = role.get("name", "")
            for perm in role.get("tablePermissions") or []:
                # Permissions without a filter expression restrict nothing.
                if perm.get("filterExpression"):
                    rules.append({
                        "role": role_name,
                        "table": perm.get("name", ""),
                        "filter_expression": perm.get("filterExpression", ""),
                    })
        return rules

    def get_table_data(self, table_name: str) -> Optional[Any]:
        """
        Get actual data from a table (PBIXRay only).

        Args:
            table_name: Name of the table

        Returns:
            DataFrame with table data or None
        """
        self._ensure_extracted()
        if not self._use_pbixray:
            logger.warning("Table data extraction requires PBIXRay")
            return None

        try:
            return self._pbixray.get_table(table_name)
        except Exception as e:
            logger.error("Failed to get table data for %s: %s", table_name, e)
            return None

    def _map_pandas_type(self, pandas_type: str) -> str:
        """Map pandas dtype to Power BI data type ("string" if unknown)."""
        return self._PANDAS_TYPE_MAP.get(pandas_type, "string")

    # ------------------------------------------------------------------
    # Lifecycle / report
    # ------------------------------------------------------------------

    def cleanup(self):
        """Remove temporary extraction directory and drop the PBIXRay handle."""
        if self.temp_dir:
            import shutil
            try:
                if self.temp_dir.exists():
                    shutil.rmtree(self.temp_dir)
                    logger.info("Cleaned up temporary directory: %s", self.temp_dir)
            except Exception as e:
                logger.warning("Failed to cleanup temp directory: %s", e)
            finally:
                # Always forget the path so a later call re-extracts
                # instead of reusing a removed directory.
                self.temp_dir = None

        # Clear PBIXRay reference
        self._pbixray = None

    @property
    def is_pbixray_available(self) -> bool:
        """Check if PBIXRay is being used."""
        return self._use_pbixray

    def read_report(self) -> Optional[Dict]:
        """
        Read and parse the report.json file (optional, for context).

        Falls back to the UTF-16 encoded ``Report/Layout`` file when
        ``Report/report.json`` is absent.

        Returns:
            Parsed JSON report data or None if not found
        """
        self._ensure_extracted()

        report_path = self.temp_dir / "Report" / "report.json"
        if report_path.exists():
            try:
                return json.loads(self._decode_json_bytes(report_path.read_bytes()))
            except Exception as e:
                logger.warning("Failed to read report.json: %s", e)
                return None

        # Try Layout file (UTF-16)
        layout_path = self.temp_dir / "Report" / "Layout"
        if layout_path.exists():
            try:
                text = self._decode_json_bytes(
                    layout_path.read_bytes(), default="utf-16-le"
                )
                return json.loads(text)
            except Exception as e:
                logger.warning("Failed to read Layout: %s", e)

        logger.warning("report.json not found in .pbix file")
        return None