perspective-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
+"""Power BI ingestion dlt pipeline."""
+
+from collections.abc import Generator
+
+from perspective.ingest.sources.bi.powerbi.extract import powerbi
+from perspective.ingest.sources.bi.powerbi.models import DataflowDetails, WorkspaceInfo
+from perspective.ingest.sources.bi.powerbi.transform import transform
+from perspective.models.dashboards import DashboardManifest
+
+
+def pipeline() -> Generator[DashboardManifest, None, None]:
+    """Power BI ingestion pipeline."""
+    source = powerbi()
+    lineage = source.workspaces_lineage
+    dataflows = source.dataflows_details
+    dashboard_metadata = WorkspaceInfo(**next(iter(lineage)))
+    dataflows_metadata = [DataflowDetails(**item) for item in dataflows]
+    yield transform(dashboard_metadata, dataflows_metadata)
+
+
+if __name__ == "__main__":
+    from pathlib import Path
+
+    manifest = next(iter(pipeline()))
+
+    Path("powerbi_extracted.json").write_text(
+        manifest.model_dump_json(by_alias=True), encoding="utf-8"
+    )
+    # print(manifest)
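+
+    # Illustrative sketch only (not part of this module): a dlt run would
+    # typically consume this resource along these lines; the pipeline name,
+    # destination, and table name below are assumptions, not values defined
+    # by this package.
+    #
+    # import dlt
+    #
+    # dlt_pipeline = dlt.pipeline(
+    #     pipeline_name="powerbi_ingest", destination="duckdb", dataset_name="powerbi"
+    # )
+    # dlt_pipeline.run(pipeline(), table_name="dashboard_manifest")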
@@ -0,0 +1,478 @@
+"""Transform Power BI metadata into DashboardManifest format."""
+
+import json
+import logging
+from pathlib import Path
+import re
+
+from perspective.ingest.sources.bi.powerbi.models import DataflowDetails, WorkspaceInfo
+from perspective.models.dashboards import (
+    Dashboard,
+    DashboardManifest,
+    DashboardSchemaMetadata,
+    DataModel,
+)
+
+
+logger = logging.getLogger("dlt")
+
+
+def transform(
+    workspace_info: WorkspaceInfo, dataflows_info: list[DataflowDetails]
+) -> DashboardManifest:
+    """Transform Power BI metadata into DashboardManifest format."""
+    tables = extract_tables(workspace_info, dataflows_info)
+    reports = extract_reports(workspace_info, tables=tables)
+
+    return DashboardManifest(
+        metadata=DashboardSchemaMetadata(schema="dashboard", version=1),
+        payload=reports,
+    )
+
+
+def extract_tables(
+    workspace_info: WorkspaceInfo, dataflows_info: list[DataflowDetails]
+) -> list[dict]:
+    """Extract dataset tables and their underlying database tables."""
+    tables = []
+    # Each dataset table can only have one database table as a source.
+    for workspace in workspace_info.workspaces:
+        for dataset in workspace.datasets:
+            for dataset_table in dataset.tables:
+                if dataset_table.source is None:
+                    continue
+
+                # Extract the underlying database table.
+                source_expression = dataset_table.source[0].expression
+
+                # There are two ways PowerBI can reference Dataflows:
+                # 1) The table expression directly walks the PowerPlatform.Dataflows
+                #    tree and contains the dataflowId.
+                # 2) The table expression references a named dataset source which in
+                #    turn references a dataflow entity. Those expressions do NOT
+                #    contain the literal 'PowerPlatform.Dataflows' but do contain
+                #    an entity reference like: Source{[entity="co_customer"]}[Data]
+                # Detect both cases and use the dataflow extractor when appropriate.
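+                # Illustrative, simplified sketches of the two shapes (names are
+                # assumptions):
+                #   1) Source = PowerPlatform.Dataflows(null), ...,
+                #      Dataflow = Workspace{[dataflowId="<guid>"]}[Data],
+                #      Entity = Dataflow{[entity="co_customer"]}[Data]
+                #   2) Entity = Source{[entity="co_customer"]}[Data]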
+                if (
+                    "PowerPlatform.Dataflows" in source_expression
+                    or 'entity="' in source_expression
+                ):
+                    table_database_table = _extract_dataflow_table_from_expression(
+                        source_expression, dataflows_info
+                    )
+                else:
+                    table_database_table = _extract_table_from_expression(
+                        source_expression
+                    )
+
+                if not table_database_table:
+                    continue
+
+                database_table_name = table_database_table["name"]
+                tables.append({
+                    "dataset_id": dataset.id,
+                    "dataset_table_name": dataset_table.name,
+                    "database_table_name": database_table_name,
+                    "database_table_schema": table_database_table.get("schema"),
+                    "database_table_database": table_database_table.get("database"),
+                    "columns": [
+                        {"name": column.name, "data_type": column.dataType}
+                        for column in dataset_table.columns
+                    ],
+                    "tags": table_database_table.get("tags", []),
+                })
+    return tables
+
+
+# def extract_dataflows(dataset) -> None:
+#     """Extract dataset-level Dataflows."""
+
+
+def extract_reports(
+    workspace_info: WorkspaceInfo, tables: list[dict]
+) -> list[Dashboard]:
+    """Extract reports from workspace info."""
+    reports = []
+    for workspace in workspace_info.workspaces:
+        for report in workspace.reports:
+            # We're not interested in PowerBI Apps. Not sure why they're included
+            # - either way, the original report the app is based on is already included
+            # in the response.
+            if report.name.startswith("[App]"):
+                continue
+
+            report_filtered = {}
+            report_id = report.id
+            report_filtered["external_id"] = report_id
+ report_filtered["url"] = (
108
+ "https://app.powerbi.com/groups/" + workspace.id
109
+ or "" + "/reports/" + report_id
110
+ )
+            report_filtered["type"] = "powerbi"
+            report_filtered["name"] = report.name
+            report_filtered["workspace"] = workspace.name
+            report_filtered["created_at"] = report.createdDateTime
+            report_filtered["modified_at"] = report.modifiedDateTime
+            report_filtered["owners"] = [
+                {
+                    "user_id": user.graphId,
+                    "username": user.identifier,
+                    "name": user.displayName,
+                }
+                for user in report.users
+                if user.reportUserAccessRight == "Owner"
+            ]
+
+            report_tables = [
+                {
+                    "name": table["database_table_name"],
+                    "schema": table["database_table_schema"],
+                    "database": table["database_table_database"],
+                    "columns": table["columns"],
+                    "tags": table["tags"],
+                }
+                for table in tables
+                if table["dataset_id"] == report.datasetId
+            ]
+            report_filtered["parent_models"] = report_tables
+
+            reports.append(report_filtered)
+
+    return reports
+
+
+def _extract_table_from_expression(expression: str) -> DataModel | None:
+    """Extract schema and table name from expression."""
+    # Check if this is a NativeQuery - we don't extract from those
+    if "Value.NativeQuery" in expression:
+        return None
+
+    # Get database name.
+    database_name_expr = re.search(
+        r'AmazonRedshift\.Database\s*\(\s*".*?"\s*,\s*"([^" ]+)"\s*\)',
+        expression,
+        re.IGNORECASE,
+    )
+    if not database_name_expr:
+        return None
+    database_name = database_name_expr.group(1).strip()
+
+    # Find source variable assigned to AmazonRedshift.Database
+    source_pattern = r"(\w+)\s*=\s*AmazonRedshift\.Database\s*\([^)]+\)"
+    source_match = re.search(source_pattern, expression, re.IGNORECASE)
+
+    if not source_match:
+        return None
+
+    source_var_name = source_match.group(1).strip()
+
+    # Get schema names.
+    schema_pattern = (
+        rf'(\w+)\s*=\s*{re.escape(source_var_name)}\s*{{\[Name="([^"]+)"\]}}\[Data\]'
+    )
+    schema_match = re.findall(schema_pattern, expression, re.IGNORECASE)
+
+    if not schema_match:
+        return None
+
+    schema_var_name = schema_match[0][0].strip()  # The variable name given by user.
+    schema_name = schema_match[0][1].strip()  # The actual database schema name.
+
+    # Get table metadata.
+    table_pattern = (
+        r"\w+\s*=\s*" + re.escape(schema_var_name) + r'{\[Name="([^" ]+)"\]}\[Data\]'
+    )
+    table_match = re.findall(table_pattern, expression, re.IGNORECASE)
+
+    if not table_match:
+        return None
+
+    return {
+        "name": table_match[0].strip(),
+        "schema": schema_name.strip(),
+        "database": database_name,
+        "tags": [{"source_system": "AmazonRedshift:Direct"}],
+    }
+
+
+def _normalize_shared_name(raw: str) -> str:
+    """Strip M quoting, e.g. #"Name" or "Name" -> Name."""
+    raw = raw.strip()
+    raw = raw.rstrip(",")
+    if raw.startswith('#"') and raw.endswith('"'):
+        return raw[2:-1]
+    if raw.startswith('"') and raw.endswith('"'):
+        return raw[1:-1]
+    return raw
+
+
+def _extract_entity_block(  # noqa: C901, PLR0911
+    entity: str, document: str, _seen: set | None = None
+) -> str | None:
+    """Extract the `let ... in ...;` block for a shared entity.
+
+    Handles names like: shared MyEntity, shared "My Entity", shared #"01_Events". Also
+    resolves simple 'Source = SomeOtherShared' references by inlining the referenced
+    shared block.
+    """
+    if _seen is None:
+        _seen = set()
+    # Avoid infinite recursion.
+    if entity in _seen:
+        return None
+    _seen.add(entity)
+
+    # Match shared <name> variants: simple, quoted "name", or #"<name>".
+    pattern = rf'shared\s+(?:#"?{re.escape(entity)}"?|"{re.escape(entity)}"|{re.escape(entity)})\s*=\s*let\b(.*?)\bin\s+[^\n;]+;'
+    m = re.search(pattern, document, re.DOTALL | re.IGNORECASE)
+    if not m:
+        return None
+    block = m.group(1)
+
+    # Find a 'Source = <rhs>' assignment in the block (first occurrence).
+    # Capture whole rhs up to end-of-line (allow commas inside parentheses like
+    # Table.Combine({a,b})).
+    m_src = re.search(r"(?m)^\s*Source\s*=\s*(?P<rhs>[^\n]+)", block)
+    if not m_src:
+        return block
+
+    rhs = m_src.group("rhs").strip()
+
+    # If rhs looks like a direct data source, nothing to resolve.
+    if re.search(
+        r"\b(AmazonRedshift\.Database|Value\.NativeQuery|PowerPlatform\.Dataflows|Csv\.Contents|Sql\.Database)\b",
+        rhs,
+    ):
+        return block
+
+    # Handle cases where the Source is a Table.Combine(...) of several shared tables
+    # e.g. Source = Table.Combine({CurrentYear, OneYear, TwoYears}).
+    m_tc = re.search(r"Table\.Combine\s*\(\s*\{(?P<items>[^\}]+)\}\s*\)", rhs)
+    if m_tc:
+        items = m_tc.group("items")
+        # Split on commas and normalize each referenced shared name.
+        parts = [p.strip() for p in items.split(",") if p.strip()]
+        parent_blocks = []
+        for p in parts:
+            ref = _normalize_shared_name(p)
+            if not ref or ref.lower() == entity.lower():
+                continue
+            pb = _extract_entity_block(ref, document, _seen=_seen)
+            if pb:
+                parent_blocks.append(pb.rstrip())
+
+        if parent_blocks:
+            # Remove the Source = Table.Combine(...) line from current block.
+            block_without_source = re.sub(
+                r"(?m)^\s*Source\s*=\s*Table\.Combine\s*\([^\n]+\),?\s*\n?",
+                "",
+                block,
+                count=1,
+            )
+            # Prepend all parent blocks so Database(...) will be visible downstream.
+            return "\n".join(parent_blocks) + "\n" + block_without_source.lstrip()
+
+    # Normalize referenced name (remove #, quotes).
+    referenced = _normalize_shared_name(rhs)
+    # If it references itself or is empty, bail out.
+    if not referenced or referenced.lower() == entity.lower():
+        return block
+
+    # Try to extract referenced shared block and inline/merge.
+    parent_block = _extract_entity_block(referenced, document, _seen=_seen)
+    if not parent_block:
+        return block
+
+    # Remove the Source = <ref> line from current block to avoid duplicate Source lines.
+    block_without_source = re.sub(
+        r"(?m)^\s*Source\s*=\s*[^\n,;]+,\s*\n?", "", block, count=1
+    )
+
+    # Prepend the parent's let-body so the resulting block contains the real Source
+    # definition.
+    return parent_block.rstrip() + "\n" + block_without_source.lstrip()
+
+
+def _get_entity_columns(entity_name: str, dataflow: DataflowDetails) -> list[str]:
+    """Return the attribute (column) names for a named entity in a dataflow."""
+    for entity in dataflow.entities:
+        if entity["name"] == entity_name:
+            return entity["attributes"]
+    return []
+
+
+def _extract_model_from_dataflow_entity_block(
+    entity_block: str,
+) -> dict[str, str | list[dict]] | None:
+    """Extract database, schema and table from an entity block.
+
+    Strategy:
+    - Find the database name from AmazonRedshift.Database(..., "dbname", ...)
+    - Find all assignments of the form: <lhs> = <rhs>{[Name = "X"]}[Data]
+      (lhs/rhs may be quoted like #"Navigation 1").
+    - Prefer the first assignment whose RHS equals the variable assigned to
+      the Database(...) call as the schema step (e.g. #"Navigation 1" = Source{[Name="schema"]}[Data]).
+    - Use the last assignment's Name as the table (most downstream step).
+    """
+    db_match = re.search(
+        r'AmazonRedshift\.Database\(\s*"[^"]*"\s*,\s*"([^\"]+)"',
+        entity_block,
+        re.IGNORECASE,
+    )
+    if not db_match:
+        return None
+    database_name = db_match.group(1).strip()
+
+    # Detect variable assigned to the Database call (usually 'Source').
+    src_var_match = re.search(
+        r"(\w+)\s*=\s*AmazonRedshift\.Database\s*\(", entity_block, re.IGNORECASE
+    )
+    source_var_name = src_var_match.group(1).strip() if src_var_match else "Source"
+
+    # Find all assignments like: <lhs> = <rhs>{[Name = "X"]}[Data].
+    assign_pattern = r'(\#?"?[\w\d\s\-\_]+"?)\s*=\s*(\#?"?[\w\d\s\-\_]+"?)\s*\{\s*\[\s*Name\s*=\s*"([^\"]+)"\s*\]\s*\}\s*\[Data\]'
+    assigns = re.findall(assign_pattern, entity_block, re.IGNORECASE)
+    if not assigns:
+        return None
+
+    def normalize(n: str) -> str:
+        n = n.strip().rstrip(",")
+        if n.startswith('#"') and n.endswith('"'):
+            return n[2:-1]
+        if n.startswith('"') and n.endswith('"'):
+            return n[1:-1]
+        return n
+
+    normalized = [
+        (normalize(lhs), normalize(rhs), name.strip()) for lhs, rhs, name in assigns
+    ]
+
+    # Pick schema: first assignment whose RHS is the DB variable (Source), else first
+    # assignment.
+    schema_name = None
+    for _lhs, rhs, name in normalized:
+        if rhs.lower() == source_var_name.lower():
+            schema_name = name
+            break
+    if not schema_name:
+        schema_name = normalized[0][2]
+
+    # Pick table: last assignment's name (downstream step).
+    table_name = normalized[-1][2]
+
+    if not (table_name and schema_name and database_name):
+        logger.debug("Failed to extract model from entity block.")
+        logger.debug(entity_block)
+        return None
+
+    return {
+        "name": table_name,
+        "schema": schema_name,
+        "database": database_name,
+        "tags": [{"source_system": "AmazonRedshift:Dataflow"}],
+    }
+
+
+def _extract_dataflow_table_from_expression(  # noqa: C901, PLR0912
+    expression: str, dataflows_info: list[DataflowDetails]
+) -> dict[str, str | list[dict]] | None:
+    """Extract schema and table name from dataflow expression.
+
+    We first extract the dataflow ID and the name of the entity within the dataflow.
+
+    Next, we parse the dataflow's source code ("document" field in the dataflow's
+    section in dataflows_info) to extract the name of the source database table for the
+    entity.
+    """
+    # 1) Try to extract dataflowId directly from the table expression (most common case)
+    dataflow_id_pattern = r'dataflowId="([a-f0-9\-]+)"'
+    dataflow_id_match = re.search(dataflow_id_pattern, expression, re.IGNORECASE)
+
+    dataflow_id = None
+    dataflow_obj = None
+
+    if dataflow_id_match:
+        dataflow_id = dataflow_id_match.group(1).strip()
+
+    # 2) Try to extract entity name from the table expression
+    entity_pattern = r'{\[entity="([^"]+)"'
+    entity_match = re.search(entity_pattern, expression, re.IGNORECASE)
+    if not entity_match:
+        return None
+    entity_name = entity_match.group(1).strip()
+
+    # If we have a dataflow_id, find that dataflow. Otherwise try to resolve via
+    # a named source referenced by this table expression.
+    if dataflow_id:
+        for dataflow in dataflows_info:
+            if dataflow.id == dataflow_id:
+                dataflow_obj = dataflow
+                break
+    else:
+        # Look for a source reference in the table expression, e.g.
+        # 'Source = SourceNewStrategicDim'
+        src_ref_match = re.search(
+            r'(?m)^\s*Source\s*=\s*(?P<ref>[A-Za-z0-9_#"@]+)', expression
+        )
+        src_ref = None
+        if src_ref_match:
+            src_ref = src_ref_match.group("ref").strip()
+            # Normalize the name.
+            if src_ref.startswith('"') and src_ref.endswith('"'):
+                src_ref = src_ref[1:-1]
+            if src_ref.startswith('#"') and src_ref.endswith('"'):
+                src_ref = src_ref[2:-1]
+
+    if not dataflow_obj:
+        # As a last resort, try to find a dataflow that contains an entity with
+        # the given name (some datasets reference a named source which in turn
+        # points to a dataflow entity; if we don't have the dataset expressions
+        # available we can still try matching by entity name across known
+        # dataflows).
+        for dataflow in dataflows_info:
+            for ent in getattr(dataflow, "entities", []) or []:
+                ent_name = (
+                    ent.get("name")
+                    if isinstance(ent, dict)
+                    else getattr(ent, "name", None)
+                )
+                if ent_name and ent_name == entity_name:
+                    dataflow_obj = dataflow
+                    break
+            if dataflow_obj:
+                break
+        if not dataflow_obj:
+            return None
+
+    # Retrieve the dataflow's M code document.
+    document = dataflow_obj.pbi_mashup.document
+
+    # Extract the code block containing the entity's definition.
+    entity_block = _extract_entity_block(entity_name, document)
+
+    if not entity_block:
+        return None
+
+    # Extract the source table from the code block.
+    model = _extract_model_from_dataflow_entity_block(entity_block)
+
+    if not model:
+        msg = f"Could not extract model from dataflow entity block for entity '{entity_name}'."
+        logger.warning(msg)
+
+    return model
+
+
+if __name__ == "__main__":
+    with Path("powerbi_workspace_info.json").open(encoding="utf-8") as f:
+        workspace_info = WorkspaceInfo(**next(iter(json.load(f))))
+
+    with Path("powerbi_dataflows_info.json").open(encoding="utf-8") as f:
+        dataflows_info = [DataflowDetails(**item) for item in json.load(f)]
+
+    dashboard_manifest = transform(workspace_info, dataflows_info)
+
+    Path("powerbi_extracted.json").write_text(
+        dashboard_manifest.model_dump_json(by_alias=True), encoding="utf-8"
+    )
+
+    # # print(json.dumps(tables, indent=4))