perspective-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perspective/__init__.py +1 -0
- perspective/config.py +240 -0
- perspective/exceptions.py +15 -0
- perspective/ingest/dbt.py +150 -0
- perspective/ingest/ingest.py +164 -0
- perspective/ingest/postgres.py +388 -0
- perspective/ingest/sources/bi/powerbi/extract.py +184 -0
- perspective/ingest/sources/bi/powerbi/models.py +137 -0
- perspective/ingest/sources/bi/powerbi/pipeline.py +29 -0
- perspective/ingest/sources/bi/powerbi/transform.py +478 -0
- perspective/ingest/sources/bi/qlik_sense/extract.py +297 -0
- perspective/ingest/sources/bi/qlik_sense/models.py +22 -0
- perspective/ingest/sources/bi/qlik_sense/pipeline.py +19 -0
- perspective/ingest/sources/bi/qlik_sense/transform.py +76 -0
- perspective/ingest/sources/database/sap/extract.py +253 -0
- perspective/ingest/sources/database/sap/pipeline.py +23 -0
- perspective/ingest/sources/database/sap/transform.py +85 -0
- perspective/main.py +74 -0
- perspective/models/configs.py +422 -0
- perspective/models/dashboards.py +44 -0
- perspective/models/databases.py +26 -0
- perspective/utils/__init__.py +3 -0
- perspective/utils/options.py +77 -0
- perspective/utils/utils.py +274 -0
- perspective_cli-0.1.0.dist-info/METADATA +49 -0
- perspective_cli-0.1.0.dist-info/RECORD +29 -0
- perspective_cli-0.1.0.dist-info/WHEEL +5 -0
- perspective_cli-0.1.0.dist-info/entry_points.txt +2 -0
- perspective_cli-0.1.0.dist-info/top_level.txt +1 -0
perspective/ingest/sources/bi/powerbi/pipeline.py
@@ -0,0 +1,29 @@
+"""Power BI ingestion dlt pipeline."""
+
+from collections.abc import Generator
+
+from perspective.ingest.sources.bi.powerbi.extract import powerbi
+from perspective.ingest.sources.bi.powerbi.models import DataflowDetails, WorkspaceInfo
+from perspective.ingest.sources.bi.powerbi.transform import transform
+from perspective.models.dashboards import DashboardManifest
+
+
+def pipeline() -> Generator[DashboardManifest, None, None]:
+    """Power BI ingestion pipeline."""
+    source = powerbi()
+    lineage = source.workspaces_lineage
+    dataflows = source.dataflows_details
+    dashboard_metadata = WorkspaceInfo(**next(iter(lineage)))
+    dataflows_metadata = [DataflowDetails(**item) for item in dataflows]
+    yield transform(dashboard_metadata, dataflows_metadata)
+
+
+if __name__ == "__main__":
+    from pathlib import Path
+
+    manifest = next(iter(pipeline()))
+
+    Path("powerbi_extracted.json").write_text(
+        manifest.model_dump_json(by_alias=True), encoding="utf-8"
+    )
+    # print(manifest)
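Note: `pipeline()` is a plain generator that yields a single `DashboardManifest`. Below is a minimal sketch of how it could be loaded with dlt; this consumer is not part of the package, the pipeline name, destination and table name are illustrative assumptions, and the import only resolves if perspective-cli is installed.

import dlt

from perspective.ingest.sources.bi.powerbi.pipeline import pipeline

# Hypothetical loader: materialize each manifest into a local DuckDB dataset.
load_pipeline = dlt.pipeline(
    pipeline_name="powerbi_dashboards",  # assumed name
    destination="duckdb",  # assumed destination
    dataset_name="dashboards",  # assumed dataset name
)
load_info = load_pipeline.run(
    (manifest.model_dump(by_alias=True) for manifest in pipeline()),
    table_name="dashboard_manifest",  # assumed table name
)
print(load_info)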
perspective/ingest/sources/bi/powerbi/transform.py
@@ -0,0 +1,478 @@
+"""Transform Power BI metadata into DashboardManifest format."""
+
+import json
+import logging
+from pathlib import Path
+import re
+
+from perspective.ingest.sources.bi.powerbi.models import DataflowDetails, WorkspaceInfo
+from perspective.models.dashboards import (
+    Dashboard,
+    DashboardManifest,
+    DashboardSchemaMetadata,
+    DataModel,
+)
+
+
+logger = logging.getLogger("dlt")
+
+
+def transform(
+    workspace_info: WorkspaceInfo, dataflows_info: list[DataflowDetails]
+) -> DashboardManifest:
+    """Transform Power BI metadata into DashboardManifest format."""
+    tables = extract_tables(workspace_info, dataflows_info)
+    reports = extract_reports(workspace_info, tables=tables)
+
+    return DashboardManifest(
+        metadata=DashboardSchemaMetadata(schema="dashboard", version=1),
+        payload=reports,
+    )
+
+
+def extract_tables(
+    workspace_info: WorkspaceInfo, dataflows_info: list[DataflowDetails]
+) -> list[dict]:
+    """Extract dataset tables and their underlying database tables."""
+    tables = []
+    # Each dataset table can only have one database table as a source.
+    for workspace in workspace_info.workspaces:
+        for dataset in workspace.datasets:
+            for dataset_table in dataset.tables:
+                if dataset_table.source is None:
+                    continue
+
+                # Extract the underlying database table.
+                source_expression = dataset_table.source[0].expression
+
+                # There are two ways PowerBI can reference Dataflows:
+                # 1) The table expression directly walks the PowerPlatform.Dataflows
+                #    tree and contains the dataflowId.
+                # 2) The table expression references a named dataset source which in
+                #    turn references a dataflow entity. Those expressions do NOT
+                #    contain the literal 'PowerPlatform.Dataflows' but do contain
+                #    an entity reference like: Source{[entity="co_customer"]}[Data]
+                # Detect both cases and use the dataflow extractor when appropriate.
+                if (
+                    "PowerPlatform.Dataflows" in source_expression
+                    or 'entity="' in source_expression
+                ):
+                    table_database_table = _extract_dataflow_table_from_expression(
+                        source_expression, dataflows_info
+                    )
+                else:
+                    table_database_table = _extract_table_from_expression(
+                        source_expression
+                    )
+
+                if not table_database_table:
+                    continue
+
+                database_table_name = table_database_table["name"]
+                tables.append({
+                    "dataset_id": dataset.id,
+                    "dataset_table_name": dataset_table.name,
+                    "database_table_name": database_table_name,
+                    "database_table_schema": table_database_table.get("schema"),
+                    "database_table_database": table_database_table.get("database"),
+                    "columns": [
+                        {"name": column.name, "data_type": column.dataType}
+                        for column in dataset_table.columns
+                    ],
+                    "tags": table_database_table.get("tags", []),
+                })
+    return tables
+
+
+# def extract_dataflows(dataset) -> None:
+#     """Extract dataset-level Dataflows."""
+
+
+def extract_reports(
+    workspace_info: WorkspaceInfo, tables: list[dict]
+) -> list[Dashboard]:
+    """Extract reports from workspace info."""
+    reports = []
+    for workspace in workspace_info.workspaces:
+        for report in workspace.reports:
+            # We're not interested in PowerBI Apps. Not sure why they're included
+            # - either way, the original report the app is based on is already included
+            # in the response.
+            if report.name.startswith("[App]"):
+                continue
+
+            report_filtered = {}
+            report_id = report.id
+            report_filtered["external_id"] = report_id
+            report_filtered["url"] = (
+                "https://app.powerbi.com/groups/"
+                + (workspace.id or "") + "/reports/" + report_id
+            )
+            report_filtered["type"] = "powerbi"
+            report_filtered["name"] = report.name
+            report_filtered["workspace"] = workspace.name
+            report_filtered["created_at"] = report.createdDateTime
+            report_filtered["modified_at"] = report.modifiedDateTime
+            report_filtered["owners"] = [
+                {
+                    "user_id": user.graphId,
+                    "username": user.identifier,
+                    "name": user.displayName,
+                }
+                for user in report.users
+                if user.reportUserAccessRight == "Owner"
+            ]
+
+            report_tables = [
+                {
+                    "name": table["database_table_name"],
+                    "schema": table["database_table_schema"],
+                    "database": table["database_table_database"],
+                    "columns": table["columns"],
+                    "tags": table["tags"],
+                }
+                for table in tables
+                if table["dataset_id"] == report.datasetId
+            ]
+            report_filtered["parent_models"] = report_tables
+
+            reports.append(report_filtered)
+
+    return reports
+
+
+def _extract_table_from_expression(expression: str) -> DataModel | None:
+    """Extract schema and table name from expression."""
+    # Check if this is a NativeQuery - we don't extract from those
+    if "Value.NativeQuery" in expression:
+        return None
+
+    # Get database name.
+    database_name_expr = re.search(
+        r'AmazonRedshift\.Database\s*\(\s*".*?"\s*,\s*"([^" ]+)"\s*\)',
+        expression,
+        re.IGNORECASE,
+    )
+    if not database_name_expr:
+        return None
+    database_name = database_name_expr.group(1).strip()
+
+    # Find source variable assigned to AmazonRedshift.Database
+    source_pattern = r"(\w+)\s*=\s*AmazonRedshift\.Database\s*\([^)]+\)"
+    source_match = re.search(source_pattern, expression, re.IGNORECASE)
+
+    if not source_match:
+        return None
+
+    source_var_name = source_match.group(1).strip()
+
+    # Get schema names.
+    schema_pattern = (
+        rf'(\w+)\s*=\s*{re.escape(source_var_name)}\s*{{\[Name="([^"]+)"\]}}\[Data\]'
+    )
+    schema_match = re.findall(schema_pattern, expression, re.IGNORECASE)
+
+    if not schema_match:
+        return None
+
+    schema_var_name = schema_match[0][0].strip()  # The variable name given by user.
+    schema_name = schema_match[0][1].strip()  # The actual database schema name.
+
+    # Get table metadata.
+    table_pattern = (
+        r"\w+\s*=\s*" + re.escape(schema_var_name) + r'{\[Name="([^" ]+)"\]}\[Data\]'
+    )
+    table_match = re.findall(table_pattern, expression, re.IGNORECASE)
+
+    if not table_match:
+        return None
+
+    return {
+        "name": table_match[0].strip(),
+        "schema": schema_name.strip(),
+        "database": database_name,
+        "tags": [{"source_system": "AmazonRedshift:Direct"}],
+    }
+
+
+def _normalize_shared_name(raw: str) -> str:
+    """Strip M quoting, e.g. #"Name" or "Name" -> Name."""
+    raw = raw.strip()
+    raw = raw.rstrip(",")
+    if raw.startswith('#"') and raw.endswith('"'):
+        return raw[2:-1]
+    if raw.startswith('"') and raw.endswith('"'):
+        return raw[1:-1]
+    return raw
+
+
+def _extract_entity_block(  # noqa: C901, PLR0911
+    entity: str, document: str, _seen: set | None = None
+) -> str | None:
+    """Extract the `let ... in ...;` block for a shared entity.
+
+    Handles names like: shared MyEntity, shared "My Entity", shared #"01_Events". Also
+    resolves simple 'Source = SomeOtherShared' references by inlining the referenced
+    shared block.
+    """
+    if _seen is None:
+        _seen = set()
+    # Avoid infinite recursion.
+    if entity in _seen:
+        return None
+    _seen.add(entity)
+
+    # Match shared <name> variants: simple, quoted "name", or #"<name>".
+    pattern = rf'shared\s+(?:#"?{re.escape(entity)}"?|"{re.escape(entity)}"|{re.escape(entity)})\s*=\s*let\b(.*?)\bin\s+[^\n;]+;'
+    m = re.search(pattern, document, re.DOTALL | re.IGNORECASE)
+    if not m:
+        return None
+    block = m.group(1)
+
+    # Find a 'Source = <rhs>' assignment in the block (first occurrence).
+    # Capture whole rhs up to end-of-line (allow commas inside parentheses like
+    # Table.Combine({a,b})).
+    m_src = re.search(r"(?m)^\s*Source\s*=\s*(?P<rhs>[^\n]+)", block)
+    if not m_src:
+        return block
+
+    rhs = m_src.group("rhs").strip()
+
+    # If rhs looks like a direct data source, nothing to resolve.
+    if re.search(
+        r"\b(AmazonRedshift\.Database|Value\.NativeQuery|PowerPlatform\.Dataflows|Csv\.Contents|Sql\.Database)\b",
+        rhs,
+    ):
+        return block
+
+    # Handle cases where the Source is a Table.Combine(...) of several shared tables
+    # e.g. Source = Table.Combine({CurrentYear, OneYear, TwoYears}).
+    m_tc = re.search(r"Table\.Combine\s*\(\s*\{(?P<items>[^\}]+)\}\s*\)", rhs)
+    if m_tc:
+        items = m_tc.group("items")
+        # Split on commas and normalize each referenced shared name.
+        parts = [p.strip() for p in items.split(",") if p.strip()]
+        parent_blocks = []
+        for p in parts:
+            ref = _normalize_shared_name(p)
+            if not ref or ref.lower() == entity.lower():
+                continue
+            pb = _extract_entity_block(ref, document, _seen=_seen)
+            if pb:
+                parent_blocks.append(pb.rstrip())
+
+        if parent_blocks:
+            # Remove the Source = Table.Combine(...) line from current block.
+            block_without_source = re.sub(
+                r"(?m)^\s*Source\s*=\s*Table\.Combine\s*\([^\n]+\),?\s*\n?",
+                "",
+                block,
+                count=1,
+            )
+            # Prepend all parent blocks so Database(...) will be visible downstream.
+            return "\n".join(parent_blocks) + "\n" + block_without_source.lstrip()
+
+    # Normalize referenced name (remove #, quotes).
+    referenced = _normalize_shared_name(rhs)
+    # If it references itself or is empty, bail out.
+    if not referenced or referenced.lower() == entity.lower():
+        return block
+
+    # Try to extract referenced shared block and inline/merge.
+    parent_block = _extract_entity_block(referenced, document, _seen=_seen)
+    if not parent_block:
+        return block
+
+    # Remove the Source = <ref> line from current block to avoid duplicate Source lines.
+    block_without_source = re.sub(
+        r"(?m)^\s*Source\s*=\s*[^\n,;]+,\s*\n?", "", block, count=1
+    )
+
+    # Prepend the parent's let-body so the resulting block contains the real Source
+    # definition.
+    return parent_block.rstrip() + "\n" + block_without_source.lstrip()
+
+
+def _get_entity_columns(entity_name: str, dataflow: DataflowDetails) -> list[str]:
+    for entity in dataflow.entities:
+        if entity["name"] == entity_name:
+            return entity["attributes"]
+    return []
+
+
+def _extract_model_from_dataflow_entity_block(
+    entity_block: str,
+) -> dict[str, str | list[dict]] | None:
+    """Extract database, schema and table from an entity block.
+
+    Strategy:
+    - Find the database name from AmazonRedshift.Database(..., "dbname", ...)
+    - Find all assignments of the form: <lhs> = <rhs>{[Name = "X"]}[Data]
+      (lhs/rhs may be quoted like #"Navigation 1").
+    - Prefer the first assignment whose RHS equals the variable assigned to
+      the Database(...) call as the schema step (e.g. #"Navigation 1" = Source{[Name="schema"]}[Data]).
+    - Use the last assignment's Name as the table (most downstream step).
+    """
+    db_match = re.search(
+        r'AmazonRedshift\.Database\(\s*"[^"]*"\s*,\s*"([^\"]+)"',
+        entity_block,
+        re.IGNORECASE,
+    )
+    if not db_match:
+        return None
+    database_name = db_match.group(1).strip()
+
+    # Detect variable assigned to the Database call (usually 'Source').
+    src_var_match = re.search(
+        r"(\w+)\s*=\s*AmazonRedshift\.Database\s*\(", entity_block, re.IGNORECASE
+    )
+    source_var_name = src_var_match.group(1).strip() if src_var_match else "Source"
+
+    # Find all assignments like: <lhs> = <rhs>{[Name = "X"]}[Data].
+    assign_pattern = r'(\#?"?[\w\d\s\-\_]+"?)\s*=\s*(\#?"?[\w\d\s\-\_]+"?)\s*\{\s*\[\s*Name\s*=\s*"([^\"]+)"\s*\]\s*\}\s*\[Data\]'
+    assigns = re.findall(assign_pattern, entity_block, re.IGNORECASE)
+    if not assigns:
+        return None
+
+    def normalize(n: str) -> str:
+        n = n.strip().rstrip(",")
+        if n.startswith('#"') and n.endswith('"'):
+            return n[2:-1]
+        if n.startswith('"') and n.endswith('"'):
+            return n[1:-1]
+        return n
+
+    normalized = [
+        (normalize(lhs), normalize(rhs), name.strip()) for lhs, rhs, name in assigns
+    ]
+
+    # Pick schema: first assignment whose RHS is the DB variable (Source), else first
+    # assignment.
+    schema_name = None
+    for _lhs, rhs, name in normalized:
+        if rhs.lower() == source_var_name.lower():
+            schema_name = name
+            break
+    if not schema_name:
+        schema_name = normalized[0][2]
+
+    # Pick table: last assignment's name (downstream step).
+    table_name = normalized[-1][2]
+
+    if not (table_name and schema_name and database_name):
+        logger.debug("Failed to extract model from entity block.")
+        logger.debug(entity_block)
+        return None
+
+    return {
+        "name": table_name,
+        "schema": schema_name,
+        "database": database_name,
+        "tags": [{"source_system": "AmazonRedshift:Dataflow"}],
+    }
+
+
+def _extract_dataflow_table_from_expression(  # noqa: C901, PLR0912
+    expression: str, dataflows_info: list[DataflowDetails]
+) -> dict[str, str | list[dict]] | None:
+    """Extract schema and table name from dataflow expression.
+
+    We first extract the dataflow ID and the name of the entity within the dataflow.
+
+    Next, we parse the dataflow's source code ("document" field in the dataflow's
+    section in dataflows_info) to extract the name of the source database table for the
+    entity.
+    """
+    # 1) Try to extract dataflowId directly from the table expression (most common case)
+    dataflow_id_pattern = r'dataflowId="([a-f0-9\-]+)"'
+    dataflow_id_match = re.search(dataflow_id_pattern, expression, re.IGNORECASE)
+
+    dataflow_id = None
+    dataflow_obj = None
+
+    if dataflow_id_match:
+        dataflow_id = dataflow_id_match.group(1).strip()
+
+    # 2) Try to extract entity name from the table expression
+    entity_pattern = r'{\[entity="([^"]+)"'
+    entity_match = re.search(entity_pattern, expression, re.IGNORECASE)
+    if not entity_match:
+        return None
+    entity_name = entity_match.group(1).strip()
+
+    # If we have a dataflow_id, find that dataflow. Otherwise try to resolve via
+    # a named source referenced by this table expression.
+    if dataflow_id:
+        for dataflow in dataflows_info:
+            if dataflow.id == dataflow_id:
+                dataflow_obj = dataflow
+                break
+    else:
+        # Look for a source reference in the table expression, e.g.
+        # 'Source = SourceNewStrategicDim'
+        src_ref_match = re.search(
+            r'(?m)^\s*Source\s*=\s*(?P<ref>[A-Za-z0-9_#"@]+)', expression
+        )
+        src_ref = None
+        if src_ref_match:
+            src_ref = src_ref_match.group("ref").strip()
+            # Normalize the name.
+            if src_ref.startswith('"') and src_ref.endswith('"'):
+                src_ref = src_ref[1:-1]
+            if src_ref.startswith('#"') and src_ref.endswith('"'):
+                src_ref = src_ref[2:-1]
+
+    if not dataflow_obj:
+        # As a last resort, try to find a dataflow that contains an entity with
+        # the given name (some datasets reference a named source which in turn
+        # points to a dataflow entity; if we don't have the dataset expressions
+        # available we can still try matching by entity name across known
+        # dataflows).
+        for dataflow in dataflows_info:
+            for ent in getattr(dataflow, "entities", []) or []:
+                ent_name = (
+                    ent.get("name")
+                    if isinstance(ent, dict)
+                    else getattr(ent, "name", None)
+                )
+                if ent_name and ent_name == entity_name:
+                    dataflow_obj = dataflow
+                    break
+            if dataflow_obj:
+                break
+    if not dataflow_obj:
+        return None
+
+    # Retrieve the dataflow's M code document.
+    document = dataflow_obj.pbi_mashup.document
+
+    # Extract the code block containing the entity's definition.
+    entity_block = _extract_entity_block(entity_name, document)
+
+    if not entity_block:
+        return None
+
+    # Extract the source table from the code block.
+    model = _extract_model_from_dataflow_entity_block(entity_block)
+
+    if not model:
+        msg = f"Could not extract model from dataflow entity block for entity '{entity_name}'."
+        logger.warning(msg)
+
+    return model
+
+
+if __name__ == "__main__":
+    with Path("powerbi_workspace_info.json").open(encoding="utf-8") as f:
+        workspace_info = WorkspaceInfo(**next(iter(json.load(f))))
+
+    with Path("powerbi_dataflows_info.json").open(encoding="utf-8") as f:
+        dataflows_info = [DataflowDetails(**item) for item in json.load(f)]
+
+    dashboard_manifest = transform(workspace_info, dataflows_info)
+
+    Path("powerbi_extracted.json").write_text(
+        dashboard_manifest.model_dump_json(by_alias=True), encoding="utf-8"
+    )
+
+    # # print(json.dumps(tables, indent=4))
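For readers tracing the regexes in `_extract_table_from_expression`, here is a small sketch of the expression shape it targets. The M snippet and every name in it (cluster endpoint, `analytics` database, `reporting` schema, `fact_orders` table) are made-up examples; the import assumes perspective-cli is installed, and `_extract_table_from_expression` is a private helper shown purely for illustration.

from perspective.ingest.sources.bi.powerbi.transform import _extract_table_from_expression

# Hypothetical Power BI M expression for a dataset table read directly from Redshift:
# AmazonRedshift.Database(...) -> schema navigation step -> table navigation step.
expression = '''
let
    Source = AmazonRedshift.Database("cluster.example.com:5439", "analytics"),
    Navigation = Source{[Name="reporting"]}[Data],
    fact_orders = Navigation{[Name="fact_orders"]}[Data]
in
    fact_orders
'''

print(_extract_table_from_expression(expression))
# Expected result, given the patterns in the hunk above:
# {'name': 'fact_orders', 'schema': 'reporting', 'database': 'analytics',
#  'tags': [{'source_system': 'AmazonRedshift:Direct'}]}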