semantic-link-labs 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of semantic-link-labs might be problematic.
Files changed (54)
  1. {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.6.0.dist-info}/METADATA +2 -2
  2. semantic_link_labs-0.6.0.dist-info/RECORD +54 -0
  3. {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.6.0.dist-info}/WHEEL +1 -1
  4. sempy_labs/__init__.py +19 -13
  5. sempy_labs/_ai.py +43 -24
  6. sempy_labs/_clear_cache.py +4 -5
  7. sempy_labs/_connections.py +77 -70
  8. sempy_labs/_dax.py +7 -9
  9. sempy_labs/_generate_semantic_model.py +55 -44
  10. sempy_labs/_helper_functions.py +13 -6
  11. sempy_labs/_icons.py +14 -0
  12. sempy_labs/_list_functions.py +491 -304
  13. sempy_labs/_model_auto_build.py +4 -3
  14. sempy_labs/_model_bpa.py +131 -1118
  15. sempy_labs/_model_bpa_rules.py +831 -0
  16. sempy_labs/_model_dependencies.py +14 -12
  17. sempy_labs/_one_lake_integration.py +11 -5
  18. sempy_labs/_query_scale_out.py +89 -81
  19. sempy_labs/_refresh_semantic_model.py +16 -10
  20. sempy_labs/_translations.py +213 -287
  21. sempy_labs/_vertipaq.py +53 -37
  22. sempy_labs/directlake/__init__.py +2 -0
  23. sempy_labs/directlake/_directlake_schema_compare.py +12 -5
  24. sempy_labs/directlake/_directlake_schema_sync.py +13 -19
  25. sempy_labs/directlake/_fallback.py +5 -3
  26. sempy_labs/directlake/_get_directlake_lakehouse.py +1 -1
  27. sempy_labs/directlake/_get_shared_expression.py +4 -2
  28. sempy_labs/directlake/_guardrails.py +3 -3
  29. sempy_labs/directlake/_list_directlake_model_calc_tables.py +17 -10
  30. sempy_labs/directlake/_show_unsupported_directlake_objects.py +3 -2
  31. sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +10 -5
  32. sempy_labs/directlake/_update_directlake_partition_entity.py +132 -9
  33. sempy_labs/directlake/_warm_cache.py +6 -3
  34. sempy_labs/lakehouse/_get_lakehouse_columns.py +1 -1
  35. sempy_labs/lakehouse/_get_lakehouse_tables.py +5 -3
  36. sempy_labs/lakehouse/_lakehouse.py +2 -1
  37. sempy_labs/lakehouse/_shortcuts.py +19 -12
  38. sempy_labs/migration/__init__.py +1 -1
  39. sempy_labs/migration/_create_pqt_file.py +21 -15
  40. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +16 -13
  41. sempy_labs/migration/_migrate_calctables_to_semantic_model.py +17 -18
  42. sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +43 -40
  43. sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +14 -14
  44. sempy_labs/migration/_migration_validation.py +2 -2
  45. sempy_labs/migration/_refresh_calc_tables.py +8 -5
  46. sempy_labs/report/__init__.py +2 -2
  47. sempy_labs/report/_generate_report.py +10 -5
  48. sempy_labs/report/_report_functions.py +67 -29
  49. sempy_labs/report/_report_rebind.py +9 -8
  50. sempy_labs/tom/__init__.py +1 -4
  51. sempy_labs/tom/_model.py +555 -152
  52. semantic_link_labs-0.5.0.dist-info/RECORD +0 -53
  53. {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.6.0.dist-info}/LICENSE +0 -0
  54. {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.6.0.dist-info}/top_level.txt +0 -0
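
The headline change in 0.6.0 is the Best Practice Analyzer refactor: the rule definitions moved out of sempy_labs/_model_bpa.py into the new sempy_labs/_model_bpa_rules.py module (see the full diff below), run_model_bpa renamed its rules_dataframe parameter to rules, and a new extended flag was added. A minimal usage sketch of the 0.6.0 signature follows; the model name "Sales" is a placeholder, and the top-level import assumes run_model_bpa is re-exported from the package root as in prior releases:

    import sempy_labs as labs  # assumes run_model_bpa is re-exported at the package root

    # 0.5.0 call style (parameter renamed in 0.6.0):
    #   labs.run_model_bpa(dataset="Sales", rules_dataframe=my_rules)
    bpa_results = labs.run_model_bpa(
        dataset="Sales",        # placeholder semantic model name
        workspace=None,         # None resolves to the notebook's workspace
        extended=True,          # new in 0.6.0: runs set_vertipaq_annotations() first
        return_dataframe=True,  # return a pandas DataFrame instead of the HTML view
    )
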
sempy_labs/_model_bpa.py CHANGED
@@ -1,712 +1,32 @@
- import sempy
  import sempy.fabric as fabric
  import pandas as pd
- import re, unicodedata, warnings, datetime
- import numpy as np
+ import warnings
+ import datetime
  from IPython.display import display, HTML
  from pyspark.sql import SparkSession
- from sempy_labs._model_dependencies import get_measure_dependencies
- from sempy_labs._helper_functions import format_dax_object_name, resolve_lakehouse_name
+ from sempy_labs._model_dependencies import get_model_calc_dependencies
+ from sempy_labs._helper_functions import (
+     format_dax_object_name,
+     resolve_lakehouse_name,
+     create_relationship_name,
+ )
  from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
  from sempy_labs.lakehouse._lakehouse import lakehouse_attached
- from typing import List, Optional, Union
+ from sempy_labs.tom import connect_semantic_model
+ from sempy_labs._model_bpa_rules import model_bpa_rules
+ from typing import Optional
  from sempy._utils._log import log
  import sempy_labs._icons as icons

- def model_bpa_rules():
-     """
-     Shows the default rules for the semantic model BPA used by the run_model_bpa function.
-
-     Parameters
-     ----------
-
-
-     Returns
-     -------
-     pandas.DataFrame
-         A pandas dataframe containing the default rules for the run_model_bpa function.
-     """
-
-     df_rules = pd.DataFrame(
-         [
-             (
-                 "Performance",
-                 "Column",
-                 "Warning",
-                 "Do not use floating point data types",
-                 lambda df: df["Data Type"] == "Double",
-                 'The "Double" floating point data type should be avoided, as it can result in unpredictable roundoff errors and decreased performance in certain scenarios. Use "Int64" or "Decimal" where appropriate (but note that "Decimal" is limited to 4 digits after the decimal sign).',
-             ),
-             (
-                 "Performance",
-                 "Column",
-                 "Warning",
-                 "Avoid using calculated columns",
-                 lambda df: df["Type"] == "Calculated",
-                 "Calculated columns do not compress as well as data columns so they take up more memory. They also slow down processing times for both the table as well as process recalc. Offload calculated column logic to your data warehouse and turn these calculated columns into data columns.",
-                 "https://www.elegantbi.com/post/top10bestpractices",
-             ),
-             (
-                 "Performance",
-                 "Relationship",
-                 "Warning",
-                 "Check if bi-directional and many-to-many relationships are valid",
-                 lambda df: (df["Multiplicity"] == "m:m")
-                 | (df["Cross Filtering Behavior"] == "BothDirections"),
-                 "Bi-directional and many-to-many relationships may cause performance degradation or even have unintended consequences. Make sure to check these specific relationships to ensure they are working as designed and are actually necessary.",
-                 "https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax",
-             ),
-             (
-                 "Performance",
-                 "Row Level Security",
-                 "Info",
-                 "Check if dynamic row level security (RLS) is necessary",
-                 lambda df: df["Is Dynamic"],
-                 "Usage of dynamic row level security (RLS) can add memory and performance overhead. Please research the pros/cons of using it.",
-                 "https://docs.microsoft.com/power-bi/admin/service-admin-rls",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Avoid using many-to-many relationships on tables used for dynamic row level security",
-                 lambda df: (df["Used in M2M Relationship"] == True)
-                 & (df["Used in Dynamic RLS"] == True),
-                 "Using many-to-many relationships on tables which use dynamic row level security can cause serious query performance degradation. This pattern's performance problems compound when snowflaking multiple many-to-many relationships against a table which contains row level security. Instead, use one of the patterns shown in the article below where a single dimension table relates many-to-one to a security table.",
-                 "https://www.elegantbi.com/post/dynamicrlspatterns",
-             ),
-             (
-                 "Performance",
-                 "Relationship",
-                 "Warning",
-                 "Many-to-many relationships should be single-direction",
-                 lambda df: (df["Multiplicity"] == "m:m")
-                 & (df["Cross Filtering Behavior"] == "BothDirections"),
-             ),
-             (
-                 "Performance",
-                 "Column",
-                 "Warning",
-                 "Set IsAvailableInMdx to false on non-attribute columns",
-                 lambda df: (df["Is Direct Lake"] == False)
-                 & (df["Is Available in MDX"] == True)
-                 & ((df["Hidden"] == True) | (df["Parent Is Hidden"] == True))
-                 & (df["Used in Sort By"] == False)
-                 & (df["Used in Hierarchy"] == False)
-                 & (df["Sort By Column"] == None),
-                 "To speed up processing time and conserve memory after processing, attribute hierarchies should not be built for columns that are never used for slicing by MDX clients. In other words, all hidden columns that are not used as a Sort By Column or referenced in user hierarchies should have their IsAvailableInMdx property set to false. The IsAvailableInMdx property is not relevant for Direct Lake models.",
-                 "https://blog.crossjoin.co.uk/2018/07/02/isavailableinmdx-ssas-tabular",
-             ),
-             # ('Performance', 'Partition', 'Warning', "Set 'Data Coverage Definition' property on the DirectQuery partition of a hybrid table",
-             #  lambda df: (df['Data Coverage Definition Expression'].isnull()) & (df['Mode'] == 'DirectQuery') & (df['Import Partitions'] > 0) & (df['Has Date Table']),
-             #  "Setting the 'Data Coverage Definition' property may lead to better performance because the engine knows when it can only query the import-portion of the table and when it needs to query the DirectQuery portion of the table.",
-             #  "https://learn.microsoft.com/analysis-services/tom/table-partitions?view=asallproducts-allversions",
-             # ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Set dimensions tables to dual mode instead of import when using DirectQuery on fact tables",
-                 lambda df: (df["Import Partitions"] == 1)
-                 & (df["Model Has DQ"])
-                 & (df["Used in Relationship x:1"]),
-                 "https://learn.microsoft.com/power-bi/transform-model/desktop-storage-mode#propagation-of-the-dual-setting",
-             ),
-             (
-                 "Performance",
-                 "Partition",
-                 "Warning",
-                 "Minimize Power Query transformations",
-                 lambda df: (df["Source Type"] == "M")
-                 & (
-                     ('Table.Combine("' in df["Query"])
-                     | ('Table.Join("' in df["Query"])
-                     | ('Table.NestedJoin("' in df["Query"])
-                     | ('Table.AddColumn("' in df["Query"])
-                     | ('Table.Group("' in df["Query"])
-                     | ('Table.Sort("' in df["Query"])
-                     | ('Table.Sort("' in df["Query"])
-                     | ('Table.Pivot("' in df["Query"])
-                     | ('Table.Unpivot("' in df["Query"])
-                     | ('Table.UnpivotOtherColumns("' in df["Query"])
-                     | ('Table.Distinct("' in df["Query"])
-                     | ('[Query=(""SELECT' in df["Query"])
-                     | ("Value.NativeQuery" in df["Query"])
-                     | ("OleDb.Query" in df["Query"])
-                     | ("Odbc.Query" in df["Query"])
-                 ),
-                 "Minimize Power Query transformations in order to improve model processing performance. It is a best practice to offload these transformations to the data warehouse if possible. Also, please check whether query folding is occurring within your model. Please reference the article below for more information on query folding.",
-                 "https://docs.microsoft.com/power-query/power-query-folding",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Consider a star-schema instead of a snowflake architecture",
-                 lambda df: (df["Type"] != "Calculation Group")
-                 & df["Used in Relationship Both Sides"],
-                 "Generally speaking, a star-schema is the optimal architecture for tabular models. That being the case, there are valid cases to use a snowflake approach. Please check your model and consider moving to a star-schema architecture.",
-                 "https://docs.microsoft.com/power-bi/guidance/star-schema",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Reduce usage of calculated tables",
-                 lambda df: df["Type"] == "Calculated Table",
-                 "Migrate calculated table logic to your data warehouse. Reliance on calculated tables will lead to technical debt and potential misalignments if you have multiple models on your platform.",
-             ),
-             (
-                 "Performance",
-                 "Column",
-                 "Warning",
-                 "Reduce usage of calculated columns that use the RELATED function",
-                 lambda df: (df["Type"] == "Calculated")
-                 & (df["Source"].str.contains(r"related\s*\(", case=False)),
-                 "Calculated columns do not compress as well as data columns and may cause longer processing times. As such, calculated columns should be avoided if possible. One scenario where they may be easier to avoid is if they use the RELATED function.",
-                 "https://www.sqlbi.com/articles/storage-differences-between-calculated-columns-and-calculated-tables",
-             ),
-             (
-                 "Performance",
-                 "Model",
-                 "Warning",
-                 "Avoid excessive bi-directional or many-to-many relationships",
-                 lambda df: (
-                     df["M2M or BiDi Relationship Count"] / df["Relationship Count"]
-                 )
-                 > 0.3,
-                 "Limit use of b-di and many-to-many relationships. This rule flags the model if more than 30% of relationships are bi-di or many-to-many.",
-                 "https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax",
-             ),
-             (
-                 "Performance",
-                 "Column",
-                 "Warning",
-                 "Avoid bi-directional or many-to-many relationships against high-cardinality columns",
-                 lambda df: df["Used in M2M/BiDi Relationship"]
-                 & df["Column Cardinality"]
-                 > 100000,
-                 "For best performance, it is recommended to avoid using bi-directional relationships against high-cardinality columns",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Remove auto-date table",
-                 lambda df: (df["Type"] == "Calculated Table")
-                 & (
-                     (df["Name"].str.startswith("DateTableTemplate_"))
-                     | (df["Name"].str.startswith("LocalDateTable_"))
-                 ),
-                 "Avoid using auto-date tables. Make sure to turn off auto-date table in the settings in Power BI Desktop. This will save memory resources.",
-                 "https://www.youtube.com/watch?v=xu3uDEHtCrg",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Date/calendar tables should be marked as a date table",
-                 lambda df: (
-                     (df["Name"].str.contains(r"date", case=False))
-                     | (df["Name"].str.contains(r"calendar", case=False))
-                 )
-                 & (df["Data Category"] != "Time"),
-                 "This rule looks for tables that contain the words 'date' or 'calendar' as they should likely be marked as a date table.",
-                 "https://docs.microsoft.com/power-bi/transform-model/desktop-date-tables",
-             ),
-             (
-                 "Performance",
-                 "Table",
-                 "Warning",
-                 "Large tables should be partitioned",
-                 lambda df: (df["Is Direct Lake"] == False)
-                 & (df["Partition Count"] == 1)
-                 & (df["Row Count"] > 25000000),
-                 "Large tables should be partitioned in order to optimize processing. This is not relevant for semantic models in Direct Lake mode as they can only have one partition per table.",
-             ),
-             (
-                 "Performance",
-                 "Row Level Security",
-                 "Warning",
-                 "Limit row level security (RLS) logic",
-                 lambda df: df["Filter Expression"].str.contains(
-                     "|".join(["right", "left", "filter", "upper", "lower", "find"]),
-                     case=False,
-                 ),
-                 "Try to simplify the DAX used for row level security. Usage of the functions within this rule can likely be offloaded to the upstream systems (data warehouse).",
-             ),
-             (
-                 "Performance",
-                 "Model",
-                 "Warning",
-                 "Model should have a date table",
-                 lambda df: df["Has Date Table"],
-                 "Generally speaking, models should generally have a date table. Models that do not have a date table generally are not taking advantage of features such as time intelligence or may not have a properly structured architecture.",
-             ),
-             (
-                 "Performance",
-                 "Measure",
-                 "Warning",
-                 "Measures using time intelligence and model is using Direct Query",
-                 lambda df: df["DQ Date Function Used"],
-                 "At present, time intelligence functions are known to not perform as well when using Direct Query. If you are having performance issues, you may want to try alternative solutions such as adding columns in the fact table that show previous year or previous month data.",
-             ),
-             (
-                 "Error Prevention",
-                 "Calculation Item",
-                 "Error",
-                 "Calculation items must have an expression",
-                 lambda df: df["Expression"].str.len() == 0,
-                 "Calculation items must have an expression. Without an expression, they will not show any values.",
-             ),
-             (
-                 "Error Prevention",
-                 ["Table", "Column", "Measure", "Hierarchy", "Partition"],
-                 "Error",
-                 "Avoid invalid characters in names",
-                 lambda df: df["Name"].apply(
-                     lambda x: any(
-                         unicodedata.category(char) == "Cc" and not char.isspace()
-                         for char in x
-                     )
-                 ),
-                 "This rule identifies if a name for a given object in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). This rule has a fix expression which converts the invalid character into a space, resolving the issue.",
-             ),
-             (
-                 "Error Prevention",
-                 ["Table", "Column", "Measure", "Hierarchy"],
-                 "Error",
-                 "Avoid invalid characters in descriptions",
-                 lambda df: df["Description"].apply(
-                     lambda x: any(
-                         unicodedata.category(char) == "Cc" and not char.isspace()
-                         for char in x
-                     )
-                 ),
-                 "This rule identifies if a description for a given object in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). This rule has a fix expression which converts the invalid character into a space, resolving the issue.",
-             ),
-             (
-                 "Error Prevention",
-                 "Relationship",
-                 "Warning",
-                 "Relationship columns should be of the same data type",
-                 lambda df: df["From Column Data Type"] != df["To Column Data Type"],
-                 "Columns used in a relationship should be of the same data type. Ideally, they will be of integer data type (see the related rule '[Formatting] Relationship columns should be of integer data type'). Having columns within a relationship which are of different data types may lead to various issues.",
-             ),
-             (
-                 "Error Prevention",
-                 "Column",
-                 "Error",
-                 "Data columns must have a source column",
-                 lambda df: (df["Type"] == "Data") & (df["Source"].str.len() == 0),
-                 "Data columns must have a source column. A data column without a source column will cause an error when processing the model.",
-             ),
-             (
-                 "Error Prevention",
-                 "Column",
-                 "Warning",
-                 "Set IsAvailableInMdx to true on necessary columns",
-                 lambda df: (df["Is Direct Lake"] == False)
-                 & (df["Is Available in MDX"] == False)
-                 & (
-                     (df["Used in Sort By"] == True)
-                     | (df["Used in Hierarchy"] == True)
-                     | (df["Sort By Column"] != None)
-                 ),
-                 "In order to avoid errors, ensure that attribute hierarchies are enabled if a column is used for sorting another column, used in a hierarchy, used in variations, or is sorted by another column. The IsAvailableInMdx property is not relevant for Direct Lake models.",
-             ),
-             (
-                 "Error Prevention",
-                 "Table",
-                 "Error",
-                 "Avoid the USERELATIONSHIP function and RLS against the same table",
-                 lambda df: (df["USERELATIONSHIP Used"] == True)
-                 & (df["Used in RLS"] == True),
-                 "The USERELATIONSHIP function may not be used against a table which also leverages row-level security (RLS). This will generate an error when using the particular measure in a visual. This rule will highlight the table which is used in a measure's USERELATIONSHIP function as well as RLS.",
-                 "https://blog.crossjoin.co.uk/2013/05/10/userelationship-and-tabular-row-security",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Avoid using the IFERROR function",
-                 lambda df: df["Measure Expression"].str.contains(
-                     r"irerror\s*\(", case=False
-                 ),
-                 "Avoid using the IFERROR function as it may cause performance degradation. If you are concerned about a divide-by-zero error, use the DIVIDE function as it naturally resolves such errors as blank (or you can customize what should be shown in case of such an error).",
-                 "https://www.elegantbi.com/post/top10bestpractices",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Use the TREATAS function instead of INTERSECT for virtual relationships",
-                 lambda df: df["Measure Expression"].str.contains(
-                     r"intersect\s*\(", case=False
-                 ),
-                 "The TREATAS function is more efficient and provides better performance than the INTERSECT function when used in virutal relationships.",
-                 "https://www.sqlbi.com/articles/propagate-filters-using-treatas-in-dax",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "The EVALUATEANDLOG function should not be used in production models",
-                 lambda df: df["Measure Expression"].str.contains(
-                     r"evaluateandlog\s*\(", case=False
-                 ),
-                 "The EVALUATEANDLOG function is meant to be used only in development/test environments and should not be used in production models.",
-                 "https://pbidax.wordpress.com/2022/08/16/introduce-the-dax-evaluateandlog-function",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Measures should not be direct references of other measures",
-                 lambda df: df["Measure Expression"]
-                 .str.strip()
-                 .isin(df["Measure Object"]),
-                 "This rule identifies measures which are simply a reference to another measure. As an example, consider a model with two measures: [MeasureA] and [MeasureB]. This rule would be triggered for MeasureB if MeasureB's DAX was MeasureB:=[MeasureA]. Such duplicative measures should be removed.",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "No two measures should have the same definition",
-                 lambda df: df["Measure Expression"]
-                 .apply(lambda x: re.sub(r"\s+", "", x))
-                 .duplicated(keep=False),
-                 "Two measures with different names and defined by the same DAX expression should be avoided to reduce redundancy.",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Avoid addition or subtraction of constant values to results of divisions",
-                 lambda df: df["Measure Expression"].str.contains(
-                     "(?i)DIVIDE\\s*\\((\\s*.*?)\\)\\s*[+-]\\s*1"
-                     or "\\/\\s*.*(?=[-+]\\s*1)",
-                     regex=True,
-                 ),
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Avoid using '1-(x/y)' syntax",
-                 lambda df: df["Measure Expression"].str.contains(
-                     "[0-9]+\\s*[-+]\\s*[\\(]*\\s*(?i)SUM\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*\\[[A-Za-z0-9 _]+\\]\\s*\\)\\s*\\/"
-                     or "[0-9]+\\s*[-+]\\s*(?i)DIVIDE\\s*\\(",
-                     regex=True,
-                 ),
-                 "Instead of using the '1-(x/y)' or '1+(x/y)' syntax to achieve a percentage calculation, use the basic DAX functions (as shown below). Using the improved syntax will generally improve the performance. The '1+/-...' syntax always returns a value whereas the solution without the '1+/-...' does not (as the value may be 'blank'). Therefore the '1+/-...' syntax may return more rows/columns which may result in a slower query speed. Let's clarify with an example: Avoid this: 1 - SUM ( 'Sales'[CostAmount] ) / SUM( 'Sales'[SalesAmount] ) Better: DIVIDE ( SUM ( 'Sales'[SalesAmount] ) - SUM ( 'Sales'[CostAmount] ), SUM ( 'Sales'[SalesAmount] ) ) Best: VAR x = SUM ( 'Sales'[SalesAmount] ) RETURN DIVIDE ( x - SUM ( 'Sales'[CostAmount] ), x )",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Filter measure values by columns, not tables",
-                 lambda df: df["Measure Expression"].str.contains(
-                     "(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[[^\\]]+\\]"
-                     or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[",
-                     regex=True,
-                 ),
-                 "Instead of using this pattern FILTER('Table',[Measure]>Value) for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below (if possible). Filtering on a specific column will produce a smaller table for the engine to process, thereby enabling faster performance. Using the VALUES function or the ALL function depends on the desired measure result.\nOption 1: FILTER(VALUES('Table'[Column]),[Measure] > Value)\nOption 2: FILTER(ALL('Table'[Column]),[Measure] > Value)",
-                 "https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Filter column values with proper syntax",
-                 lambda df: df["Measure Expression"].str.contains(
-                     "(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]"
-                     or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]",
-                     regex=True,
-                 ),
-                 "Instead of using this pattern FILTER('Table','Table'[Column]=\"Value\") for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below. As far as whether to use the KEEPFILTERS function, see the second reference link below.\nOption 1: KEEPFILTERS('Table'[Column]=\"Value\")\nOption 2: 'Table'[Column]=\"Value\"",
-                 "https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument Reference: https://www.sqlbi.com/articles/using-keepfilters-in-dax",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Warning",
-                 "Use the DIVIDE function for division",
-                 lambda df: df["Measure Expression"].str.contains(
-                     '\\]\\s*\\/(?!\\/)(?!\\*)" or "\\)\\s*\\/(?!\\/)(?!\\*)', regex=True
-                 ),
-                 'Use the DIVIDE function instead of using "/". The DIVIDE function resolves divide-by-zero cases. As such, it is recommended to use to avoid errors.',
-                 "https://docs.microsoft.com/power-bi/guidance/dax-divide-function-operator",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Error",
-                 "Column references should be fully qualified",
-                 lambda df: df["Has Unqualified Column Reference"],
-                 "Using fully qualified column references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a column in DAX, first specify the table name, then specify the column name in square brackets.",
-                 "https://www.elegantbi.com/post/top10bestpractices",
-             ),
-             (
-                 "DAX Expressions",
-                 "Measure",
-                 "Error",
-                 "Measure references should be unqualified",
-                 lambda df: df["Has Fully Qualified Measure Reference"],
-                 "Using unqualified measure references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a measure using DAX, do not specify the table name. Use only the measure name in square brackets.",
-                 "https://www.elegantbi.com/post/top10bestpractices",
-             ),
-             (
-                 "DAX Expressions",
-                 "Relationship",
-                 "Warning",
-                 "Inactive relationships that are never activated",
-                 lambda df: df["Inactive without USERELATIONSHIP"],
-                 "Inactive relationships are activated using the USERELATIONSHIP function. If an inactive relationship is not referenced in any measure via this function, the relationship will not be used. It should be determined whether the relationship is not necessary or to activate the relationship via this method.",
-                 "https://dax.guide/userelationship",
-             ),
-             (
-                 "Maintenance",
-                 "Column",
-                 "Warning",
-                 "Remove unnecessary columns",
-                 lambda df: (df["Hidden"] | df["Parent Is Hidden"])
-                 & ~df["Used in Relationship"]
-                 & ~df["Used in Sort By"]
-                 & ~df["Used in Hierarchy"]
-                 & (df["Referenced By"] == 0)
-                 & ~(df["Used in RLS"]),  # usedInOLS
-                 "Hidden columns that are not referenced by any DAX expressions, relationships, hierarchy levels or Sort By-properties should be removed.",
-             ),
-             (
-                 "Maintenance",
-                 "Measure",
-                 "Warning",
-                 "Remove unnecessary measures",
-                 lambda df: df["Measure Hidden"] & (df["Referenced By"] == 0),
-                 "Hidden measures that are not referenced by any DAX expressions should be removed for maintainability.",
-             ),
-             # ('Maintenance', 'Role', 'Warning', 'Remove roles with no members',
-             #  lambda df: df['Member Count'] == 0,
-             # ),
-             (
-                 "Maintenance",
-                 "Table",
-                 "Warning",
-                 "Ensure tables have relationships",
-                 lambda df: (df["Used in Relationship"] == False)
-                 & (df["Type"] != "Calculation Group"),
-                 "This rule highlights tables which are not connected to any other table in the model with a relationship.",
-             ),
-             (
-                 "Maintenance",
-                 "Table",
-                 "Warning",
-                 "Calculation groups with no calculation items",
-                 lambda df: (df["Type"] == "Calculation Group")
-                 & (df["Has Calculation Items"]),
-             ),
-             (
-                 "Maintenance",
-                 "Column",
-                 "Info",
-                 "Visible objects with no description",
-                 lambda df: (df["Hidden"] == False) & (df["Description"].str.len() == 0),
-                 "Calculation groups have no function unless they have calculation items.",
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Warning",
-                 "Provide format string for 'Date' columns",
-                 lambda df: (df["Column Name"].str.contains(r"date", case=False))
-                 & (df["Data Type"] == "DateTime")
-                 & (df["Format String"] != "mm/dd/yyyy"),
-                 'Columns of type "DateTime" that have "Month" in their names should be formatted as "mm/dd/yyyy".',
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Warning",
-                 "Do not summarize numeric columns",
-                 lambda df: (
-                     (df["Data Type"] == "Int64")
-                     | (df["Data Type"] == "Decimal")
-                     | (df["Data Type"] == "Double")
-                 )
-                 & (df["Summarize By"] != "None")
-                 & ~((df["Hidden"]) | (df["Parent Is Hidden"])),
-                 'Numeric columns (integer, decimal, double) should have their SummarizeBy property set to "None" to avoid accidental summation in Power BI (create measures instead).',
-             ),
-             (
-                 "Formatting",
-                 "Measure",
-                 "Info",
-                 "Provide format string for measures",
-                 lambda df: ~((df["Measure Hidden"]) | (df["Parent Is Hidden"]))
-                 & (df["Format String"].str.len() == 0),
-                 "Visible measures should have their format string property assigned.",
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Info",
-                 "Add data category for columns",
-                 lambda df: (df["Data Category"] == "")
-                 & (
-                     (
-                         (
-                             (df["Column Name"].str.contains(r"country", case=False))
-                             | (df["Column Name"].str.contains(r"city", case=False))
-                             | (df["Column Name"].str.contains(r"continent", case=False))
-                         )
-                         & (df["Data Type"] == "String")
-                     )
-                     | (
-                         (
-                             (df["Column Name"].str.contains(r"latitude", case=False))
-                             | (df["Column Name"].str.contains(r"longitude", case=False))
-                         )
-                         & (df["Data Type"] == "String")
-                     )
-                 ),
-                 "Add Data Category property for appropriate columns.",
-                 "https://docs.microsoft.com/power-bi/transform-model/desktop-data-categorization",
-             ),
-             (
-                 "Formatting",
-                 "Measure",
-                 "Warning",
-                 "Percentages should be formatted with thousands separators and 1 decimal",
-                 lambda df: (df["Format String"].str.contains("%"))
-                 & (df["Format String"] != "#,0.0%;-#,0.0%;#,0.0%"),
-             ),
-             (
-                 "Formatting",
-                 "Measure",
-                 "Warning",
-                 "Whole numbers should be formatted with thousands separators and no decimals",
-                 lambda df: (~df["Format String"].str.contains("$"))
-                 & ~(df["Format String"].str.contains("%"))
-                 & ~((df["Format String"] == "#,0") | (df["Format String"] == "#,0.0")),
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Info",
-                 "Hide foreign keys",
-                 lambda df: (df["Foreign Key"]) & (df["Hidden"] == False),
-                 "Foreign keys should always be hidden.",
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Info",
-                 "Mark primary keys",
-                 lambda df: (df["Primary Key"]) & (df["Key"] == False),
-                 "Set the 'Key' property to 'True' for primary key columns within the column properties.",
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Info",
-                 "Month (as a string) must be sorted",
-                 lambda df: (df["Column Name"].str.contains(r"month", case=False))
-                 & ~(df["Column Name"].str.contains(r"months", case=False))
-                 & (df["Data Type"] == "String")
-                 & (df["Sort By Column"] == ""),
-                 "This rule highlights month columns which are strings and are not sorted. If left unsorted, they will sort alphabetically (i.e. April, August...). Make sure to sort such columns so that they sort properly (January, February, March...).",
-             ),
-             (
-                 "Formatting",
-                 "Relationship",
-                 "Warning",
-                 "Relationship columns should be of integer data type",
-                 lambda df: (df["From Column Data Type"] != "Int64")
-                 | (df["To Column Data Type"] != "Int64"),
-                 "It is a best practice for relationship columns to be of integer data type. This applies not only to data warehousing but data modeling as well.",
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Warning",
-                 'Provide format string for "Month" columns',
-                 lambda df: (df["Column Name"].str.contains(r"month", case=False))
-                 & (df["Data Type"] == "DateTime")
-                 & (df["Format String"] != "MMMM yyyy"),
-                 'Columns of type "DateTime" that have "Month" in their names should be formatted as "MMMM yyyy".',
-             ),
-             (
-                 "Formatting",
-                 "Column",
-                 "Info",
-                 "Format flag columns as Yes/No value strings",
-                 lambda df: (
-                     df["Column Name"].str.startswith("Is")
-                     & (df["Data Type"] == "Int64")
-                     & ~(df["Hidden"] | df["Parent Is Hidden"])
-                 )
-                 | (
-                     df["Column Name"].str.endswith(" Flag")
-                     & (df["Data Type"] != "String")
-                     & ~(df["Hidden"] | df["Parent Is Hidden"])
-                 ),
-                 "Flags must be properly formatted as Yes/No as this is easier to read than using 0/1 integer values.",
-             ),
-             # ('Formatting', ['Table', 'Column', 'Measure', 'Partition', 'Hierarchy'], 'Error', 'Objects should not start or end with a space',
-             #  lambda df: (df['Name'].str[0] == ' ') | (df['Name'].str[-1] == ' '),
-             #  'Objects should not start or end with a space. This usually happens by accident and is difficult to find.',
-             # ),
-             (
-                 "Formatting",
-                 ["Table", "Column", "Measure", "Partition", "Hierarchy"],
-                 "Info",
-                 "First letter of objects must be capitalized",
-                 lambda df: df["Name"].str[0].str.upper() != df["Name"].str[0],
-                 "The first letter of object names should be capitalized to maintain professional quality.",
-             ),
-             (
-                 "Naming Conventions",
-                 ["Table", "Column", "Measure", "Partition", "Hierarchy"],
-                 "Warning",
-                 "Object names must not contain special characters",
-                 lambda df: df["Name"].str.contains(r"[\t\r\n]"),
-                 "Object names should not include tabs, line breaks, etc.",
-             ),  # ,
-             # ('Error Prevention', ['Table'], 'Error', 'Avoid invalid characters in names',
-             #  lambda df: df['Name'].str.char.iscontrol() & ~ df['Name'].str.char.isspace(),
-             # )#,
-         ],
-         columns=[
-             "Category",
-             "Scope",
-             "Severity",
-             "Rule Name",
-             "Expression",
-             "Description",
-             "URL",
-         ],
-     )
-
-     df_rules["Severity"] = (
-         df_rules["Severity"]
-         .replace("Warning", "⚠️")
-         .replace("Error", "\u274C")
-         .replace("Info", "ℹ️")
-     )
-
-     pd.set_option("display.max_colwidth", 1000)
-
-     return df_rules
-

  @log
  def run_model_bpa(
      dataset: str,
-     rules_dataframe: Optional[pd.DataFrame] = None,
+     rules: Optional[pd.DataFrame] = None,
      workspace: Optional[str] = None,
      export: Optional[bool] = False,
      return_dataframe: Optional[bool] = False,
+     extended: Optional[bool] = False,
      **kwargs,
  ):
      """
@@ -716,7 +36,7 @@ def run_model_bpa(
      ----------
      dataset : str
          Name of the semantic model.
-     rules_dataframe : pandas.DataFrame, default=None
+     rules : pandas.DataFrame, default=None
          A pandas dataframe containing rules to be evaluated.
      workspace : str, default=None
          The Fabric workspace name.
@@ -726,6 +46,8 @@
          If True, exports the resulting dataframe to a delta table in the lakehouse attached to the notebook.
      return_dataframe : bool, default=False
          If True, returns a pandas dataframe instead of the visualization.
+     extended : bool, default=False
+         If True, runs the set_vertipaq_annotations function to collect Vertipaq Analyzer statistics to be used in the analysis of the semantic model.

      Returns
      -------
@@ -746,443 +68,134 @@

      workspace = fabric.resolve_workspace_name(workspace)

-     if rules_dataframe is None:
-         rules_dataframe = model_bpa_rules()
+     if extended:
+         with connect_semantic_model(
+             dataset=dataset, workspace=workspace, readonly=False
+         ) as tom:
+             tom.set_vertipaq_annotations()

-     dfT = fabric.list_tables(dataset=dataset, workspace=workspace, extended=True)
-     dfT = dfT.drop_duplicates()
-     dfC = fabric.list_columns(
-         dataset=dataset,
-         workspace=workspace,
-         extended=True,
-         additional_xmla_properties=["Parent.DataCategory", "Parent.IsHidden"],
-     )
-     dfC = dfC[~dfC["Column Name"].str.startswith("RowNumber-")]
+     with connect_semantic_model(
+         dataset=dataset, workspace=workspace, readonly=True
+     ) as tom:

-     dfM = fabric.list_measures(
-         dataset=dataset,
-         workspace=workspace,
-         additional_xmla_properties=["Parent.IsHidden"],
-     )
-     dfR = fabric.list_relationships(
-         dataset=dataset,
-         workspace=workspace,
-         additional_xmla_properties=["FromCardinality", "ToCardinality"],
-     )
-     dfP = fabric.list_partitions(
-         dataset=dataset,
-         workspace=workspace,
-         additional_xmla_properties=["DataCoverageDefinition.Expression"],
-     )
-     dfH = fabric.list_hierarchies(dataset=dataset, workspace=workspace)
-     dfRole = fabric.get_roles(dataset=dataset, workspace=workspace)
-     dfRM = fabric.get_roles(dataset=dataset, workspace=workspace, include_members=True)
-     dfRLS = fabric.get_row_level_security_permissions(
-         dataset=dataset, workspace=workspace
-     )
-     # dfTr = fabric.list_translations(dataset = datasetName, workspace = workspaceName)
-     # dfE = fabric.list_expressions(dataset = datasetName, workspace = workspaceName)
-     dfCI = fabric.list_calculation_items(dataset=dataset, workspace=workspace)
-     # dfDS = fabric.list_datasources(dataset = datasetName, workspace = workspaceName)
-     # dfPersp = fabric.list_perspectives(dataset = datasetName, workspace = workspaceName)
-     dfD = fabric.list_datasets(mode="rest", workspace=workspace)
-     dfD = dfD[dfD["Dataset Name"] == dataset]
-     # datasetOwner = dfD['Configured By'].iloc[0]
-     md = get_measure_dependencies(dataset, workspace)
-     isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows())
-     dfC["Is Direct Lake"] = isDirectLake
-     dfT["Is Direct Lake"] = isDirectLake
+         dep = get_model_calc_dependencies(dataset=dataset, workspace=workspace)

-     cols = ["From Cardinality", "To Cardinality"]
-
-     for col in cols:
-         if col not in dfR:
-             dfR[col] = None
-
-     cols = ["Parent Is Hidden"]
-
-     for col in cols:
-         if col not in dfM:
-             dfM[col] = None
-
-     # Data Coverage Definition rule
-     dfP_imp = dfP[dfP["Mode"] == "Import"]
-     dfTP = dfP_imp.groupby("Table Name")["Partition Name"].count().reset_index()
-     dfTP.rename(columns={"Partition Name": "Import Partitions"}, inplace=True)
-     dfP = pd.merge(
-         dfP, dfTP[["Table Name", "Import Partitions"]], on="Table Name", how="left"
-     )
-     dfP["Import Partitions"].fillna(0, inplace=True)
-     dfC_DateKey = dfC[
-         (dfC["Parent Data Category"] == "Time")
-         & (dfC["Data Type"] == "DateTime")
-         & (dfC["Key"])
-     ]
-     hasDateTable = False
-
-     if len(dfC_DateKey) > 0:
-         hasDateTable = True
-
-     dfP["Has Date Table"] = hasDateTable
-
-     # Set dims to dual mode
-     dfR_one = dfR[dfR["To Cardinality"] == "One"]
-     dfTP = dfP_imp.groupby("Table Name")["Partition Name"].count().reset_index()
-     dfTP.rename(columns={"Partition Name": "Import Partitions"}, inplace=True)
-     dfT = pd.merge(dfT, dfTP, left_on="Name", right_on="Table Name", how="left")
-     dfT.drop(columns=["Table Name"], inplace=True)
-     dfT["Import Partitions"].fillna(0, inplace=True)
-     hasDQ = any(r["Mode"] == "DirectQuery" for i, r in dfP.iterrows())
-     dfT["Model Has DQ"] = hasDQ
-     dfT["Used in Relationship x:1"] = dfT["Name"].isin(dfR_one["To Table"])
-
-     dfF = fabric.evaluate_dax(
-         dataset=dataset,
-         workspace=workspace,
-         dax_string="""
-         SELECT [FUNCTION_NAME]
-         FROM $SYSTEM.MDSCHEMA_FUNCTIONS
-         WHERE [INTERFACE_NAME] = 'DATETIME'
-         """,
-     )
-
-     dfC["Name"] = dfC["Column Name"]
-     dfH["Name"] = dfH["Hierarchy Name"]
-     dfM["Name"] = dfM["Measure Name"]
-     dfP["Name"] = dfP["Partition Name"]
-     dfRole["Name"] = dfRole["Role"]
-     dfD["Name"] = dfD["Dataset Name"]
-     dfH["Description"] = dfH["Hierarchy Description"]
-     dfM["Description"] = dfM["Measure Description"]
-     dfH["Hierarchy Object"] = format_dax_object_name(
-         dfH["Table Name"], dfH["Hierarchy Name"]
-     )
-
-     dfCI["Calculation Object"] = format_dax_object_name(
-         dfCI["Calculation Group Name"], dfCI["Calculation Item Name"]
-     )
-
-     dfRole["Member Count"] = dfRM["Role"].isin(dfRole["Role"]).sum()
-     dfRLS["Is Dynamic"] = dfRLS["Filter Expression"].str.contains(
-         r"userprincipalname\s*\(", case=False
-     ) | dfRLS["Filter Expression"].str.contains(r"username\s*\(", case=False)
-
-     # Partition Count
-     partition_count = (
-         dfP.groupby("Table Name").size().reset_index(name="Partition Count")
-     )
-     dfT = pd.merge(
-         dfT, partition_count, left_on="Name", right_on="Table Name", how="left"
-     ).drop("Table Name", axis=1)
-     dfT["Partition Count"] = dfT["Partition Count"].fillna(0).astype(int)
-
-     dfT = dfT.merge(
-         dfP[["Table Name", "Partition Name"]],
-         how="left",
-         left_on="Name",
-         right_on="Table Name",
-     )
-     dfT["First Partition Name"] = dfT.groupby("Name")["Partition Name"].transform(
-         "first"
-     )
-     dfT.drop("Table Name", axis=1, inplace=True)
-
-     dfC["Sort By Column Object"] = format_dax_object_name(
-         dfC["Table Name"], dfC["Sort By Column"]
-     )
-     dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"])
-     dfM["Measure Object"] = "[" + dfM["Measure Name"] + "]"
-     dfM["Measure Fully Qualified"] = format_dax_object_name(
-         dfM["Table Name"], dfM["Measure Name"]
-     )
-     dfM["Measure Fully Qualified No Spaces"] = (
-         dfM["Table Name"] + "[" + dfM["Measure Name"] + "]"
-     )
-     # dfM['Measure Fully Qualified No Spaces'] = dfM.apply(lambda row: row['Table Name'] + '[' + row['Measure Name'] + ']' if ' ' not in row['Table Name'] else '', axis=1)
-     dfC["Column Unqualified"] = "[" + dfC["Column Name"] + "]"
-     dfC["Column Object No Spaces"] = dfC.apply(
-         lambda row: (
-             row["Table Name"] + "[" + row["Column Name"] + "]"
-             if " " not in row["Table Name"]
-             else ""
-         ),
-         axis=1,
-     )
-     dfC["Used in Sort By"] = dfC["Column Object"].isin(dfC["Sort By Column Object"])
-     dfH["Column Object"] = format_dax_object_name(dfH["Table Name"], dfH["Column Name"])
-     dfC["Used in Hierarchy"] = dfC["Column Object"].isin(dfH["Column Object"])
-     dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
-     dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])
-     dfT["Used in Relationship"] = dfT["Name"].isin(dfR["From Table"]) | dfT[
-         "Name"
-     ].isin(dfR["To Table"])
-     dfT["Used in Relationship Both Sides"] = dfT["Name"].isin(dfR["From Table"]) & dfT[
-         "Name"
-     ].isin(dfR["To Table"])
-     dfC["Used in Relationship"] = dfC["Column Object"].isin(dfR["From Object"]) | dfC[
-         "Column Object"
-     ].isin(dfR["To Object"])
-
-     dfR_filt = dfR[
-         (dfR["Cross Filtering Behavior"] == "BothDirections")
-         | (dfR["Multiplicity"] == "m:m")
-     ]
-     dfC["Used in M2M/BiDi Relationship"] = dfC["Column Object"].isin(
-         dfR_filt["From Object"]
-     ) | dfC["Column Object"].isin(dfR_filt["To Object"])
-     dfC["Foreign Key"] = dfC["Column Object"].isin(
-         dfR[dfR["From Cardinality"] == "Many"]["From Object"]
-     )
-     dfC["Primary Key"] = dfC["Column Object"].isin(
-         dfR[dfR["To Cardinality"] == "One"]["To Object"]
-     )
-     dfT["Used in M2M Relationship"] = dfT["Name"].isin(
-         dfR[dfR["Multiplicity"] == "m:m"][["From Table"]]
-     ) | dfT["Name"].isin(dfR[dfR["Multiplicity"] == "m:m"][["To Table"]])
-     dfT["Used in Dynamic RLS"] = dfT["Name"].isin(dfRLS[dfRLS["Is Dynamic"]]["Table"])
-     dfT["Used in RLS"] = dfT["Name"].isin(
-         dfRLS.loc[dfRLS["Filter Expression"].str.len() > 0, "Table"]
-     )
-     dfC["Primary Key"] = dfC["Column Object"].isin(
-         dfR.loc[dfR["To Cardinality"] == "One", "To Object"]
-     )
-     dfD["Has Date Table"] = any(
-         (r["Parent Data Category"] == "Time")
-         & (r["Data Type"] == "DateTime")
-         & (r["Key"] == True)
-         for i, r in dfC.iterrows()
-     )
-     # dfC['In Date Table'] = dfC['Table Name'].isin(dfT.loc[dfT['Data Category'] == "Time", 'Name'])
-     dfD["Relationship Count"] = len(dfR)
-     dfD["M2M or BiDi Relationship Count"] = len(
-         dfR[
-             (dfR["Multiplicity"] == "m:m")
-             | (dfR["Cross Filtering Behavior"] == "BothDirections")
-         ]
-     )
-     dfD["Calculation Group Count"] = len(dfT[dfT["Type"] == "Calculation Group"])
-     dfT["Has Calculation Items"] = np.where(
-         (dfT["Type"] == "Calculation Group")
-         & dfT["Name"].isin(dfCI["Calculation Group Name"]),
-         True,
-         False,
-     )
-     dfP["Partition Object"] = format_dax_object_name(
-         dfP["Table Name"], dfP["Partition Name"]
-     )
-     dfRLS["RLS Object"] = format_dax_object_name(dfRLS["Role"], dfRLS["Table"])
-
-     function_pattern = "|".join(dfF["FUNCTION_NAME"].map(re.escape))
-
-     dfM["DQ Date Function Used"] = any(dfP["Mode"] == "DirectQuery") & dfM[
-         "Measure Expression"
-     ].str.contains(f"({function_pattern})\\s*\\(", case=False, regex=True)
-
-     md["Reference"] = (
-         "'" + md["Referenced Table"] + "'[" + md["Referenced Object"] + "]"
-     )
-
-     dfC["Referenced By"] = (
-         md[
-             (md["Referenced Object Type"] == "Column")
-             & (md["Reference"].isin(dfC["Column Object"]))
-         ]
-         .groupby("Reference")
-         .size()
-         .reset_index(name="Count")["Count"]
-     )
-     dfC["Referenced By"].fillna(0, inplace=True)
-     dfC["Referenced By"] = dfC["Referenced By"].fillna(0).astype(int)
-
-     dfM["Referenced By"] = (
-         md[
-             (md["Referenced Object Type"] == "Measure")
-             & (md["Referenced Object"].isin(dfM["Measure Name"]))
-         ]
-         .groupby("Referenced Object")
-         .size()
-         .reset_index(name="Count")["Count"]
-     )
-     dfM["Referenced By"].fillna(0, inplace=True)
-     dfM["Referenced By"] = dfM["Referenced By"].fillna(0).astype(int)
-
-     pattern = r"[^\( ][a-zA-Z0-9_()-]+\[[^\[]+\]|'[^']+'\[[^\[]+\]|\[[^\[]+\]"
-
-     dfM["Has Fully Qualified Measure Reference"] = False
-     dfM["Has Unqualified Column Reference"] = False
-
-     for i, r in dfM.iterrows():
-         tName = r["Table Name"]
-         mName = r["Measure Name"]
-         expr = r["Measure Expression"]
-
-         matches = re.findall(pattern, expr)
-
-         for m in matches:
-             if m[0] == "[":
-                 if (m in dfC["Column Unqualified"].values) and (
-                     dfC[dfC["Table Name"] == tName]["Column Unqualified"] == m
-                 ).any():
-                     dfM.at[i, "Has Unqualified Column Reference"] = True
-             else:
-                 if (m in dfM["Measure Fully Qualified"].values) | (
-                     m in dfM["Measure Fully Qualified No Spaces"].values
-                 ):
-                     dfM.at[i, "Has Fully Qualified Measure Reference"] = True
-
-     dfR["Inactive without USERELATIONSHIP"] = False
-     for i, r in dfR[dfR["Active"] == False].iterrows():
-         fromTable = r["From Table"]
-         fromColumn = r["From Column"]
-         toTable = r["To Table"]
-         toColumn = r["To Column"]
-
-         dfM_filt = dfM[
-             dfM["Measure Expression"].str.contains(
-                 r"(?i)USERELATIONSHIP\s*\(\s*'*"
-                 + re.escape(fromTable)
-                 + r"'*\["
-                 + re.escape(fromColumn)
-                 + r"\]\s*,\s*'*"
-                 + re.escape(toTable)
-                 + r"'*\["
-                 + re.escape(toColumn)
-                 + r"\]",
-                 regex=True,
+         if rules is None:
+             rules = model_bpa_rules(
+                 dataset=dataset, workspace=workspace, dependencies=dep
              )
-         ]
-         if len(dfM_filt) == 0:
-             dfR.at[i, "Inactive without USERELATIONSHIP"] = True

-     dfC["Used in RLS"] = (
-         dfC["Column Object No Spaces"].isin(dfRLS["Filter Expression"])
-         | dfC["Column Object"].isin(dfRLS["Filter Expression"])
-         | dfC.apply(
-             lambda row: any(
-                 row["Column Name"] in expr
-                 for expr in dfRLS.loc[
-                     dfRLS["Table"] == row["Table Name"], "Filter Expression"
-                 ]
-             ),
-             axis=1,
-         )
-     )
+         rules["Severity"].replace("Warning", "⚠️", inplace=True)
+         rules["Severity"].replace("Error", "\u274C", inplace=True)
+         rules["Severity"].replace("Info", "ℹ️", inplace=True)

-     # Merge dfR and dfC based on 'From Object' and 'Column Object'
-     merged_from = pd.merge(
-         dfR, dfC, left_on="From Object", right_on="Column Object", how="left"
-     )
-     merged_to = pd.merge(
-         dfR, dfC, left_on="To Object", right_on="Column Object", how="left"
-     )
+         pd.set_option("display.max_colwidth", 1000)

-     dfR["From Column Data Type"] = merged_from["Data Type"]
-     dfR["To Column Data Type"] = merged_to["Data Type"]
+         violations = pd.DataFrame(columns=["Object Name", "Scope", "Rule Name"])

-     # Check if USERELATIONSHIP objects are used in a given column, table
-     userelationship_pattern = re.compile(
-         r"USERELATIONSHIP\s*\(\s*(.*?)\s*,\s*(.*?)\s*\)", re.DOTALL | re.IGNORECASE
-     )
+         scope_to_dataframe = {
+             "Relationship": (
+                 tom.model.Relationships,
+                 lambda obj: create_relationship_name(
+                     obj.FromTable.Name,
+                     obj.FromColumn.Name,
+                     obj.ToTable.Name,
+                     obj.ToColumn.Name,
+                 ),
+             ),
+             "Column": (
+                 tom.all_columns(),
+                 lambda obj: format_dax_object_name(obj.Parent.Name, obj.Name),
+             ),
+             "Measure": (tom.all_measures(), lambda obj: obj.Name),
+             "Hierarchy": (
+                 tom.all_hierarchies(),
+                 lambda obj: format_dax_object_name(obj.Parent.Name, obj.Name),
+             ),
+             "Table": (tom.model.Tables, lambda obj: obj.Name),
+             "Role": (tom.model.Roles, lambda obj: obj.Name),
+             "Model": (tom.model, lambda obj: obj.Model.Name),
+             "Calculation Item": (
+                 tom.all_calculation_items(),
+                 lambda obj: format_dax_object_name(obj.Parent.Table.Name, obj.Name),
+             ),
+             "Row Level Security": (
+                 tom.all_rls(),
+                 lambda obj: format_dax_object_name(obj.Parent.Name, obj.Name),
+             ),
+             "Partition": (
+                 tom.all_partitions(),
+                 lambda obj: format_dax_object_name(obj.Parent.Name, obj.Name),
+             ),
+         }

-     # Function to extract objects within USERELATIONSHIP function
-     def extract_objects(measure_expression):
-         matches = userelationship_pattern.findall(measure_expression)
-         if matches:
-             return [obj.strip() for match in matches for obj in match]
-         else:
-             return []
+         for i, r in rules.iterrows():
+             ruleName = r["Rule Name"]
+             expr = r["Expression"]
+             scopes = r["Scope"]
+
+             if isinstance(scopes, str):
+                 scopes = [scopes]
+
+             for scope in scopes:
+                 func = scope_to_dataframe[scope][0]
+                 nm = scope_to_dataframe[scope][1]
+
+                 if scope == "Model":
+                     x = []
+                     if expr(func):
+                         x = ["Model"]
+                 elif scope == "Measure":
+                     x = [nm(obj) for obj in tom.all_measures() if expr(obj)]
+                 elif scope == "Column":
+                     x = [nm(obj) for obj in tom.all_columns() if expr(obj)]
+                 elif scope == "Partition":
+                     x = [nm(obj) for obj in tom.all_partitions() if expr(obj)]
+                 elif scope == "Hierarchy":
+                     x = [nm(obj) for obj in tom.all_hierarchies() if expr(obj)]
+                 elif scope == "Table":
+                     x = [nm(obj) for obj in tom.model.Tables if expr(obj)]
+                 elif scope == "Relationship":
+                     x = [nm(obj) for obj in tom.model.Relationships if expr(obj)]
+                 elif scope == "Role":
+                     x = [nm(obj) for obj in tom.model.Roles if expr(obj)]
+                 elif scope == "Row Level Security":
+                     x = [nm(obj) for obj in tom.all_rls() if expr(obj)]
+                 elif scope == "Calculation Item":
+                     x = [nm(obj) for obj in tom.all_calculation_items() if expr(obj)]
+
+                 if len(x) > 0:
+                     new_data = {"Object Name": x, "Scope": scope, "Rule Name": ruleName}
+                     violations = pd.concat(
+                         [violations, pd.DataFrame(new_data)], ignore_index=True
+                     )

-     dfM["USERELATIONSHIP Objects"] = dfM["Measure Expression"].apply(extract_objects)
-     flat_object_list = [
-         item for sublist in dfM["USERELATIONSHIP Objects"] for item in sublist
-     ]
-     dfC["USERELATIONSHIP Used"] = dfC["Column Object"].isin(flat_object_list) | dfC[
-         "Column Object No Spaces"
-     ].isin(flat_object_list)
-     dfT["USERELATIONSHIP Used"] = dfT["Name"].isin(
-         dfC[dfC["USERELATIONSHIP Used"]]["Table Name"]
-     )
-     dfR["Relationship Name"] = (
-         format_dax_object_name(dfR["From Table"], dfR["From Column"])
-         + " -> "
-         + format_dax_object_name(dfR["To Table"], dfR["To Column"])
-     )
-     dfH = dfH[
-         [
-             "Name",
-             "Description",
-             "Table Name",
-             "Hierarchy Name",
-             "Hierarchy Description",
-             "Hierarchy Object",
+         prepDF = pd.merge(
+             violations,
+             rules[["Rule Name", "Category", "Severity", "Description", "URL"]],
+             left_on="Rule Name",
+             right_on="Rule Name",
+             how="left",
+         )
+         prepDF.rename(columns={"Scope": "Object Type"}, inplace=True)
+         finalDF = prepDF[
+             [
+                 "Category",
+                 "Rule Name",
+                 "Severity",
+                 "Object Type",
+                 "Object Name",
+                 "Description",
+                 "URL",
+             ]
          ]
-     ].drop_duplicates()
-
-     scope_to_dataframe = {
-         "Table": (dfT, ["Name"]),
-         "Partition": (dfP, ["Partition Object"]),
-         "Column": (dfC, ["Column Object"]),
-         "Hierarchy": (dfH, ["Hierarchy Object"]),
-         "Measure": (dfM, ["Measure Name"]),
-         "Calculation Item": (dfCI, ["Calculation Object"]),
-         "Relationship": (dfR, ["Relationship Name"]),
-         "Row Level Security": (dfRLS, ["RLS Object"]),
-         "Role": (dfRole, ["Role"]),
-         "Model": (dfD, ["Dataset Name"]),
-     }
-
-     def execute_rule(row):
-         scopes = row["Scope"]
-
-         # support both str and list as scope type
-         if isinstance(scopes, str):
-             scopes = [scopes]
-
-         # collect output dataframes
-         df_outputs = []
-
-         for scope in scopes:
-             # common fields for each scope
-             (df, violation_cols_or_func) = scope_to_dataframe[scope]
-
-             if scope in ["Hierarchy", "Measure"] and len(df) == 0:
-                 continue
-             # execute rule and subset df
-             df_violations = df[row["Expression"](df)]
-
-             # subset the right output columns (e.g. Table Name & Column Name)
-             if isinstance(violation_cols_or_func, list):
-                 violation_func = lambda violations: violations[violation_cols_or_func]
-             else:
-                 violation_func = violation_cols_or_func
-
-             # build output data frame
-             df_output = violation_func(df_violations).copy()
-
-             df_output.columns = ["Object Name"]
-             df_output["Rule Name"] = row["Rule Name"]
-             df_output["Category"] = row["Category"]
-
-             df_output["Object Type"] = scope
-             df_output["Severity"] = row["Severity"]
-             df_output["Description"] = row["Description"]
-             df_output["URL"] = row["URL"]
-
-             df_outputs.append(df_output)
-
-         return df_outputs
-
-     # flatten list of lists
-     flatten_dfs = [
-         df for dfs in rules_dataframe.apply(execute_rule, axis=1).tolist() for df in dfs
-     ]
-
-     finalDF = pd.concat(flatten_dfs, ignore_index=True)

      if export:
          lakeAttach = lakehouse_attached()
          if lakeAttach is False:
-             raise ValueError(f"{icons.red_dot} In order to save the Best Practice Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.")
+             raise ValueError(
+                 f"{icons.red_dot} In order to save the Best Practice Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
+             )

          dfExport = finalDF.copy()
          delta_table_name = "modelbparesults"
@@ -1330,7 +343,7 @@ def run_model_bpa(
          content_html += '<table border="1">'
          content_html += "<tr><th>Rule Name</th><th>Object Type</th><th>Object Name</th><th>Severity</th></tr>"
          for _, row in df.iterrows():
-             content_html += f"<tr>"
+             content_html += "<tr>"
              if pd.notnull(row["URL"]):
                  content_html += f'<td class="tooltip" onmouseover="adjustTooltipPosition(event)"><a href="{row["URL"]}">{row["Rule Name"]}</a><span class="tooltiptext">{row["Description"]}</span></td>'
              elif pd.notnull(row["Description"]):
@@ -1340,7 +353,7 @@ def run_model_bpa(
              content_html += f'<td>{row["Object Type"]}</td>'
              content_html += f'<td>{row["Object Name"]}</td>'
              content_html += f'<td>{row["Severity"]}</td>'
-             content_html += f"</tr>"
+             content_html += "</tr>"
          content_html += "</table>"

          content_html += "</div>"
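
Conceptually, the 0.6.0 rewrite stops materializing a dozen fabric.list_* dataframes and instead evaluates each rule's lambda directly against live TOM objects yielded by connect_semantic_model, using the scope_to_dataframe mapping above to pick the object collection and display-name formatter per scope. A standalone sketch of that dispatch pattern, with a hypothetical Rule dataclass and plain lists standing in for the TOM collections (not the package's API):

    from dataclasses import dataclass
    from typing import Callable, Iterable, List, Tuple, Union


    @dataclass
    class Rule:
        scope: Union[str, List[str]]  # e.g. "Column" or ["Table", "Column"]
        name: str
        expression: Callable  # predicate evaluated against one object at a time


    def evaluate(rules: Iterable[Rule], scope_to_objects: dict) -> List[Tuple[str, str, str]]:
        """Collect (object name, scope, rule name) for every flagged object."""
        violations = []
        for rule in rules:
            # a rule may target one scope or several
            scopes = [rule.scope] if isinstance(rule.scope, str) else rule.scope
            for scope in scopes:
                objects, display_name = scope_to_objects[scope]
                violations += [
                    (display_name(obj), scope, rule.name)
                    for obj in objects
                    if rule.expression(obj)
                ]
        return violations


    # toy usage: flag "tables" whose names are not capitalized
    tables = ["sales", "Date"]
    result = evaluate(
        [Rule("Table", "First letter of objects must be capitalized",
              lambda t: t[0].upper() != t[0])],
        {"Table": (tables, lambda t: t)},
    )
    print(result)  # [('sales', 'Table', 'First letter of objects must be capitalized')]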