altimate-datapilot-cli 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
- altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
- altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
- altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
- altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
- altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
- altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
- datapilot/__init__.py +1 -0
- datapilot/__main__.py +14 -0
- datapilot/cli/__init__.py +0 -0
- datapilot/cli/main.py +11 -0
- datapilot/clients/__init__.py +0 -0
- datapilot/clients/altimate/__init__.py +0 -0
- datapilot/clients/altimate/client.py +85 -0
- datapilot/clients/altimate/utils.py +75 -0
- datapilot/config/__init__.py +0 -0
- datapilot/config/config.py +16 -0
- datapilot/config/utils.py +32 -0
- datapilot/core/__init__.py +0 -0
- datapilot/core/insights/__init__.py +2 -0
- datapilot/core/insights/base/__init__.py +0 -0
- datapilot/core/insights/base/insight.py +34 -0
- datapilot/core/insights/report.py +16 -0
- datapilot/core/insights/schema.py +24 -0
- datapilot/core/insights/sql/__init__.py +0 -0
- datapilot/core/insights/sql/base/__init__.py +0 -0
- datapilot/core/insights/sql/base/insight.py +18 -0
- datapilot/core/insights/sql/runtime/__init__.py +0 -0
- datapilot/core/insights/sql/static/__init__.py +0 -0
- datapilot/core/insights/utils.py +20 -0
- datapilot/core/platforms/__init__.py +0 -0
- datapilot/core/platforms/dbt/__init__.py +0 -0
- datapilot/core/platforms/dbt/cli/__init__.py +0 -0
- datapilot/core/platforms/dbt/cli/cli.py +112 -0
- datapilot/core/platforms/dbt/constants.py +34 -0
- datapilot/core/platforms/dbt/exceptions.py +6 -0
- datapilot/core/platforms/dbt/executor.py +157 -0
- datapilot/core/platforms/dbt/factory.py +22 -0
- datapilot/core/platforms/dbt/formatting.py +45 -0
- datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
- datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
- datapilot/core/platforms/dbt/insights/__init__.py +115 -0
- datapilot/core/platforms/dbt/insights/base.py +133 -0
- datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
- datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
- datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
- datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
- datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
- datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
- datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
- datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
- datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
- datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
- datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
- datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
- datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
- datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
- datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
- datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
- datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
- datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
- datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
- datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
- datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
- datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
- datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
- datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
- datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
- datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
- datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
- datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
- datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
- datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
- datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
- datapilot/core/platforms/dbt/insights/schema.py +72 -0
- datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
- datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
- datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
- datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
- datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
- datapilot/core/platforms/dbt/insights/utils.py +9 -0
- datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
- datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
- datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
- datapilot/core/platforms/dbt/utils.py +525 -0
- datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
- datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
- datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
- datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
- datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
- datapilot/exceptions/__init__.py +0 -0
- datapilot/exceptions/exceptions.py +10 -0
- datapilot/schemas/__init__.py +0 -0
- datapilot/schemas/constants.py +5 -0
- datapilot/schemas/nodes.py +19 -0
- datapilot/schemas/sql.py +10 -0
- datapilot/utils/__init__.py +0 -0
- datapilot/utils/formatting/__init__.py +0 -0
- datapilot/utils/formatting/utils.py +59 -0
- datapilot/utils/utils.py +317 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
from datapilot.core.insights.utils import get_severity
|
4
|
+
from datapilot.core.platforms.dbt.insights.governance.base import DBTGovernanceInsight
|
5
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
7
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateAccess
|
8
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
9
|
+
|
10
|
+
|
11
|
+
class DBTPublicModelWithoutContracts(DBTGovernanceInsight):
    """
    Identify public models that do not have an enforced contract.

    A node is flagged when it is a model, its access is public, and it either
    has no contract or has a contract whose ``enforced`` attribute is falsy.
    """

    NAME = "Public models without contracts"
    ALIAS = "public_models_without_contracts"
    DESCRIPTION = "Identify public models that don't have contracts."
    REASON_TO_FLAG = (
        "Public models are accessible to all downstream consumers, making it crucial to have clear "
        "contracts that specify data types and columns. This ensures consistency and predictability "
        "in data consumption."
    )
    FAILURE_MESSAGE = (
        "Model `{model_unique_id}` is marked as public but does not have a contract. "
        "This can lead to ambiguity regarding data types and columns, impacting downstream consumers."
    )
    RECOMMENDATION = (
        "Enhance the model `{model_unique_id}` by adding clear contract entries for columns along "
        "with their data types. Contracts provide essential documentation and guarantees for downstream consumers."
    )

    def _build_failure_result(
        self,
        model_unique_id: str,
    ) -> DBTInsightResult:
        """
        Build the failure result for a public model without an enforced contract.

        :param model_unique_id: Unique ID of the current model being evaluated.
        :return: A DBTInsightResult carrying the failure message, the recommendation,
            the reason to flag and the model ID as metadata.
        """
        self.logger.debug(f"Building failure result: model {model_unique_id} is public but has no enforced contract.")

        failure = self.FAILURE_MESSAGE.format(
            model_unique_id=model_unique_id,
        )
        recommendation = self.RECOMMENDATION.format(
            model_unique_id=model_unique_id,
        )

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": model_unique_id,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate one insight response per public model whose contract is missing
        or not enforced.

        Nodes for which ``should_skip_model`` returns a truthy value are ignored.

        :return: A list of DBTModelInsightResponse objects, one per flagged model.
        """
        self.logger.debug("Generating insights for public models without contracts")
        insights = []
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model and node.access == AltimateAccess.public:
                # A contract object that exists but is not enforced is treated like no contract.
                if (not node.contract) or (not node.contract.enforced):
                    self.logger.debug(f"Found public model {node_id} without contract enforced")
                    insight_result = self._build_failure_result(node_id)
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            # NOTE(review): `path` is filled with original_file_path here;
                            # confirm whether node.path was intended.
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=insight_result,
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        self.logger.debug("Finished generating insights for public models without contracts")
        return insights
|
@@ -0,0 +1,148 @@
|
|
1
|
+
from typing import ClassVar
|
2
|
+
from typing import List
|
3
|
+
from typing import Tuple
|
4
|
+
|
5
|
+
from datapilot.core.insights.utils import get_severity
|
6
|
+
from datapilot.core.platforms.dbt.insights.governance.base import DBTGovernanceInsight
|
7
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
8
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
9
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
10
|
+
from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
|
11
|
+
from datapilot.utils.formatting.utils import numbered_list
|
12
|
+
|
13
|
+
|
14
|
+
class DBTMissingDocumentation(DBTGovernanceInsight):
    """
    Identify model columns that are present in the catalog but have no
    description in the manifest.

    The catalog supplies the columns that exist for each node; the manifest
    supplies the column descriptions. Column names are compared lower-cased.
    """

    NAME = "Missing documentation"
    ALIAS = "missing_documentation"
    DESCRIPTION = (
        "Detects columns and models in the dbt project that lack documentation. Proper documentation is essential "
        "for understanding data structures and facilitating collaboration and usage of the dbt project."
    )
    REASON_TO_FLAG = (
        "Missing documentation for columns and models can lead to confusion and hinder effective data "
        "modeling and analysis. It's important to document data structures comprehensively."
    )
    FAILURE_MESSAGE = (
        "The following columns in the model `{model_unique_id}` are missing documentation:\n{columns}. "
        "Lack of documentation can impede understanding and usage of the model."
    )
    RECOMMENDATION = (
        "Enhance the documentation for the columns listed above in the model `{model_unique_id}`. "
        "Documentation provides valuable context and aids in data understanding and collaboration."
    )
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        """
        :param catalog_wrapper: Catalog wrapper whose ``get_schema()`` is used to look up
            the columns of each node.
        """
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def _build_failure_result(
        self,
        model_unique_id: str,
        model_description_is_missing: bool,
        columns: List[str],
    ) -> DBTInsightResult:
        """
        Build the failure result for a model with missing documentation.

        :param model_unique_id: Unique ID of the current model being evaluated.
        :param model_description_is_missing: Whether the model itself has no description.
        :param columns: Names of the columns that are missing documentation.
        :return: A DBTInsightResult containing the failure message, recommendation and metadata.
        """
        self.logger.debug(f"Building failure result for model {model_unique_id} with undocumented columns {columns}")
        failure_message = ""
        if model_description_is_missing:
            failure_message += f"The model {model_unique_id} is missing a description.\n"

        if columns:
            failure_message += self.FAILURE_MESSAGE.format(
                columns=numbered_list(columns),
                model_unique_id=model_unique_id,
            )

        recommendation = self.RECOMMENDATION.format(model_unique_id=model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "columns": columns,
                "model_unique_id": model_unique_id,
                "model_description_is_missing": model_description_is_missing,
            },
        )

    def _get_columns_documented(self, node_id) -> List[str]:
        """
        Get the lower-cased names of the columns that have a description for a given node.

        :param node_id: The unique ID of the node.
        :return: A list of lower-cased column names with a non-empty description.
        """
        return [
            column_name.lower()
            for column_name, column_node in self.get_node(node_id).columns.items()
            if column_node.description
        ]

    def _get_columns_in_model(self, node_id) -> List[str]:
        """
        Get the lower-cased names of the columns the catalog lists for a given node.

        :param node_id: The unique ID of the node.
        :return: A list of lower-cased column names, or an empty list when the node
            is not present in the catalog schema.
        """
        # Fetch the schema once instead of calling get_schema() for both the
        # membership test and the lookup.
        schema = self.catalog.get_schema()
        if node_id not in schema:
            return []
        return [k.lower() for k in schema[node_id].keys()]

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate one insight response per model that has catalog columns without
        a description in the manifest.

        Nodes for which ``should_skip_model`` returns a truthy value are ignored.

        :return: A list of DBTModelInsightResponse objects.
        """
        insights = []
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                model_description_is_missing = not node.description
                columns_documented = self._get_columns_documented(node_id)
                db_columns = self._get_columns_in_model(node_id)
                # Sort so the message and metadata are stable between runs; a bare
                # set difference has no defined order.
                columns_missing_documentation = sorted(set(db_columns) - set(columns_documented))
                # NOTE(review): a model whose description is missing but whose columns are
                # all documented is not reported by this gate — confirm that is intended.
                if columns_missing_documentation:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=self._build_failure_result(
                                node_id,
                                model_description_is_missing,
                                columns_missing_documentation,
                            ),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check whether both the manifest and the catalog are available.

        :param has_manifest: Whether a manifest is available.
        :param has_catalog: Whether a catalog is available.
        :return: ``(True, "")`` when both are available, otherwise ``(False, reason)``.
        """
        if not has_manifest:
            return False, "manifest is required for insight to run."

        if not has_catalog:
            return False, "catalog is required for insight to run."

        return True, ""

    @classmethod
    def requires_catalog(cls) -> bool:
        """Report that this insight needs a catalog."""
        return True
|
@@ -0,0 +1,110 @@
|
|
1
|
+
from typing import List
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from datapilot.core.insights.utils import get_severity
|
5
|
+
from datapilot.core.platforms.dbt.insights.governance.base import DBTGovernanceInsight
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
7
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
8
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateAccess
|
9
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestNode
|
10
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
11
|
+
from datapilot.utils.formatting.utils import numbered_list
|
12
|
+
|
13
|
+
|
14
|
+
# TODO: Include catalog information to make this better!
|
15
|
+
class DBTUndocumentedPublicModels(DBTGovernanceInsight):
    """
    DBTUndocumentedPublicModels identifies public models that are not documented.

    A public model is flagged when its own description is missing or when any
    of its columns has no description.
    """

    NAME = "Undocumented public models"
    ALIAS = "undocumented_public_models"
    DESCRIPTION = "Identify public models that don't have documentation."
    REASON_TO_FLAG = (
        "Public models are accessible to a wide range of data consumers. To promote understanding and usability, "
        "it's essential to document these models comprehensively."
    )
    FAILURE_MESSAGE = (
        "Model `{model_unique_id}` is marked as public but is not documented. "
        "Lack of documentation can lead to confusion for data consumers."
    )
    RECOMMENDATION = (
        "For best practices, ensure that all models with public access are documented adequately. "
        "Documentation enhances data understanding and facilitates collaboration among data consumers."
    )

    def _build_failure_result(
        self,
        model_unique_id: str,
        model_description_is_missing: bool,
        columns: Optional[List[str]] = None,
    ) -> DBTInsightResult:
        """
        Build the failure result for a public model with missing documentation.

        :param model_unique_id: Unique ID of the current model being evaluated.
        :param model_description_is_missing: Whether the model description is missing.
        :param columns: List of columns that are missing documentation.
        :return: A DBTInsightResult containing the failure message, recommendation and metadata.
        """
        self.logger.debug(f"Building failure result model {model_unique_id} is public but not documented.")

        failure = self.FAILURE_MESSAGE.format(
            model_unique_id=model_unique_id,
        )
        # Leading space keeps this sentence from running into the previous one.
        if model_description_is_missing:
            failure += " Missing Model documentation."

        if columns:
            failure += f"\n Columns missing documentation: {numbered_list(columns)}"

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=self.RECOMMENDATION,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": model_unique_id,
                "columns_without_documentation": columns,
                "model_description_missing": model_description_is_missing,
            },
        )

    def _get_missing_column_documentation(self, node: AltimateManifestNode) -> List[str]:
        """
        Collect the names of the columns of ``node`` that have no description.

        :param node: The manifest node whose columns are inspected.
        :return: A list of column names with an empty or missing description.
        """
        # Iterate the column objects, not the mapping itself: iterating the mapping
        # yields the string keys, which have no `description` attribute.
        # NOTE(review): assumes node.columns maps column name -> column object — confirm.
        return [column.name for column in node.columns.values() if not column.description]

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate one insight response per public model that is missing a model
        description or has columns without a description.

        Nodes for which ``should_skip_model`` returns a truthy value are ignored.

        :return: A list of DBTModelInsightResponse objects.
        """
        self.logger.debug("Generating insights for undocumented public models")
        insights = []
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                if node.access == AltimateAccess.public:
                    missing_model_documentation = not node.description
                    missing_columns = self._get_missing_column_documentation(node)
                    if missing_model_documentation or missing_columns:
                        insights.append(
                            DBTModelInsightResponse(
                                unique_id=node_id,
                                package_name=node.package_name,
                                path=node.original_file_path,
                                original_file_path=node.original_file_path,
                                insight=self._build_failure_result(
                                    node_id,
                                    # Pass the flag as computed; negating it here reported
                                    # documented models as undocumented and vice versa.
                                    missing_model_documentation,
                                    missing_columns,
                                ),
                                severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                            )
                        )

        return insights
|
@@ -0,0 +1,15 @@
|
|
1
|
+
## Insights
|
2
|
+
|
3
|
+
|
4
|
+
- Direct Join to Source | Done
|
5
|
+
- Downstream Models Dependent on Source | Done
|
6
|
+
- Duplicate Sources | Done
|
7
|
+
- Hard Coded References | Done
|
8
|
+
- Model Fanout | Done
|
9
|
+
- Multiple Sources Joined | Done
|
10
|
+
- Rejoining of Upstream Concepts
|
11
|
+
- Root Models | Done
|
12
|
+
- Source Fanout | Done
|
13
|
+
- Staging Models Dependent on Downstream Models | Done
|
14
|
+
- Staging Models Dependent on Other Staging Models | Done
|
15
|
+
- Unused Sources | Done
|
File without changes
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from abc import abstractmethod
|
2
|
+
from typing import Any
|
3
|
+
from typing import Dict
|
4
|
+
from typing import Optional
|
5
|
+
from typing import Tuple
|
6
|
+
|
7
|
+
from datapilot.core.platforms.dbt.insights.base import DBTInsight
|
8
|
+
|
9
|
+
|
10
|
+
class DBTModellingInsight(DBTInsight):
    """Base class for insights whose ``TYPE`` is "Modelling"."""

    NAME = "DBTModellingInsight"
    TYPE = "Modelling"

    def __init__(self, config: Optional[Dict[str, Any]] = None, *args, **kwargs):
        """
        Store the insight configuration and forward the remaining arguments
        to the parent initializer.

        :param config: Optional configuration mapping; any falsy value is
            replaced by an empty dict.
        """
        self.config = config if config else {}
        super().__init__(*args, **kwargs)

    @abstractmethod
    def generate(self, *args, **kwargs) -> dict:
        """Produce the insight output; the body here is a no-op for subclasses to replace."""

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check if all required data is available for the insight to run.

        :param has_manifest: A boolean indicating if manifest is available.
        :return: ``(True, "")`` when the manifest is available, otherwise
            ``(False, reason)`` with a message explaining what is missing.
        """
        if has_manifest:
            return True, ""
        return False, "manifest is required for insight to run."
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from typing import Dict
|
2
|
+
from typing import List
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from datapilot.core.insights.utils import get_severity
|
6
|
+
from datapilot.core.platforms.dbt.constants import MODEL
|
7
|
+
from datapilot.core.platforms.dbt.constants import SOURCE
|
8
|
+
from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
|
9
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
10
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
11
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
12
|
+
from datapilot.utils.formatting.utils import numbered_list
|
13
|
+
|
14
|
+
|
15
|
+
class DBTDirectJoinSource(DBTModellingInsight):
|
16
|
+
"""
|
17
|
+
DBTDirectJoinSource is used to ensure that DBT models have a proper mapping
|
18
|
+
from source to staging models and to flag any direct dependencies on multiple
|
19
|
+
sources without intermediate staging models.
|
20
|
+
Ref: https://github.com/dbt-labs/dbt-project-evaluator/blob/main/models/marts/dag/fct_direct_join_to_source.sql
|
21
|
+
"""
|
22
|
+
|
23
|
+
ALIAS = "source_staging_model_integrity"
|
24
|
+
NAME = "Source-Staging Model Integrity"
|
25
|
+
DESCRIPTION = "A model should not have direct joins to both sources and other staging models. "
|
26
|
+
REASON_TO_FLAG = (
|
27
|
+
"Flagged when a model directly joins a source and a model without a staging intermediary. "
|
28
|
+
"Direct source-model joins bypass the staging layer, leading to potential inconsistencies in data handling."
|
29
|
+
)
|
30
|
+
FAILURE_MESSAGE = (
|
31
|
+
"Model `{current_model_unique_id}` has direct joins to both sources and other models. "
|
32
|
+
"\n### Detected Sources\n{sources}\n\n### Connected Models \n{models}"
|
33
|
+
)
|
34
|
+
RECOMMENDATION = (
|
35
|
+
"Create a dedicated staging model for the source(s) and modify `{current_model_unique_id}` "
|
36
|
+
"to depend on this staging model. This ensures consistent initial data processing steps."
|
37
|
+
)
|
38
|
+
|
39
|
+
def _build_failure_result(self, current_model_unique_id: str, dependencies: Dict) -> DBTInsightResult:
|
40
|
+
"""
|
41
|
+
Build failure result for the insight if a model is directly joining to a source
|
42
|
+
and other models.
|
43
|
+
|
44
|
+
:param current_model_unique_id: Unique ID of the current model being evaluated.
|
45
|
+
:param dependencies: A dictionary of dependencies categorized as 'source' and 'model'.
|
46
|
+
:return: An instance of InsightResult containing failure message and recommendation and metadata.
|
47
|
+
"""
|
48
|
+
self.logger.debug(f"Found multiple sources and models for {current_model_unique_id}")
|
49
|
+
failure = self.FAILURE_MESSAGE.format(
|
50
|
+
current_model_unique_id=current_model_unique_id,
|
51
|
+
sources=numbered_list(dependencies["source"]),
|
52
|
+
models=numbered_list(dependencies["model"]),
|
53
|
+
)
|
54
|
+
recommendation = self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id)
|
55
|
+
|
56
|
+
return DBTInsightResult(
|
57
|
+
type=self.TYPE,
|
58
|
+
name=self.NAME,
|
59
|
+
message=failure,
|
60
|
+
recommendation=recommendation,
|
61
|
+
reason_to_flag=self.REASON_TO_FLAG,
|
62
|
+
metadata={
|
63
|
+
"model": current_model_unique_id,
|
64
|
+
"dependencies": dependencies,
|
65
|
+
},
|
66
|
+
)
|
67
|
+
|
68
|
+
def _check_dependency_on_both_models_and_sources(self, current_node) -> Optional[DBTInsightResult]:
|
69
|
+
"""
|
70
|
+
Check if the current node has dependencies on both models and sources or multiple sources.
|
71
|
+
|
72
|
+
:param current_node: The node representing the current model.
|
73
|
+
:return: A list of InsightResult instances with recommendations for each violation.
|
74
|
+
"""
|
75
|
+
self.logger.debug(f"Checking dependencies for model {current_node.unique_id}")
|
76
|
+
dependencies = {
|
77
|
+
MODEL: [],
|
78
|
+
SOURCE: [],
|
79
|
+
}
|
80
|
+
excluded_nodes = self.config.get("excluded_nodes", [])
|
81
|
+
for dependent_unique_id in current_node.depends_on.nodes:
|
82
|
+
if dependent_unique_id in excluded_nodes:
|
83
|
+
self.logger.debug(f"Skipping dependency {dependent_unique_id} as it is excluded list of the config")
|
84
|
+
continue
|
85
|
+
dependent_node = self.get_node(dependent_unique_id)
|
86
|
+
if dependent_node.resource_type == AltimateResourceType.model:
|
87
|
+
self.logger.debug(f"Found dependent model {dependent_unique_id} for model {current_node.unique_id}")
|
88
|
+
dependencies[MODEL].append(dependent_node.unique_id)
|
89
|
+
elif dependent_node.resource_type == AltimateResourceType.source:
|
90
|
+
self.logger.debug(f"Found dependent source {dependent_unique_id} for model {current_node.unique_id}")
|
91
|
+
dependencies[SOURCE].append(dependent_node.unique_id)
|
92
|
+
if dependencies[MODEL] and dependencies[SOURCE]:
|
93
|
+
self.logger.debug(f"Found dependencies on both models and sources for model {current_node.unique_id}")
|
94
|
+
return self._build_failure_result(current_node.unique_id, dependencies)
|
95
|
+
else:
|
96
|
+
self.logger.debug(f"No dependencies on both models and sources for model {current_node.unique_id}")
|
97
|
+
return None
|
98
|
+
|
99
|
+
def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
    """
    Walk every node of the project and collect one response per model that
    depends on both models and sources.

    Nodes rejected by ``should_skip_model`` and nodes that are not models are ignored.

    :return: A list of DBTModelInsightResponse objects, one for each flagged model.
    """
    self.logger.debug(f"Generating insights for DBTDirectJoinSource for project {self.project_name}")
    responses = []

    for node_id, node in self.nodes.items():
        if self.should_skip_model(node_id):
            self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
            continue
        if node.resource_type != AltimateResourceType.model:
            continue

        finding = self._check_dependency_on_both_models_and_sources(node)
        if not finding:
            continue

        self.logger.debug(f"Found recommendation for model {node_id} in DBTDirectJoinSource")
        response = DBTModelInsightResponse(
            unique_id=node_id,
            package_name=node.package_name,
            path=node.path,
            original_file_path=node.original_file_path,
            insight=finding,
            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
        )
        responses.append(response)

    return responses
|
@@ -0,0 +1,113 @@
|
|
1
|
+
from typing import ClassVar
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
from datapilot.config.utils import get_regex_configuration
|
5
|
+
from datapilot.core.insights.utils import get_severity
|
6
|
+
from datapilot.core.platforms.dbt.constants import INTERMEDIATE
|
7
|
+
from datapilot.core.platforms.dbt.constants import MART
|
8
|
+
from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
|
9
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
10
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
11
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
12
|
+
from datapilot.core.platforms.dbt.utils import classify_model_type
|
13
|
+
from datapilot.utils.formatting.utils import numbered_list
|
14
|
+
|
15
|
+
|
16
|
+
class DBTDownstreamModelsDependentOnSource(DBTModellingInsight):
    """
    DBTDownstreamModelsDependentOnSource identifies downstream models (either marts or intermediate)
    in a dbt project that depend directly on a source node.
    """

    NAME = "Downstream models dependent on source"
    ALIAS = "downstream_source_dependence"
    DESCRIPTION = "Downstream models should not depend directly on source nodes. "
    REASON_TO_FLAG = (
        "Direct dependency of marts or intermediate models on a source node suggests a missing staging model. "
        "Staging models serve as atomic units, maintaining a one-to-one relationship with source data tables, "
        "while providing a consistent format for downstream consumption."
    )
    # Adjacent string literals are concatenated without a separator, so each
    # sentence carries its own trailing/leading space.
    FAILURE_MESSAGE = (
        "Downstream model `{current_model_unique_id}` of type {model_type} is directly dependent on source nodes. "
        "Direct source dependencies bypass the critical staging layer, leading to potential data consistency issues."
        " Source dependencies: {source_dependencies}"
    )
    RECOMMENDATION = (
        "Introduce or utilize an existing staging model for the source node involved. Refactor the downstream model "
        "`{current_model_unique_id}` to select from this staging layer, ensuring a proper abstraction layer between "
        "raw data and downstream data artifacts."
    )
    # Model types that are considered "downstream" by this insight.
    MODEL_TYPES: ClassVar[List[str]] = [INTERMEDIATE, MART]

    def _build_failure_result(
        self,
        current_model_unique_id: str,
        source_dependencies: List[str],
        model_type: str,
    ) -> DBTInsightResult:
        """
        Build failure result for the insight if a downstream model depends directly on a source node.

        :param current_model_unique_id: Unique ID of the current model being evaluated.
        :param source_dependencies: List of source dependencies for the current model.
        :param model_type: Type of the current model; inserted into the failure message and the metadata.
        :return: An instance of DBTInsightResult containing failure message and recommendation.
        """
        self.logger.debug(f"Building failure result for model {current_model_unique_id} with direct source dependencies")

        failure = self.FAILURE_MESSAGE.format(
            current_model_unique_id=current_model_unique_id,
            model_type=model_type,
            source_dependencies=numbered_list(source_dependencies),
        )

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": current_model_unique_id,
                "source_dependencies": source_dependencies,
                "model_type": model_type,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate a list of InsightResponse objects for each downstream model in the DBT project,
        identifying those that depend directly on source nodes.

        A model is reported when its classified type is one of ``MODEL_TYPES`` and at least one
        of its direct dependencies is a source node.

        :return: A list of DBTModelInsightResponse objects, one per offending model.
        """
        self.logger.debug(f"Generating insights for DBTDownstreamModelsDependentOnSource for project {self.manifest.get_package()}")
        insights = []
        regex_configuration = get_regex_configuration(self.config)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type != AltimateResourceType.model:
                continue

            model_type = classify_model_type(node.name, node.original_file_path, regex_configuration)
            if model_type not in self.MODEL_TYPES:
                # Out-of-scope model types can never be reported, so skip the
                # per-dependency node lookups for them.
                continue

            source_dependencies = [
                dependent_node_id
                for dependent_node_id in node.depends_on.nodes
                if self.get_node(dependent_node_id).resource_type == AltimateResourceType.source
            ]
            if not source_dependencies:
                continue

            self.logger.debug(f"Found downstream model {node_id} of type {model_type} with direct source dependencies")
            insight_result = self._build_failure_result(node.unique_id, source_dependencies, model_type)
            insights.append(
                DBTModelInsightResponse(
                    unique_id=node_id,
                    package_name=node.package_name,
                    path=node.path,
                    original_file_path=node.original_file_path,
                    insight=insight_result,
                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                )
            )
        self.logger.debug(
            f"Finished generating insights for DBTDownstreamModelsDependentOnSource. Found {len(insights)} models with direct source dependencies"
        )
        return insights
|