altimate-datapilot-cli 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
- altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
- altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
- altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
- altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
- altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
- altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
- datapilot/__init__.py +1 -0
- datapilot/__main__.py +14 -0
- datapilot/cli/__init__.py +0 -0
- datapilot/cli/main.py +11 -0
- datapilot/clients/__init__.py +0 -0
- datapilot/clients/altimate/__init__.py +0 -0
- datapilot/clients/altimate/client.py +85 -0
- datapilot/clients/altimate/utils.py +75 -0
- datapilot/config/__init__.py +0 -0
- datapilot/config/config.py +16 -0
- datapilot/config/utils.py +32 -0
- datapilot/core/__init__.py +0 -0
- datapilot/core/insights/__init__.py +2 -0
- datapilot/core/insights/base/__init__.py +0 -0
- datapilot/core/insights/base/insight.py +34 -0
- datapilot/core/insights/report.py +16 -0
- datapilot/core/insights/schema.py +24 -0
- datapilot/core/insights/sql/__init__.py +0 -0
- datapilot/core/insights/sql/base/__init__.py +0 -0
- datapilot/core/insights/sql/base/insight.py +18 -0
- datapilot/core/insights/sql/runtime/__init__.py +0 -0
- datapilot/core/insights/sql/static/__init__.py +0 -0
- datapilot/core/insights/utils.py +20 -0
- datapilot/core/platforms/__init__.py +0 -0
- datapilot/core/platforms/dbt/__init__.py +0 -0
- datapilot/core/platforms/dbt/cli/__init__.py +0 -0
- datapilot/core/platforms/dbt/cli/cli.py +112 -0
- datapilot/core/platforms/dbt/constants.py +34 -0
- datapilot/core/platforms/dbt/exceptions.py +6 -0
- datapilot/core/platforms/dbt/executor.py +157 -0
- datapilot/core/platforms/dbt/factory.py +22 -0
- datapilot/core/platforms/dbt/formatting.py +45 -0
- datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
- datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
- datapilot/core/platforms/dbt/insights/__init__.py +115 -0
- datapilot/core/platforms/dbt/insights/base.py +133 -0
- datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
- datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
- datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
- datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
- datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
- datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
- datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
- datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
- datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
- datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
- datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
- datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
- datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
- datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
- datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
- datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
- datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
- datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
- datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
- datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
- datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
- datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
- datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
- datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
- datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
- datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
- datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
- datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
- datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
- datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
- datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
- datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
- datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
- datapilot/core/platforms/dbt/insights/schema.py +72 -0
- datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
- datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
- datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
- datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
- datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
- datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
- datapilot/core/platforms/dbt/insights/utils.py +9 -0
- datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
- datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
- datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
- datapilot/core/platforms/dbt/utils.py +525 -0
- datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
- datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
- datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
- datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
- datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
- datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
- datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
- datapilot/exceptions/__init__.py +0 -0
- datapilot/exceptions/exceptions.py +10 -0
- datapilot/schemas/__init__.py +0 -0
- datapilot/schemas/constants.py +5 -0
- datapilot/schemas/nodes.py +19 -0
- datapilot/schemas/sql.py +10 -0
- datapilot/utils/__init__.py +0 -0
- datapilot/utils/formatting/__init__.py +0 -0
- datapilot/utils/formatting/utils.py +59 -0
- datapilot/utils/utils.py +317 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
from datapilot.core.insights.utils import get_severity
|
4
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
5
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
7
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
8
|
+
|
9
|
+
|
10
|
+
class CheckModelTags(ChecksInsight):
    """Insight: every tag on a model must come from a configured allow-list."""

    NAME = "Model only has valid tags"
    ALIAS = "check_model_tags"
    DESCRIPTION = "Ensures that the model has only valid tags from the provided list."
    REASON_TO_FLAG = "The model has tags that are not in the valid tags list"
    # Config key under which the allow-list of tags is supplied.
    TAGS_LIST_STR = "tag_list"

    def _build_failure_result(
        self,
        node_id: str,
        tags: List[str],
    ) -> DBTInsightResult:
        """
        Build failure result for the insight if a model's tags are not in the provided tag list.
        """

        failure_message = f"The model:{node_id}'s tags are not in the provided tag list:\n"

        recommendation = "Update the model's tags to adhere to the provided tag list."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"tags": tags},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate a list of InsightResponse objects for each model in the DBT project,
        Ensures that the model has only valid tags from the provided list.
        The provided tag list is in the configuration file.
        """
        insights = []
        self.tag_list = self.get_check_config(self.TAGS_LIST_STR)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                if not self.valid_tag(node.config.tags):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, node.config.tags),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def valid_tag(self, tags: List[str]) -> bool:
        """
        Check if the tags of the model are in the provided tag list.
        An empty/absent allow-list means every tag is accepted.
        """
        if not self.tag_list:
            return True
        return all(tag in self.tag_list for tag in tags)

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.TAGS_LIST_STR: {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of allowed tags for the model. If not provided, all tags are allowed.",
                    "default": [],
                },
            },
        }
        # BUG FIX: the schema dict was built but never returned (the method
        # implicitly returned None), unlike every sibling check which returns
        # config_schema. Callers expect the schema dict back.
        return config_schema
|
@@ -0,0 +1,97 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
from datapilot.core.insights.utils import get_severity
|
4
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
5
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
7
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
8
|
+
|
9
|
+
|
10
|
+
class CheckSourceChilds(ChecksInsight):
    """Insight: a source's number of direct children must fall in a configured range."""

    NAME = "Source has allowed number of children"
    ALIAS = "check_source_childs"
    DESCRIPTION = "Check the source has a specific number (max/min) of childs"
    REASON_TO_FLAG = "The source has a number of childs that is not in the valid range"
    # Config keys for the lower/upper bounds on the child count.
    MIN_CHILDS_STR = "min_childs"
    MAX_CHILDS_STR = "max_childs"

    def _build_failure_result(
        self,
        node_id: str,
        min_childs: int,
        max_childs: int,
    ) -> DBTInsightResult:
        """
        Build failure result for the insight if a source has a specific number (max/min) of childs
        """
        failure_message = f"The source:{node_id} has a number of childs that is not in the valid range:\n"
        failure_message += f"Min childs: {min_childs}\n"
        failure_message += f"Max childs: {max_childs}\n"

        recommendation = "Update the source to adhere to the valid range of childs."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": node_id, "min_childs": min_childs, "max_childs": max_childs},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate a list of InsightResponse objects for each source in the DBT project,
        Check the source has a specific number (max/min) of childs
        The min and max number of childs is in the configuration file.
        """
        insights = []
        self.min_childs = self.get_check_config(self.MIN_CHILDS_STR)
        self.max_childs = self.get_check_config(self.MAX_CHILDS_STR)
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                if not self.valid_childs(node_id):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, min_childs=self.min_childs, max_childs=self.max_childs),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def valid_childs(self, source_unique_id: str) -> bool:
        """
        Check if the source has a specific number (max/min) of childs.
        An unset (falsy) bound is not enforced.
        """
        source_childs = self.children_map.get(source_unique_id, [])
        if self.min_childs and len(source_childs) < self.min_childs:
            return False
        if self.max_childs and len(source_childs) > self.max_childs:
            return False
        return True

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.MAX_CHILDS_STR: {
                    "type": "integer",
                    "description": "The maximum number of childs a model can have.",
                },
                cls.MIN_CHILDS_STR: {
                    "type": "integer",
                    # BUG FIX: default was the string "0", which contradicts
                    # the declared "integer" type; use an integer default.
                    "default": 0,
                },
            },
        }
        # Preserve the original description for the minimum-childs property.
        config_schema["config"]["properties"][cls.MIN_CHILDS_STR]["description"] = "The minimum number of childs a model can have."
        return config_schema
|
@@ -0,0 +1,96 @@
|
|
1
|
+
from typing import List
|
2
|
+
from typing import Sequence
|
3
|
+
from typing import Set
|
4
|
+
from typing import Tuple
|
5
|
+
|
6
|
+
from datapilot.core.insights.utils import get_severity
|
7
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
8
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
9
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
10
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
11
|
+
from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
|
12
|
+
from datapilot.utils.formatting.utils import numbered_list
|
13
|
+
|
14
|
+
|
15
|
+
class CheckSourceColumnsHaveDescriptions(ChecksInsight):
    """Insight: every column declared on a source must carry a description."""

    NAME = "Source columns have descriptions"
    ALIAS = "check_source_columns_have_desc"
    DESCRIPTION = "Ensures that the source has columns with descriptions in the properties file (usually schema.yml)."
    REASON_TO_FLAG = "Missing descriptions for columns in the source can lead to confusion and inconsistency in analysis. "

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        # Catalog wrapper is kept for parity with other catalog-backed checks;
        # this check itself reads column descriptions from the manifest nodes.
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def _build_failure_result(self, model_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """
        Build failure result for the insight if a source has columns without descriptions.
        """
        failure_message = f"The source:{model_unique_id} has columns without descriptions:\n"
        failure_message += numbered_list(columns)

        recommendation = "Update the source to include descriptions for all columns."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": model_unique_id, "columns": columns},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. This method is called by the insight runner to generate the insight
        response for the check.
        Ensures that the source has columns with descriptions in the properties file (usually schema.yml).

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        Returns:
            List[DBTModelInsightResponse]: List of insight responses for the check.
        """
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_columns = self._check_source_columns(node_id)
                if missing_columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            # Pass a list (not a raw set) for stable, serializable
                            # metadata — consistent with the sibling
                            # check_source_has_all_columns insight.
                            insight=self._build_failure_result(node_id, list(missing_columns)),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_columns(self, node_id: str) -> Set[str]:
        """Return the set of column names on the source that lack a description.

        BUG FIX: the annotation previously claimed ``Tuple[int, Set[str]]``
        while the implementation returns a plain set.
        """
        columns_with_missing_descriptions = set()
        for column_name, column_node in self.get_node(node_id).columns.items():
            if not column_node.description:
                columns_with_missing_descriptions.add(column_name)
        return columns_with_missing_descriptions

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check if all required data is available for the insight to run.
        :param has_manifest: A boolean indicating if manifest is available.
        :return: A boolean indicating if all required data is available.
        """
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""
|
@@ -0,0 +1,103 @@
|
|
1
|
+
from typing import ClassVar
|
2
|
+
from typing import List
|
3
|
+
from typing import Sequence
|
4
|
+
from typing import Set
|
5
|
+
from typing import Tuple
|
6
|
+
|
7
|
+
from datapilot.core.insights.utils import get_severity
|
8
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
9
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
10
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
11
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
12
|
+
from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
|
13
|
+
from datapilot.utils.formatting.utils import numbered_list
|
14
|
+
|
15
|
+
|
16
|
+
class CheckSourceHasAllColumns(ChecksInsight):
    """Insight: every column present in the warehouse catalog must also be
    declared on the source in the properties file."""

    NAME = "Source has all columns"
    ALIAS = "check_source_has_all_columns"
    DESCRIPTION = "Ensures that all columns in the database are also specified in the properties file. (usually schema.yml)."
    REASON_TO_FLAG = "Missing columns in the source can lead to confusion and inconsistency in analysis. "
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        # Catalog provides the warehouse-side schema used for the comparison.
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def _build_failure_result(self, source_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """
        Build failure result for the insight if a source has missing columns.
        """
        failure_message = f"The source:{source_unique_id} has missing columns:\n"
        failure_message += numbered_list(columns)

        recommendation = "Update the source to include all columns."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": source_unique_id, "columns": columns},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. This method is called by the insight runner to generate the insight
        response for the check.
        Ensures that the source has all columns in the properties file (usually schema.yml).
        """
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_columns = self._check_source_columns(node_id)
                if missing_columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, list(missing_columns)),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_columns(self, node_id: str) -> Set[str]:
        """
        Check if the source has all columns as defined in the catalog.
        Returns the names of manifest-declared columns that are absent from
        the catalog schema for this source.

        BUG FIX: the loop previously iterated ``columns.items()``, yielding
        (name, node) tuples, so the membership test against the catalog's
        column names never matched and every column was flagged as missing.
        The return annotation also wrongly claimed ``Tuple[int, Set[str]]``.
        """
        missing_columns = set()
        schema = self.catalog.get_schema()
        if node_id not in schema:
            # Source not present in the catalog at all — nothing to compare.
            return missing_columns
        catalog_columns = schema[node_id].keys()
        for col_name in self.get_node(node_id).columns:
            if col_name not in catalog_columns:
                missing_columns.add(col_name)
        return missing_columns

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check if all required data is available for the insight to run.
        :param has_manifest: A boolean indicating if manifest is available.
        :return: A boolean indicating if all required data is available.
        """
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""

    @classmethod
    def requires_catalog(cls) -> bool:
        # This check compares manifest columns against the warehouse catalog.
        return True
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
from datapilot.core.insights.utils import get_severity
|
4
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
5
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
7
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
8
|
+
|
9
|
+
|
10
|
+
class CheckSourceHasFreshness(ChecksInsight):
    """Insight: sources must define the configured freshness keys
    (e.g. error_after / warn_after)."""

    NAME = "Source has freshness options"
    ALIAS = "check_source_has_freshness"
    DESCRIPTION = "Ensures that the source has freshness options"
    REASON_TO_FLAG = "Missing freshness options for the source can lead to confusion and inconsistency in analysis. "
    # Config key holding the list of required freshness option names.
    FRESHNESS_STR = "freshness"

    def _build_failure_result(self, source_id: str, missing_keys) -> DBTInsightResult:
        """
        Build failure result for the insight if a source is missing one or
        more of the required freshness keys.
        """
        missing_keys = ", ".join(missing_keys)
        failure_message = f"The source:{source_id} does not have freshness options defined for the following keys:\n {missing_keys}"

        recommendation = "Define the freshness options for the source to ensure consistency in analysis."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_id": source_id, "missing_keys": missing_keys},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. This method is called by the insight runner to generate the insight
        response for the check.
        Ensures that the source has freshness options
        """
        self.freshness_keys = self.get_check_config(self.FRESHNESS_STR) or []
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_keys = self._check_source_has_freshness(node_id)
                if missing_keys:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, missing_keys),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_has_freshness(self, source_id: str) -> List[str]:
        """Return the required freshness keys that the source does not define.

        BUG FIX: the original returned ``False`` (despite the ``List[str]``
        annotation) when the source had no freshness block at all; being
        falsy, that silently exempted exactly the sources this check exists
        to flag. A missing freshness block now reports every required key
        as missing, which is what the comparison loop yields naturally.
        """
        source = self.get_node(source_id)
        freshness = source.freshness.dict() if source.freshness else {}

        missing_keys = []
        for key in self.freshness_keys:
            if key not in freshness:
                missing_keys.append(key)

        return missing_keys

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.FRESHNESS_STR: {
                    "type": "array",
                    "description": "The freshness options that should be defined for the source. If not provided, all freshness options are allowed.",
                    "items": {
                        "type": "string",
                        "enum": ["error_after", "warn_after"],
                    },
                },
            },
            "required": [cls.FRESHNESS_STR],
        }
        return config_schema
|
@@ -0,0 +1,110 @@
|
|
1
|
+
from typing import List
|
2
|
+
from typing import Sequence
|
3
|
+
from typing import Set
|
4
|
+
from typing import Tuple
|
5
|
+
|
6
|
+
from datapilot.core.insights.utils import get_severity
|
7
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
8
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
9
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
10
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
11
|
+
from datapilot.utils.formatting.utils import numbered_list
|
12
|
+
|
13
|
+
|
14
|
+
class CheckSourceHasLabelsKeys(ChecksInsight):
    """Insight that verifies each source defines the label keys required by the check config."""

    NAME = "Check source has labels keys"
    ALIAS = "check_source_has_labels_keys"
    DESCRIPTION = (
        "Checks that the source has the specified labels keys as defined in the properties file. "
        "Ensuring that the source has the required labels keys helps in maintaining metadata consistency and understanding."
    )
    REASON_TO_FLAG = (
        "Missing labels keys in the source can lead to inconsistency in metadata management and understanding of the source. "
        "It's important to ensure that the source includes all the required labels keys as per the configuration."
    )
    # Config keys read via get_check_config().
    LABEL_KEYS_STR = "labels_keys"
    ALLOW_EXTRA_KEYS_STR = "allow_extra_keys"

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Return one insight per source node whose configured labels do not match the required keys."""
        insights = []
        self.labels_keys = self.get_check_config(self.LABEL_KEYS_STR)
        self.allow_extra_keys = self.get_check_config(self.ALLOW_EXTRA_KEYS_STR)

        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                status_code, missing_keys, extra_keys = self._check_labels_keys(node_id)
                if status_code == 1:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=self._build_failure_result(node_id, missing_keys, extra_keys),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _build_failure_result(self, model_unique_id: str, missing_keys: Sequence[str], extra_keys: Sequence[str]) -> DBTInsightResult:
        """Build the insight result describing missing and/or unexpected label keys.

        BUGFIX: the original interpolated the raw sets via f-strings and then called
        ``str.format`` on the resulting text; the braces in the set repr were parsed
        as replacement fields, raising KeyError whenever the insight fired. The
        messages are now formatted exactly once, using numbered_list for readability.
        """
        failure_message = ""
        if missing_keys:
            failure_message += (
                f"The model `{model_unique_id}` is missing the following labels keys: {numbered_list(missing_keys)}. "
                "Ensure that the model has the required labels keys."
            )
        if extra_keys:
            failure_message += (
                f"The model `{model_unique_id}` has the following extra labels keys: {numbered_list(extra_keys)}. "
                "Ensure that the model does not include any extra labels keys."
            )
        recommendation = (
            f"Add the following labels keys to the model `{model_unique_id}`: {numbered_list(missing_keys)}. "
            "Ensuring that the model has the required labels keys helps in maintaining metadata consistency and understanding."
        )
        return DBTInsightResult(
            failure_message=failure_message,
            recommendation=recommendation,
            metadata={"model_unique_id": model_unique_id, "missing_keys": missing_keys, "extra_keys": extra_keys},
        )

    def _check_labels_keys(self, node_id) -> Tuple[int, Set[str], Set[str]]:
        """Compare the node's configured labels against the required keys.

        Returns (status_code, missing_keys, extra_keys); status_code is 1 when the
        check fails. BUGFIX: missing keys are now computed against the label keys
        extracted from the node config (the same source the extra-key check used),
        instead of iterating the node's scalar ``label`` attribute. The return
        annotation also now matches the three values actually returned.
        """
        status_code = 0
        node = self.get_node(node_id)
        config = node.config.dict() if node.config else {}
        labels = config.get("labels", {})
        label_keys = set(labels.keys())
        missing_keys = set(self.labels_keys) - label_keys
        extra_keys = set()
        if missing_keys:
            status_code = 1
        if not self.allow_extra_keys:
            # Any configured label key not in the required list is "extra".
            extra_keys = label_keys - set(self.labels_keys)
            if extra_keys:
                status_code = 1
        return status_code, missing_keys, extra_keys

    @classmethod
    def get_config_schema(cls):
        """Extend the base schema with this check's label-keys configuration."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.LABEL_KEYS_STR: {
                    "type": "array",
                    "items": {
                        "type": "string",
                    },
                    "description": "A list of label keys that should be present in the source.",
                },
                cls.ALLOW_EXTRA_KEYS_STR: {
                    "type": "boolean",
                    "default": False,
                },
            },
            "required": [cls.LABEL_KEYS_STR],
        }
        return config_schema
|
@@ -0,0 +1,62 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
from datapilot.core.insights.utils import get_severity
|
4
|
+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
|
5
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
|
6
|
+
from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
|
7
|
+
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
|
8
|
+
|
9
|
+
|
10
|
+
class CheckSourceHasLoader(ChecksInsight):
    """Insight that flags source nodes which do not declare a loader."""

    NAME = "Source has loader"
    ALIAS = "check_source_has_loader"
    DESCRIPTION = "Check if the source has a loader"
    REASON_TO_FLAG = "Missing loader for the source can lead to confusion and inconsistency in analysis. "

    def _build_failure_result(self, source_id: str) -> DBTInsightResult:
        """
        Build the failure result for a source that does not define a loader.
        (Annotation fixed: source_id is the string unique id, not an int.)
        """
        failure_message = f"The source:{source_id} does not have a loader defined.\n"

        recommendation = "Define the loader for the source to ensure consistency in analysis."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_id": source_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. This method is called by the insight runner to generate the insight
        response for the check.
        Ensures that the source has a loader option
        """
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping source {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                if not self._check_source_has_loader(node_id):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_has_loader(self, source_unique_id: str) -> bool:
        """Return True when the source node declares a non-empty loader."""
        return bool(self.get_node(source_unique_id).loader)
|