altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,133 @@
1
from abc import abstractmethod
from typing import Any
from typing import ClassVar
from typing import Dict
from typing import List
from typing import Union

from datapilot.config.utils import get_insight_config
from datapilot.core.insights.base.insight import Insight
from datapilot.core.insights.schema import Severity
from datapilot.core.platforms.dbt.constants import NON_MATERIALIZED
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestExposureNode
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestMacroNode
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestNode
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestSourceNode
from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestTestNode
from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
from datapilot.core.platforms.dbt.schemas.manifest import AltimateSeedNode
from datapilot.core.platforms.dbt.wrappers.manifest.wrapper import BaseManifestWrapper
19
+
20
+
21
class DBTInsight(Insight):
    """Base class for dbt-backed insights.

    Holds the parsed manifest entities (models, sources, exposures, tests,
    seeds, macros), the child-lineage map and model-selection filters, and
    provides the shared lookup/filter helpers used by concrete insights.
    """

    DEFAULT_SEVERITY = Severity.ERROR
    FILES_REQUIRED: ClassVar = ["Manifest"]

    def __init__(
        self,
        manifest_wrapper: BaseManifestWrapper,
        nodes: Dict[str, AltimateManifestNode],
        sources: Dict[str, AltimateManifestSourceNode],
        exposures: Dict[str, AltimateManifestExposureNode],
        tests: Dict[str, AltimateManifestTestNode],
        seeds: Dict[str, AltimateSeedNode],
        macros: Dict[str, AltimateManifestMacroNode],
        children_map: Dict[str, List[str]],
        project_name: str,
        selected_models: Union[List[str], None] = None,
        excluded_models: Union[List[str], None] = None,
        *args,
        **kwargs,
    ):
        self.manifest = manifest_wrapper
        self.nodes = nodes
        self.macros = macros or {}
        self.sources = sources
        self.exposures = exposures
        self.tests = tests
        self.seeds = seeds
        self.children_map = children_map
        self.project_name = project_name
        self.selected_models = selected_models
        # NOTE(review): stored but never consulted by should_skip_model below;
        # confirm whether exclusion filtering was meant to happen in this class.
        self.excluded_models = excluded_models
        super().__init__(*args, **kwargs)

    @abstractmethod
    def generate(self, *args, **kwargs) -> Dict:
        """Produce the insight output; implemented by subclasses."""

    def check_part_of_project(self, node_project_name: str) -> bool:
        """Return True when the node belongs to the current dbt project."""
        return node_project_name == self.project_name

    def get_node(
        self, node_id: str
    ) -> Union[
        AltimateManifestNode,
        AltimateManifestSourceNode,
        AltimateManifestExposureNode,
        AltimateManifestTestNode,
        AltimateManifestMacroNode,
        None,
    ]:
        """Look up a manifest entity by unique id.

        Searches nodes, sources, exposures, tests, macros and seeds (in that
        order) and returns None -- after a debug log -- when the id is unknown.
        """
        for entities in (self.nodes, self.sources, self.exposures, self.tests, self.macros, self.seeds):
            if node_id in entities:
                return entities[node_id]
        self.logger.debug(f"Model {node_id} not found in manifest")
        return None

    def find_long_chains(self, min_chain_length=4):
        """
        Find chains of nodes with 'materialized' set to 'view' or 'ephemeral' of a given minimum length.

        :param min_chain_length: Minimum length of the chain to be found.
        :return: A list of chains, where each chain is a list of node IDs.
        """

        def is_not_materialized(node: Union[AltimateManifestNode, AltimateManifestSourceNode, None]) -> bool:
            # Fix: guard against None so an unknown parent id (get_node logs
            # and returns None) terminates the chain instead of raising
            # AttributeError. Sources always count as materialized.
            if node is None or node.resource_type == AltimateResourceType.source:
                return False
            return node.config.materialized in NON_MATERIALIZED

        def build_chain(node_id, current_chain):
            # Record the chain as soon as it reaches the minimum length.
            if len(current_chain) >= min_chain_length:
                long_chains.append(current_chain)
                return
            for parent_id in self.get_node(node_id).depends_on.nodes:
                if is_not_materialized(self.get_node(parent_id)):
                    build_chain(parent_id, [*current_chain, parent_id])

        long_chains = []
        for node_id, node in self.nodes.items():
            if is_not_materialized(node):
                build_chain(node_id, [node_id])

        return long_chains

    def should_skip_model(self, model_unique_id):
        """Return True when the model falls outside the selected-models filter.

        When no selection is configured, no model is skipped. (The original
        docstring mentioned the excluded-models list, but only
        ``selected_models`` is consulted here.)
        """
        if self.selected_models:
            return model_unique_id not in self.selected_models

        return False

    @classmethod
    def get_config_schema(cls):
        """Return the JSON-schema description of this insight's configuration."""
        return {
            "name": cls.NAME,
            "alias": cls.ALIAS,
            "type": cls.TYPE,
            "files_required": cls.FILES_REQUIRED,
            "description": cls.DESCRIPTION,
            "config": {"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": {}},
        }

    @classmethod
    def requires_catalog(cls) -> bool:
        # Fix: the original declared a ``cls`` parameter without the
        # @classmethod decorator, so ``cls`` was actually bound to the
        # instance. Declaring it properly keeps instance calls working and
        # additionally allows class-level calls.
        return False

    def get_check_config(self, key: str) -> Any:
        """Fetch a per-check configuration value for this insight's alias."""
        # Fix: the return annotation was ``any`` (the builtin function),
        # not ``typing.Any``.
        return get_insight_config(self.config, self.ALIAS, key)
@@ -0,0 +1,26 @@
1
+ from abc import abstractmethod
2
+ from typing import ClassVar
3
+ from typing import Tuple
4
+
5
+ from datapilot.core.platforms.dbt.insights.base import DBTInsight
6
+
7
+
8
class ChecksInsight(DBTInsight):
    """Common base for the "Checks" family of dbt insights."""

    TYPE = "Checks"
    FILES_REQUIRED: ClassVar = ["Manifest"]

    @abstractmethod
    def generate(self, *args, **kwargs) -> dict:
        """Run the check; implemented by each concrete check class."""

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """Report whether the artifacts needed by this insight are present.

        :param has_manifest: whether a manifest artifact is available.
        :param has_catalog: whether a catalog artifact is available (not
            required by this base class).
        :return: (ok, reason) -- reason is empty when ok is True.
        """
        if has_manifest:
            return True, ""
        return False, "Manifest is required for insight to run."
@@ -0,0 +1,105 @@
1
+ from typing import ClassVar
2
+ from typing import List
3
+
4
+ from datapilot.core.insights.utils import get_severity
5
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
8
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
9
+
10
+
11
class CheckColumnDescAreSame(ChecksInsight):
    """Flags column names that carry different descriptions in different
    models/sources of the project."""

    NAME = "Column descriptions consistent for same column names"
    ALIAS = "column_descriptions_are_same"
    DESCRIPTION = "Column description for the same column name should be same "
    REASON_TO_FLAG = (
        "Different descriptions for the same column names can lead to confusion and hinder effective data "
        "modeling and analysis. It's important to have consistent column descriptions."
    )
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Column names whose description differs somewhere in the project.
        self.columns_with_different_desc = []
        # First-seen description per column name.
        self.columns = {}
        # Column name -> list of node ids that declare it.
        self.column_node_map = {}

    def _build_failure_result(self) -> DBTInsightResult:
        """Build the failure payload listing every column whose description is
        inconsistent across models/sources.

        :return: An instance of InsightResult containing failure message and recommendation.
        """
        header = "The following models or sources have different descriptions for some columns:\n"
        detail_lines = [
            f"- {self.column_node_map[col_name]} (column: {col_name})\n" for col_name in self.columns_with_different_desc
        ]
        failure_message = header + "".join(detail_lines)

        recommendation = "Ensure that the description for the columns is consistent across all instances."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "columns_with_diff_desc": self.columns_with_different_desc,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Scan all models and sources, then emit at most one insight when any
        column name was found with inconsistent descriptions.

        :return: A list of InsightResponse objects.
        """
        insights = []

        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                self._get_columns_with_different_desc(node_id)

        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                self._get_columns_with_different_desc(node_id)

        # NOTE(review): the single response below is attributed to whichever
        # node/source the loops visited last (preserved from the original
        # implementation) -- confirm whether a project-level attribution was
        # intended instead.
        if self.columns_with_different_desc:
            insights.append(
                DBTModelInsightResponse(
                    unique_id=node_id,
                    package_name=node.package_name,
                    path=node.original_file_path,
                    original_file_path=node.original_file_path,
                    insight=self._build_failure_result(),
                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                )
            )

        return insights

    def _get_columns_with_different_desc(self, node_id):
        """Record the columns of *node_id* and flag any column name whose
        description differs from the first description seen for that name.

        :param node_id: The unique ID of the node.
        """
        for column_name, column_node in self.get_node(node_id).columns.items():
            self.column_node_map.setdefault(column_name, []).append(node_id)

            if column_name not in self.columns:
                self.columns[column_name] = column_node.description
            elif column_node.description != self.columns[column_name] and column_name not in self.columns_with_different_desc:
                self.columns_with_different_desc.append(column_name)
@@ -0,0 +1,154 @@
1
+ import re
2
+ from typing import ClassVar
3
+ from typing import List
4
+ from typing import Sequence
5
+ from typing import Tuple
6
+
7
+ from datapilot.core.insights.utils import get_severity
8
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
9
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
10
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
11
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
12
+ from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
13
+ from datapilot.utils.formatting.utils import numbered_list
14
+
15
+
16
class CheckColumnNameContract(ChecksInsight):
    """Checks catalog column names against per-dtype regex contracts."""

    NAME = "Column name follows contract pattern"
    ALIAS = "column_name_contract"
    DESCRIPTION = "Column names should adhere to the contract pattern defined for the data type. "
    REASON_TO_FLAG = (
        "Column names that do not adhere to the contract can lead to confusion and hinder effective data "
        "modeling and analysis. It's important to maintain consistent column naming conventions."
    )
    FAILURE_MESSAGE = (
        "The following columns in the model `{model_unique_id}` do not adhere to the contract:\n{columns}. "
        "Inconsistent column naming conventions can impede understanding and usage of the model."
    )
    RECOMMENDATION = (
        "Update the column names listed above in the model `{model_unique_id}` to adhere to the contract. "
        "Consistent column naming conventions provide valuable context and aids in data understanding and collaboration."
    )
    PATTERN_STR = "pattern"
    DATATYPE_STR = "dtype"
    PATTERNS_LIST_STR = "patterns"
    DEFAULT_PATTERN_STR = "default_pattern"
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Emit one insight per model that has contract-violating column names."""
        self.default_pattern = self.get_check_config(self.DEFAULT_PATTERN_STR)
        datatype_configs = self.get_check_config(self.PATTERNS_LIST_STR)
        # Patterns : [{"pattern": "^[a-z_]+$", "dtype": "string"}, {"pattern": "^[a-z_]+$", "dtype": "string"}]
        if not datatype_configs:
            self.logger.debug(f"Column name contract not found in insight config for {self.ALIAS}. Skipping insight.")
            return []
        # dtype (lowercased) -> regex; entries missing either key are dropped.
        self.patterns = {
            pattern.get(self.DATATYPE_STR).lower(): pattern.get(self.PATTERN_STR)
            for pattern in datatype_configs
            if pattern.get(self.PATTERN_STR) and pattern.get(self.DATATYPE_STR)
        }
        if not self.patterns:
            self.logger.debug(f"Column name contract not found in insight config for {self.ALIAS}")
            return []

        insights = []
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                columns = self._get_columns_with_contract_violation(node_id)
                if columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=self._build_failure_result(node_id, columns),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _build_failure_result(self, model_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """Build the failure payload for one model's violating columns."""
        failure_message = self.FAILURE_MESSAGE.format(
            columns=numbered_list(columns),
            model_unique_id=model_unique_id,
        )
        recommendation = self.RECOMMENDATION.format(model_unique_id=model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"columns": columns, "model_unique_id": model_unique_id},
        )

    def _get_columns_in_model(self, node_id) -> List[str]:
        """List the column names recorded in the catalog for *node_id*."""
        schema = self.catalog.get_schema()
        if node_id not in schema:
            return []
        return list(schema[node_id].keys())

    def _get_columns_with_contract_violation(self, node_id) -> Sequence[str]:
        """Return the columns of *node_id* whose names violate the contract.

        Fix: the original applied ``default_pattern`` to *every* column in
        addition to the dtype-specific pattern, which contradicted the config
        schema's own description of ``default_pattern`` ("if no pattern is
        found for the data type") and could append the same column twice.
        The dtype-specific pattern now takes precedence and the default is
        used only as a fallback. The repeated ``get_schema()`` call is also
        hoisted out of the loop.
        """
        columns = []
        schema = self.catalog.get_schema().get(node_id, {})
        for col, col_type in schema.items():
            col_name = col.lower()
            pattern = self.patterns.get(col_type.lower(), self.default_pattern)
            if pattern and re.match(pattern, col_name, re.IGNORECASE) is None:
                columns.append(col)
        return columns

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """Both the manifest and the catalog are required for this insight."""
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""

    @classmethod
    def get_config_schema(cls):
        """Extend the base schema with the contract-pattern configuration."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.DEFAULT_PATTERN_STR: {
                    "type": "string",
                    "description": "The regex pattern to check the column name against if no pattern is found for the data type",
                    "default": "^[a-z_]+$",
                },
                cls.PATTERNS_LIST_STR: {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            cls.PATTERN_STR: {"type": "string", "description": "The regex pattern to check the column name against"},
                            cls.DATATYPE_STR: {
                                "type": "string",
                                "description": "The data type for which the pattern is defined",
                            },
                        },
                        "required": [cls.PATTERN_STR, cls.DATATYPE_STR],
                    },
                    "description": "A list of patterns to check the column name against for different data types",
                    "default": [],
                },
            },
            "required": [cls.DEFAULT_PATTERN_STR, cls.PATTERNS_LIST_STR],
        }
        config_schema["files_required"] = cls.FILES_REQUIRED
        return config_schema
@@ -0,0 +1,75 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckMacroArgsHaveDesc(ChecksInsight):
    """Flags macros whose arguments are missing descriptions."""

    NAME = "Check macro arguments has description"
    ALIAS = "check_macro_args_have_desc"
    DESCRIPTION = "Macro arguments should have a description. "
    REASON_TO_FLAG = "Clear descriptions for macro arguments are crucial as they prevent misunderstandings, enhance user comprehension, and simplify maintenance. This leads to more accurate data analysis and efficient workflows."

    def _build_failure_result(
        self,
        node_id: str,
    ) -> DBTInsightResult:
        """
        Build failure result for the insight if a macro has arguments without descriptions.

        :return: An instance of InsightResult containing failure message and recommendation.
        """
        # Fix: the original message was copy-pasted from the macro-description
        # check and claimed the *macro* lacked a description, even though this
        # insight flags undocumented *arguments*.
        failure_message = f"The macro `{node_id}` has arguments without a description."
        recommendation = "Add a description to each argument of the macro to help in understanding its purpose."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate a list of InsightResponse objects for each model in the DBT project,
        identifying macros whose arguments don't have descriptions.
        :return: A list of InsightResponse objects.
        """

        insights = []
        for macro_id, macro in self.macros.items():
            if self.should_skip_model(macro_id):
                self.logger.debug(f"Skipping model {macro_id} as it is not enabled for selected models")
                continue
            if macro.resource_type == AltimateResourceType.macro:
                if not self._check_macro_args_have_desc(macro_id):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=macro_id,
                            package_name=macro.package_name,
                            original_file_path=macro.original_file_path,
                            path=macro.original_file_path,
                            insight=self._build_failure_result(macro_id),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights

    def _check_macro_args_have_desc(self, macro_id) -> bool:
        """
        Check if the macro has descriptions for all of its arguments.

        Returns True (i.e. "nothing to flag") when the macro cannot be
        resolved or declares no arguments.
        """
        macro = self.get_node(macro_id)
        if not macro:
            return True
        args = macro.arguments or []
        return all(arg.description for arg in args)
@@ -0,0 +1,63 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckMacroHasDesc(ChecksInsight):
    """Flags macros that are missing a description."""

    NAME = "Macro has documentation"
    ALIAS = "check_macro_has_desc"
    DESCRIPTION = "Macros should be documented."
    REASON_TO_FLAG = "Undocumented macros can cause misunderstandings and inefficiencies in data modeling and analysis, as they make it difficult to understand their purpose and usage. Clear descriptions are vital for accuracy and streamlined workflow."

    def _build_failure_result(
        self,
        node_id: str,
    ) -> DBTInsightResult:
        """Build the failure payload for one undocumented macro.

        :return: An instance of InsightResult containing failure message and recommendation.
        """
        failure_message = f"The macro `{node_id}` does not have a description."
        recommendation = "Add a description to the macro to help in understanding the purpose of the macro."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"macro_unique_id": node_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Walk every macro in the project and emit one insight per macro
        lacking a description.

        :return: A list of InsightResponse objects.
        """
        insights = []
        for macro_id, macro in self.macros.items():
            if self.should_skip_model(macro_id):
                self.logger.debug(f"Skipping model {macro_id} as it is not enabled for selected models")
                continue
            is_undocumented_macro = macro.resource_type == AltimateResourceType.macro and not macro.description
            if is_undocumented_macro:
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=macro_id,
                        package_name=macro.package_name,
                        original_file_path=macro.original_file_path,
                        path=macro.original_file_path,
                        insight=self._build_failure_result(macro_id),
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,96 @@
1
+ from typing import ClassVar
2
+ from typing import List
3
+ from typing import Sequence
4
+ from typing import Set
5
+ from typing import Tuple
6
+
7
+ from datapilot.core.insights.utils import get_severity
8
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
9
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
10
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
11
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
12
+ from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
13
+ from datapilot.utils.formatting.utils import numbered_list
14
+
15
+
16
class CheckModelHasAllColumns(ChecksInsight):
    """
    Insight that flags models whose documented columns disagree with the
    columns recorded in the catalog.

    Requires both a manifest and a catalog to run.
    """

    NAME = "Model has all columns as per catalog"
    ALIAS = "check_model_has_all_columns"
    DESCRIPTION = "Models should have all the columns as per the catalog."
    REASON_TO_FLAG = (
        "Missing columns in the model can lead to data integrity issues and inconsistency in analysis. "
        "It's important to ensure that the model has all the required columns as per the catalog definition."
    )
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        # Catalog access is injected so the insight can be unit-tested with a fake wrapper.
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Scan every model node and report those with columns missing from the catalog.

        :return: A list of DBTModelInsightResponse objects, one per model with
            a column mismatch.
        """
        insights = []
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                missing_columns = self._check_model_columns(node_id)
                if missing_columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=self._build_failure_result(node_id, missing_columns),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _build_failure_result(self, model_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """
        Build the failure payload for a model with missing columns.

        :param model_unique_id: Unique id of the offending model.
        :param columns: Column names present in the model definition but absent
            from the catalog.
        :return: A DBTInsightResult with the formatted message, recommendation,
            and metadata.
        """
        failure_message = (
            "The following columns in the model `{model_unique_id}` are missing:\n{columns}. "
            "Ensure that the model includes all the required columns."
        )
        recommendation = (
            "Add the missing columns listed above in the model `{model_unique_id}`. "
            "Ensuring that the model has all the required columns helps in maintaining data integrity and consistency."
        )

        # Sort for deterministic message ordering and JSON-serializable metadata
        # (the raw value is an unordered set).
        ordered_columns = sorted(columns)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message.format(
                columns=numbered_list(ordered_columns),
                model_unique_id=model_unique_id,
            ),
            recommendation=recommendation.format(model_unique_id=model_unique_id),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"columns": ordered_columns, "model_unique_id": model_unique_id},
        )

    def _check_model_columns(self, node_id) -> Set[str]:
        """
        Return the model's columns that do not appear in the catalog schema.

        NOTE(review): this compares manifest-defined columns against the
        catalog (not the reverse) — columns present in the catalog but
        undocumented in the model are not reported; confirm that matches the
        check's intent.

        :param node_id: Unique id of the model node to inspect.
        :return: Set of column names defined on the model but absent from the
            catalog; empty if the model is not present in the catalog at all.
        """
        schema = self.catalog.get_schema()
        if node_id not in schema:
            # Model absent from catalog entirely — nothing to compare against.
            return set()
        catalog_columns = schema[node_id].keys()
        return {col_name for col_name in self.get_node(node_id).columns.keys() if col_name not in catalog_columns}

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Report whether both artifacts needed by this insight are available.

        :return: ``(True, "")`` when runnable, otherwise ``(False, reason)``.
        """
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""

    @classmethod
    def requires_catalog(cls) -> bool:
        """This insight cannot run without a catalog."""
        return True