altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,87 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckModelTags(ChecksInsight):
    """Flags dbt models carrying tags outside a configured allow-list of valid tags."""

    NAME = "Model only has valid tags"
    ALIAS = "check_model_tags"
    DESCRIPTION = "Ensures that the model has only valid tags from the provided list."
    REASON_TO_FLAG = "The model has tags that are not in the valid tags list"
    # Config key under which the allow-list of tags is provided.
    TAGS_LIST_STR = "tag_list"

    def _build_failure_result(
        self,
        node_id: str,
        tags: List[str],
    ) -> DBTInsightResult:
        """
        Build the failure result reported when a model's tags are not all in the
        configured tag list.

        :param node_id: Unique id of the offending model node.
        :param tags: The model's actual tags (recorded in the result metadata).
        """

        failure_message = f"The model:{node_id}'s tags are not in the provided tag list:\n"

        recommendation = "Update the model's tags to adhere to the provided tag list."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"tags": tags},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate an insight response for every model in the project whose tags are
        not all contained in the configured tag list (read from the check config).
        """
        insights = []
        self.tag_list = self.get_check_config(self.TAGS_LIST_STR)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.model:
                if not self.valid_tag(node.config.tags):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, node.config.tags),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def valid_tag(self, tags: List[str]) -> bool:
        """
        Return True when every tag in ``tags`` is in the configured tag list.
        An empty/unset tag list means all tags are allowed.
        """
        if not self.tag_list:
            return True
        return all(tag in self.tag_list for tag in tags)

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration options."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.TAGS_LIST_STR: {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of allowed tags for the model. If not provided, all tags are allowed.",
                    "default": [],
                },
            },
        }
        # BUG FIX: the schema dict was previously built but never returned,
        # so this classmethod implicitly returned None (siblings return it).
        return config_schema
@@ -0,0 +1,97 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckSourceChilds(ChecksInsight):
    """Flags sources whose number of direct children is outside a configured min/max range."""

    NAME = "Source has allowed number of children"
    ALIAS = "check_source_childs"
    DESCRIPTION = "Check the source has a specific number (max/min) of childs"
    REASON_TO_FLAG = "The source has a number of childs that is not in the valid range"
    # Config keys for the lower/upper bounds on the child count.
    MIN_CHILDS_STR = "min_childs"
    MAX_CHILDS_STR = "max_childs"

    def _build_failure_result(
        self,
        node_id: str,
        min_childs: int,
        max_childs: int,
    ) -> DBTInsightResult:
        """
        Build the failure result reported when a source's child count is outside
        the configured [min_childs, max_childs] range.
        """
        failure_message = f"The source:{node_id} has a number of childs that is not in the valid range:\n"
        failure_message += f"Min childs: {min_childs}\n"
        failure_message += f"Max childs: {max_childs}\n"

        recommendation = "Update the source to adhere to the valid range of childs."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": node_id, "min_childs": min_childs, "max_childs": max_childs},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate an insight response for every source whose number of children is
        outside the min/max bounds read from the check configuration.
        """
        insights = []
        self.min_childs = self.get_check_config(self.MIN_CHILDS_STR)
        self.max_childs = self.get_check_config(self.MAX_CHILDS_STR)
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                if not self.valid_childs(node_id):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, min_childs=self.min_childs, max_childs=self.max_childs),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def valid_childs(self, source_unique_id: str) -> bool:
        """
        Return True when the source's child count satisfies the configured bounds.
        A falsy bound (None or 0) disables that side of the range check.
        """
        source_childs = self.children_map.get(source_unique_id, [])
        if self.min_childs and len(source_childs) < self.min_childs:
            return False
        if self.max_childs and len(source_childs) > self.max_childs:
            return False
        return True

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration options."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.MAX_CHILDS_STR: {
                    "type": "integer",
                    "description": "The maximum number of childs a model can have.",
                },
                cls.MIN_CHILDS_STR: {
                    "type": "integer",
                    "description": "The minimum number of childs a model can have.",
                    # BUG FIX: default was the string "0", which violates the
                    # declared "type": "integer" for this property.
                    "default": 0,
                },
            },
        }
        return config_schema
@@ -0,0 +1,96 @@
1
+ from typing import List
2
+ from typing import Sequence
3
+ from typing import Set
4
+ from typing import Tuple
5
+
6
+ from datapilot.core.insights.utils import get_severity
7
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
8
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
9
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
10
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
11
+ from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
12
+ from datapilot.utils.formatting.utils import numbered_list
13
+
14
+
15
class CheckSourceColumnsHaveDescriptions(ChecksInsight):
    """Flags sources that define columns without descriptions in the properties file."""

    NAME = "Source columns have descriptions"
    ALIAS = "check_source_columns_have_desc"
    DESCRIPTION = "Ensures that the source has columns with descriptions in the properties file (usually schema.yml)."
    REASON_TO_FLAG = "Missing descriptions for columns in the source can lead to confusion and inconsistency in analysis. "

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        # Catalog wrapper is stored for parity with the other catalog-requiring
        # checks; the column descriptions themselves come from the manifest.
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def _build_failure_result(self, model_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """
        Build the failure result reported when a source has columns without descriptions.

        :param model_unique_id: Unique id of the offending source node.
        :param columns: Names of the columns missing a description.
        """
        failure_message = f"The source:{model_unique_id} has columns without descriptions:\n"
        failure_message += numbered_list(columns)

        recommendation = "Update the source to include descriptions for all columns."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": model_unique_id, "columns": columns},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. Called by the insight runner.
        Flags every source that has at least one column without a description in
        the properties file (usually schema.yml).

        Returns:
            List[DBTModelInsightResponse]: List of insight responses for the check.
        """
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_columns = self._check_source_columns(node_id)
                if missing_columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            # Materialize the set into a list for stable, serializable
                            # metadata — same as CheckSourceHasAllColumns.generate.
                            insight=self._build_failure_result(node_id, list(missing_columns)),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_columns(self, node_id) -> Set[str]:
        """
        Return the set of column names on the node that have no description.

        BUG FIX: the return annotation previously claimed ``Tuple[int, Set[str]]``
        although a bare set has always been returned.
        """
        columns_with_missing_descriptions = set()
        for column_name, column_node in self.get_node(node_id).columns.items():
            if not column_node.description:
                columns_with_missing_descriptions.add(column_name)
        return columns_with_missing_descriptions

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check if all required data is available for the insight to run.

        :param has_manifest: A boolean indicating if manifest is available.
        :param has_catalog: A boolean indicating if catalog is available.
        :return: (ok, reason) — ok is False with an explanatory message when a
                 required artifact is missing.
        """
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""
@@ -0,0 +1,103 @@
1
+ from typing import ClassVar
2
+ from typing import List
3
+ from typing import Sequence
4
+ from typing import Set
5
+ from typing import Tuple
6
+
7
+ from datapilot.core.insights.utils import get_severity
8
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
9
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
10
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
11
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
12
+ from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
13
+ from datapilot.utils.formatting.utils import numbered_list
14
+
15
+
16
class CheckSourceHasAllColumns(ChecksInsight):
    """Flags sources whose properties file does not list every column present in the database catalog."""

    NAME = "Source has all columns"
    ALIAS = "check_source_has_all_columns"
    DESCRIPTION = "Ensures that all columns in the database are also specified in the properties file. (usually schema.yml)."
    REASON_TO_FLAG = "Missing columns in the source can lead to confusion and inconsistency in analysis. "
    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]

    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
        # The catalog provides the database-side column list to compare against.
        self.catalog = catalog_wrapper
        super().__init__(*args, **kwargs)

    def _build_failure_result(self, source_unique_id: str, columns: Sequence[str]) -> DBTInsightResult:
        """
        Build the failure result reported when a source is missing columns.

        :param source_unique_id: Unique id of the offending source node.
        :param columns: Names of the missing columns.
        """
        failure_message = f"The source:{source_unique_id} has missing columns:\n"
        failure_message += numbered_list(columns)

        recommendation = "Update the source to include all columns."
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_unique_id": source_unique_id, "columns": columns},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. Called by the insight runner.
        Flags every source whose properties file (usually schema.yml) does not
        declare all of the columns found in the catalog.
        """
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_columns = self._check_source_columns(node_id)
                if missing_columns:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, list(missing_columns)),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_columns(self, node_id) -> Set[str]:
        """
        Return the catalog columns that are not declared on the source node.

        Nodes absent from the catalog yield an empty set (nothing to compare).

        BUG FIX: the original iterated ``columns.items()``, binding (name, node)
        tuples to ``col_name`` so the membership test could never match; per the
        class DESCRIPTION the check flags database (catalog) columns missing from
        the properties file. The return annotation also wrongly claimed
        ``Tuple[int, Set[str]]``.
        """
        missing_columns = set()
        schema = self.catalog.get_schema()
        if node_id not in schema:
            return missing_columns
        catalog_columns = schema[node_id].keys()
        defined_columns = self.get_node(node_id).columns
        for col_name in catalog_columns:
            if col_name not in defined_columns:
                missing_columns.add(col_name)
        return missing_columns

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check if all required data is available for the insight to run.

        :param has_manifest: A boolean indicating if manifest is available.
        :param has_catalog: A boolean indicating if catalog is available.
        :return: (ok, reason) — ok is False with an explanatory message when a
                 required artifact is missing.
        """
        if not has_manifest:
            return False, "Manifest is required for insight to run."

        if not has_catalog:
            return False, "Catalog is required for insight to run."

        return True, ""

    @classmethod
    def requires_catalog(cls) -> bool:
        """This check needs the catalog artifact to know the database columns."""
        return True
@@ -0,0 +1,94 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckSourceHasFreshness(ChecksInsight):
    """Flags sources missing the freshness keys required by the check configuration."""

    NAME = "Source has freshness options"
    ALIAS = "check_source_has_freshness"
    DESCRIPTION = "Ensures that the source has freshness options"
    REASON_TO_FLAG = "Missing freshness options for the source can lead to confusion and inconsistency in analysis. "
    # Config key holding the list of required freshness keys (e.g. error_after).
    FRESHNESS_STR = "freshness"

    def _build_failure_result(self, source_id: str, missing_keys: List[str]) -> DBTInsightResult:
        """
        Build the failure result reported when a source lacks required freshness keys.

        :param source_id: Unique id of the offending source node.
        :param missing_keys: Freshness keys that are not defined on the source.
        """
        missing_keys = ", ".join(missing_keys)
        failure_message = f"The source:{source_id} does not have freshness options defined for the following keys:\n {missing_keys}"

        recommendation = "Define the freshness options for the source to ensure consistency in analysis."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_id": source_id, "missing_keys": missing_keys},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. Called by the insight runner.
        Flags every source that does not define all of the configured freshness keys.
        """
        self.freshness_keys = self.get_check_config(self.FRESHNESS_STR) or []
        insights = []
        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                missing_keys = self._check_source_has_freshness(node_id)
                if missing_keys:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id, missing_keys),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_has_freshness(self, source_id: str) -> List[str]:
        """
        Return the configured freshness keys that are absent from the source's
        freshness settings.
        """
        source = self.get_node(source_id)
        freshness = source.freshness.dict() if source.freshness else {}

        if not freshness:
            # BUG FIX: previously returned False, contradicting the List[str]
            # annotation; an empty list preserves the falsy behavior.
            # NOTE(review): this means a source with NO freshness at all is never
            # flagged, while one with partial freshness is — confirm intended.
            return []

        missing_keys = []
        for key in self.freshness_keys:
            if key not in freshness:
                missing_keys.append(key)

        return missing_keys

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration options."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.FRESHNESS_STR: {
                    "type": "array",
                    "description": "The freshness options that should be defined for the source. If not provided, all freshness options are allowed.",
                    "items": {
                        "type": "string",
                        "enum": ["error_after", "warn_after"],
                    },
                },
            },
            "required": [cls.FRESHNESS_STR],
        }
        return config_schema
@@ -0,0 +1,110 @@
1
+ from typing import List
2
+ from typing import Sequence
3
+ from typing import Set
4
+ from typing import Tuple
5
+
6
+ from datapilot.core.insights.utils import get_severity
7
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
8
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
9
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
10
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
11
+ from datapilot.utils.formatting.utils import numbered_list
12
+
13
+
14
class CheckSourceHasLabelsKeys(ChecksInsight):
    """Flags sources whose config labels do not match the configured set of required label keys."""

    NAME = "Check source has labels keys"
    ALIAS = "check_source_has_labels_keys"
    DESCRIPTION = (
        "Checks that the source has the specified labels keys as defined in the properties file. "
        "Ensuring that the source has the required labels keys helps in maintaining metadata consistency and understanding."
    )
    REASON_TO_FLAG = (
        "Missing labels keys in the source can lead to inconsistency in metadata management and understanding of the source. "
        "It's important to ensure that the source includes all the required labels keys as per the configuration."
    )
    # Config keys: required label keys and whether extra keys are tolerated.
    LABEL_KEYS_STR = "labels_keys"
    ALLOW_EXTRA_KEYS_STR = "allow_extra_keys"

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate an insight response for every source whose label keys deviate
        from the configured required set.
        """
        insights = []
        self.labels_keys = self.get_check_config(self.LABEL_KEYS_STR)
        self.allow_extra_keys = self.get_check_config(self.ALLOW_EXTRA_KEYS_STR)
        if not self.labels_keys:
            # Robustness: without a configured key list there is nothing to check
            # (and set(None) in _check_labels_keys would raise TypeError).
            self.logger.debug(f"Skipping {self.ALIAS} as no labels keys are configured")
            return insights

        for node_id, node in self.sources.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                status_code, missing_keys, extra_keys = self._check_labels_keys(node_id)
                if status_code == 1:
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.original_file_path,
                            original_file_path=node.original_file_path,
                            insight=self._build_failure_result(node_id, missing_keys, extra_keys),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _build_failure_result(self, model_unique_id: str, missing_keys: Sequence[str], extra_keys: Sequence[str]) -> DBTInsightResult:
        """
        Build the failure result describing missing and/or extra label keys.
        """
        failure_message = ""
        if missing_keys:
            failure_message += (
                f"The model `{model_unique_id}` is missing the following labels keys: {missing_keys}. "
                "Ensure that the model has the required labels keys."
            )
        if extra_keys:
            failure_message += (
                f"The model `{model_unique_id}` has the following extra labels keys: {extra_keys}. "
                "Ensure that the model does not include any extra labels keys."
            )
        recommendation = (
            "Add the following labels keys to the model `{model_unique_id}`: {missing_keys}. "
            "Ensuring that the model has the required labels keys helps in maintaining metadata consistency and understanding."
        )
        # BUG FIX: DBTInsightResult was previously built with a ``failure_message=``
        # keyword and without the ``type``/``name``/``message``/``reason_to_flag``
        # fields that every sibling insight supplies.
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation.format(model_unique_id=model_unique_id, missing_keys=numbered_list(missing_keys)),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"model_unique_id": model_unique_id, "missing_keys": missing_keys},
        )

    def _check_labels_keys(self, node_id) -> Tuple[int, Set[str], Set[str]]:
        """
        Compare the node's configured label keys with the required set.

        :return: (status_code, missing_keys, extra_keys) where status_code is 1
                 when the node fails the check.
        """
        status_code = 0
        node = self.get_node(node_id)
        config = node.config.dict() if node.config else {}
        labels = config.get("labels", {})
        label_keys = set(labels.keys())
        # BUG FIX: missing keys were previously computed from ``node.label`` — a
        # scalar attribute whose set() iterates its characters — instead of the
        # label keys actually present on the node's config.
        missing_keys = set(self.labels_keys) - label_keys
        extra_keys = set()
        if missing_keys:
            status_code = 1
        if not self.allow_extra_keys:
            extra_keys = label_keys - set(self.labels_keys)
            if extra_keys:
                status_code = 1
        return status_code, missing_keys, extra_keys

    @classmethod
    def get_config_schema(cls):
        """Return the JSON schema describing this check's configuration options."""
        config_schema = super().get_config_schema()
        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.LABEL_KEYS_STR: {
                    "type": "array",
                    "items": {
                        "type": "string",
                    },
                    "description": "A list of meta keys that should be present in the model.",
                },
                cls.ALLOW_EXTRA_KEYS_STR: {
                    "type": "boolean",
                    "default": False,
                },
            },
            "required": [cls.LABEL_KEYS_STR],
        }
        return config_schema
@@ -0,0 +1,62 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class CheckSourceHasLoader(ChecksInsight):
    """Insight that flags dbt sources which do not declare a ``loader``."""

    NAME = "Source has loader"
    ALIAS = "check_source_has_loader"
    DESCRIPTION = "Check if the source has a loader"
    REASON_TO_FLAG = "Missing loader for the source can lead to confusion and inconsistency in analysis. "

    # Fix: source_id is a manifest unique_id string (the dict key iterated in
    # generate()), not an int as previously annotated.
    def _build_failure_result(self, source_id: str) -> DBTInsightResult:
        """
        Build the failure result for a source that has no loader defined.

        (The previous docstring was copy-pasted from a schema whitelist/
        blacklist insight and did not describe this check.)
        """
        failure_message = f"The source:{source_id} does not have a loader defined.\n"

        recommendation = "Define the loader for the source to ensure consistency in analysis."

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source_id": source_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Generate the insight response for the check. This method is called by
        the insight runner to generate the insight response for the check.
        Ensures that the source has a loader option.
        """
        insights = []
        for node_id, node in self.sources.items():
            # Honor model selection: skip sources excluded from this run.
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping source {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type == AltimateResourceType.source:
                if not self._check_source_has_loader(node_id):
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            original_file_path=node.original_file_path,
                            path=node.original_file_path,
                            insight=self._build_failure_result(node_id),
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )
        return insights

    def _check_source_has_loader(self, source_unique_id: str) -> bool:
        """Return True when the source declares a non-empty ``loader``."""
        # Idiom fix: collapse the if/return-False/return-True ladder.
        return bool(self.get_node(source_unique_id).loader)