altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,103 @@
1
+ from typing import ClassVar
2
+ from typing import List
3
+
4
+ from datapilot.config.utils import get_regex_configuration
5
+ from datapilot.core.insights.utils import get_severity
6
+ from datapilot.core.platforms.dbt.constants import INTERMEDIATE
7
+ from datapilot.core.platforms.dbt.constants import MART
8
+ from datapilot.core.platforms.dbt.constants import STAGING
9
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
10
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
11
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
12
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
13
+ from datapilot.core.platforms.dbt.utils import classify_model_type
14
+ from datapilot.schemas.constants import CONFIG_METRICS
15
+ from datapilot.utils.formatting.utils import numbered_list
16
+
17
+
18
class DBTStagingModelsDependentOnDownstreamModels(DBTModellingInsight):
    """
    Identifies staging models in a dbt project that depend on downstream
    (mart/intermediate) models instead of sources or raw data models.
    """

    NAME = "Staging models dependency check"
    ALIAS = "staging_models_dependency"
    DESCRIPTION = "Staging models should not depend on downstream models."
    REASON_TO_FLAG = (
        "Best practice is for staging models to depend on source or raw data models, not on downstream models. "
        "Dependencies in the wrong direction can lead to complications in data processing and lineage tracing."
    )
    FAILURE_MESSAGE = (
        "Staging model `{current_model_unique_id}` has dependencies on downstream models, "
        "which is against best practices: \n{downstream_dependencies}"
    )
    RECOMMENDATION = (
        "Refactor the staging model `{current_model_unique_id}` to ensure it depends on source or raw data models. "
        "This will align the model with best practices, enhancing data flow clarity and lineage tracing."
    )
    # Config key under which users can override the downstream model types.
    DOWNSTREAM_MODEL_TYPES_STR = "downstream_model_types"
    DOWNSTREAM_MODEL_TYPES: ClassVar[List[str]] = [MART, INTERMEDIATE]

    def _build_failure_result(self, current_model_unique_id: str, downstream_dependencies: List[str]) -> DBTInsightResult:
        """Build the insight result describing one offending staging model.

        :param current_model_unique_id: unique id of the staging model being flagged.
        :param downstream_dependencies: unique ids of the downstream models it depends on.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure = self.FAILURE_MESSAGE.format(
            current_model_unique_id=current_model_unique_id,
            downstream_dependencies=numbered_list(downstream_dependencies),
        )
        recommendation = self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": current_model_unique_id,
                "downstream_dependencies": downstream_dependencies,
            },
        )

    def _get_downstream_models(self) -> List[str]:
        """Return the model types considered "downstream" for this check."""
        metrics_config = self.config.get(CONFIG_METRICS, {})
        metric_config = metrics_config.get(self.ALIAS, {})

        # Return the configured downstream model types or the class default if not specified.
        # (The previous comment incorrectly referred to a "fanout threshold".)
        return metric_config.get(self.DOWNSTREAM_MODEL_TYPES_STR, self.DOWNSTREAM_MODEL_TYPES)

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Scan all nodes and flag staging models that depend on downstream model types.

        :return: one DBTModelInsightResponse per offending staging model.
        """
        insights = []
        downstream_models = self._get_downstream_models()
        regex_configuration = get_regex_configuration(self.config)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if (
                node.resource_type == AltimateResourceType.model
                and classify_model_type(node.name, node.original_file_path, regex_configuration) == STAGING
            ):
                downstream_dependencies = []
                for dependent_node_id in node.depends_on.nodes:
                    # Look the dependency up once instead of twice (the original
                    # called self.get_node() for both .name and .original_file_path).
                    dependent_node = self.get_node(dependent_node_id)
                    if (
                        classify_model_type(
                            dependent_node.name,
                            dependent_node.original_file_path,
                            regex_configuration,
                        )
                        in downstream_models
                    ):
                        downstream_dependencies.append(dependent_node_id)

                if downstream_dependencies:
                    insight_result = self._build_failure_result(node_id, downstream_dependencies)
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.path,
                            original_file_path=node.original_file_path,
                            insight=insight_result,
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights
@@ -0,0 +1,89 @@
1
+ from typing import List
2
+
3
+ from datapilot.config.utils import get_regex_configuration
4
+ from datapilot.core.insights.utils import get_severity
5
+ from datapilot.core.platforms.dbt.constants import STAGING
6
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
8
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
9
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
10
+ from datapilot.core.platforms.dbt.utils import classify_model_type
11
+ from datapilot.utils.formatting.utils import numbered_list
12
+
13
+
14
class DBTStagingModelsDependentOnStagingModels(DBTModellingInsight):
    """
    Identifies staging models in a dbt project that directly depend on
    other staging models instead of sources or raw data models.
    """

    NAME = "Staging models dependency on staging Models"
    ALIAS = "staging_models_on_staging"
    DESCRIPTION = "Staging models should not directly depend on other staging models."
    REASON_TO_FLAG = (
        "Best practice is for staging models to depend on source or raw data models, not on other staging models. "
        "Dependencies among staging models can lead to complicated data flows and hinder data lineage tracking."
    )
    FAILURE_MESSAGE = (
        "Staging model `{current_model_unique_id}` has dependencies on other staging models, "
        "which is against best practices: \n{downstream_dependencies}"
    )
    RECOMMENDATION = (
        "Refactor staging model `{current_model_unique_id}` to ensure it depends on source or raw data models, "
        "not on other staging models. This realignment with best practices promotes clear and effective data flow."
    )

    def _build_failure_result(self, current_model_unique_id: str, downstream_dependencies: List[str]) -> DBTInsightResult:
        """Build the insight result describing one offending staging model.

        :param current_model_unique_id: unique id of the staging model being flagged.
        :param downstream_dependencies: unique ids of the staging models it depends on.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure = self.FAILURE_MESSAGE.format(
            current_model_unique_id=current_model_unique_id,
            downstream_dependencies=numbered_list(downstream_dependencies),
        )
        recommendation = self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": current_model_unique_id,
                "downstream_dependencies": downstream_dependencies,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Scan all nodes and flag staging models that depend on other staging models.

        :return: one DBTModelInsightResponse per offending staging model.
        """
        insights = []
        regex_configuration = get_regex_configuration(self.config)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if (
                node.resource_type == AltimateResourceType.model
                and classify_model_type(node.name, node.original_file_path, regex_configuration) == STAGING
            ):
                downstream_dependencies = []
                for dependent_node_id in node.depends_on.nodes:
                    # Look the dependency up once instead of twice (the original
                    # called self.get_node() for both .name and .original_file_path).
                    dependent_node = self.get_node(dependent_node_id)
                    if (
                        classify_model_type(
                            dependent_node.name,
                            dependent_node.original_file_path,
                            regex_configuration,
                        )
                        == STAGING
                    ):
                        downstream_dependencies.append(dependent_node_id)

                if downstream_dependencies:
                    insight_result = self._build_failure_result(node_id, downstream_dependencies)
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.path,
                            original_file_path=node.original_file_path,
                            insight=insight_result,
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights
@@ -0,0 +1,59 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+
8
+
9
class DBTUnusedSources(DBTModellingInsight):
    """
    Identifies sources in a dbt project that are not referenced by any model.
    """

    NAME = "Unused sources detection"
    ALIAS = "unused_sources"
    DESCRIPTION = "Detects sources in the dbt project that are not being referenced by any models."
    REASON_TO_FLAG = (
        "Unused sources, either defined in YML but not used in any model or leftover from deprecated models, "
        "represent unnecessary complexity in the project. It's important to keep the dbt project lean and relevant."
    )
    FAILURE_MESSAGE = "Source `{source_unique_id}` is not being referenced by any model, indicating it is unused."
    RECOMMENDATION = (
        "Review the source `{source_unique_id}`. Consider removing it or integrating it into the project "
        "if it's needed. Keeping only relevant sources in the project reduces complexity and improves maintainability."
    )

    def _build_failure_result(self, source_unique_id: str) -> DBTInsightResult:
        """Build the insight result for a single unused source.

        :param source_unique_id: unique id of the unused source.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure_message = self.FAILURE_MESSAGE.format(source_unique_id=source_unique_id)
        recommendation = self.RECOMMENDATION.format(source_unique_id=source_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source": source_unique_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Report every source that has no children in the project's dependency graph.

        :return: one DBTModelInsightResponse per unused source.
        """
        insights = []
        for source_id, source in self.sources.items():
            if self.should_skip_model(source_id):
                self.logger.debug(f"Skipping model {source_id} as it is not enabled for selected models")
                continue
            # Idiom fix: test membership on the mapping directly (no `.keys()` needed).
            if source_id not in self.children_map:
                insight_result = self._build_failure_result(source_id)
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=source_id,
                        package_name=source.package_name,
                        path=source.path,
                        original_file_path=source.original_file_path,
                        insight=insight_result,
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,26 @@
1
+ from abc import abstractmethod
2
+ from typing import Tuple
3
+
4
+ from datapilot.core.platforms.dbt.insights.base import DBTInsight
5
+
6
+
7
class DBTPerformanceInsight(DBTInsight):
    """Base class for performance-category dbt insights."""

    TYPE = "Performance"

    def __init__(self, *args, **kwargs):
        # Pass everything straight through to the DBTInsight base class.
        super().__init__(*args, **kwargs)

    @abstractmethod
    def generate(self, *args, **kwargs) -> dict:
        """Produce the insight results; implemented by concrete subclasses."""

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check whether the insight has everything it needs to run.

        :param has_manifest: whether a manifest artifact is available.
        :return: (ok, reason) — reason is empty when ok.
        """
        if has_manifest:
            return True, ""
        return False, "manifest is required for insight to run."
@@ -0,0 +1,92 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.performance.base import DBTPerformanceInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTProjectInsightResponse
7
+ from datapilot.utils.formatting.utils import numbered_list
8
+
9
+
10
class DBTChainViewLinking(DBTPerformanceInsight):
    """
    Flags long chains of view/ephemeral models in the dbt project; models built
    on top of such chains can suffer long runtimes.
    """

    CHAIN_LENGTH_STR = "chain_length"
    NAME = "Chain view linking"
    ALIAS = "chain_view_linking"
    CHAIN_LENGTH = 4  # Default chain length, can be adjusted as needed
    DESCRIPTION = "Checks for long chains of view/ephemeral models in the dbt project. Long chains can lead to slow computation "
    REASON_TO_FLAG = (
        "Long runtime can occur for a model when it is built on top of a long chain of 'non-physically-materialized'"
        " models. Identifying these chains is crucial to optimize performance and reduce computation overhead."
    )
    FAILURE_MESSAGE = (
        "Detected {number_of_chains} chains of views/ephemeral models in your dbt project that are at least {"
        "chain_length} models long. Chains of concern: \n{chain_views}"
    )
    RECOMMENDATION = (
        "Consider altering the materialization strategy of some key upstream models to 'table' or 'incremental'. "
        "This change can reduce computation time, minimize in-memory data processing, and "
        "prevent excessive nesting of views."
    )

    def _build_failure_result(
        self,
        chain_views: List[List[str]],
        chain_length: int = CHAIN_LENGTH,
    ) -> DBTInsightResult:
        """Build the insight result listing each offending chain as `a -> b -> c`.

        :param chain_views: each element is one chain of model unique ids.
        :param chain_length: the threshold reported in the failure message.
        :return: populated DBTInsightResult.
        """
        # Chains are reversed before joining — presumably stored downstream-first;
        # TODO confirm against find_long_chains.
        chains = [" -> ".join(chain_view[::-1]) for chain_view in chain_views]
        failure_message = self.FAILURE_MESSAGE.format(
            number_of_chains=len(chains),
            chain_length=chain_length,
            chain_views=numbered_list(chains),
        )

        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            message=failure_message,
            recommendation=self.RECOMMENDATION,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "chains": chains,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTProjectInsightResponse]:
        """Find view/ephemeral chains of at least the configured length and report them.

        :return: a single project-level insight response, or [] if no chains found.
        """
        chain_length = self.get_check_config(self.CHAIN_LENGTH_STR) or self.CHAIN_LENGTH
        chain_views = self.find_long_chains(chain_length)

        if chain_views:
            # Bug fix: pass the effective chain_length so the failure message reports
            # the configured threshold instead of always the class default.
            insight_result = self._build_failure_result(chain_views, chain_length)

            return [
                DBTProjectInsightResponse(
                    package_name=self.project_name,
                    insights=[insight_result],
                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                )
            ]
        return []

    @classmethod
    def get_config_schema(cls):
        """
        :return: The configuration schema for this insight (adds chain_length).
        """
        config_schema = super().get_config_schema()

        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.CHAIN_LENGTH_STR: {
                    "type": "integer",
                    "description": "The maximum length of the chain of views to be considered.",
                    "default": cls.CHAIN_LENGTH,
                },
            },
            "required": [cls.CHAIN_LENGTH_STR],
        }
        return config_schema
@@ -0,0 +1,104 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.constants import SOURCE
5
+ from datapilot.core.platforms.dbt.insights.performance.base import DBTPerformanceInsight
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
8
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
9
+ from datapilot.utils.formatting.utils import numbered_list
10
+
11
+
12
class DBTExposureParentMaterialization(DBTPerformanceInsight):
    """
    Flags exposures whose parents are raw sources or models that are not
    materialized as 'table' or 'incremental'.
    """

    NAME = "Exposure parent materialization check"
    ALIAS = "exposure_parent_bad_materialization"
    DESCRIPTION = "Exposures should depend on transformed data models or metrics, not raw untransformed sources. "
    REASON_TO_FLAG = (
        "Exposures should depend on transformed data models or metrics, not raw untransformed sources. "
        "Moreover, parent models of exposures, being heavily used in downstream systems, "
        "should be materialized efficiently to ensure performance when queried."
    )
    FAILURE_MESSAGE = (
        "Exposure `{exposure_unique_id}` has parent models with suboptimal materialization types. "
        "This could impact performance and clarity in downstream systems."
    )
    RECOMMENDATION = (
        "Review the parent models of exposure `{exposure_unique_id}`. If using sources, "
        "consider transforming the raw data into a model first. If parent models are views or ephemerals,"
        " evaluate materializing them as tables to enhance query performance."
    )

    def _build_failure_result(
        self,
        exposure_unique_id: str,
        source_parents: List[str],
        bad_materializations: List[str],
    ) -> DBTInsightResult:
        """Build the insight result for one exposure with problematic parents.

        :param exposure_unique_id: unique id of the exposure.
        :param source_parents: parent ids that are raw sources.
        :param bad_materializations: parent model ids not materialized as table/incremental.
        :return: populated DBTInsightResult.
        """
        failure_message = self.FAILURE_MESSAGE.format(
            exposure_unique_id=exposure_unique_id,
        )

        failure_message += f" It has some source models as it's parents:\n {numbered_list(source_parents)}" if source_parents else ""

        failure_message += (
            f" The following parent models are not materialized as table " f"or incremental :\n {numbered_list(bad_materializations)}"
            if bad_materializations
            else ""
        )

        recommendation = self.RECOMMENDATION.format(
            exposure_unique_id=exposure_unique_id,
        )

        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "exposure_unique_id": exposure_unique_id,
                "source_parents": source_parents,
                "bad_materialization_parents": bad_materializations,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Flag exposures whose parents are sources or badly materialized models.

        :return: one DBTModelInsightResponse per offending exposure.
        """
        insights = []

        for exposure_id, exposure in self.exposures.items():
            if self.should_skip_model(exposure_id):
                self.logger.debug(f"Skipping model {exposure_id} as it is not enabled for selected models")
                continue
            bad_materializations = []
            source_parents = []
            for parent_model in exposure.depends_on.nodes:
                if parent_model.split(".")[0] == SOURCE:
                    source_parents.append(parent_model)
                    continue
                node = self.nodes.get(parent_model)
                # Bug fix: guard against a missing parent node *before* touching
                # node.config — the original dereferenced node.config prior to the
                # `if node` check and could raise AttributeError for unknown parents.
                if node is None or node.resource_type != AltimateResourceType.model:
                    continue
                materialization = node.config.materialized if node.config else "not defined"
                if materialization not in ["table", "incremental"]:
                    bad_materializations.append(parent_model)

            if source_parents or bad_materializations:
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=exposure_id,
                        package_name=exposure.package_name,
                        path=exposure.path,
                        original_file_path=exposure.original_file_path,
                        insight=self._build_failure_result(
                            exposure_unique_id=exposure.unique_id,
                            source_parents=source_parents,
                            bad_materializations=bad_materializations,
                        ),
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,72 @@
1
+ from typing import List
2
+ from typing import Optional
3
+
4
+ from datapilot.core.insights.schema import InsightResponse
5
+ from datapilot.core.insights.schema import InsightResult
6
+ from datapilot.core.platforms.dbt.constants import MODEL
7
+ from datapilot.core.platforms.dbt.constants import PROJECT
8
+
9
+ # from src.utils.formatting.utils import get_severity_color, reset_color, bold, underline
10
+
11
+
12
class DBTInsightResult(InsightResult):
    """dbt-specific insight result; currently identical to the generic InsightResult."""

    pass
14
+
15
+
16
class DBTInsightResponse(InsightResponse):
    """dbt-specific insight response; currently identical to the generic InsightResponse."""

    pass
18
+
19
+
20
class DBTModelInsightResponse(DBTInsightResponse):
    """Insight response scoped to a single dbt model (node-level finding)."""

    # Unique id of the node the insight applies to.
    unique_id: str
    # dbt package the node belongs to.
    package_name: str
    path: str
    original_file_path: str
    # Marks this response as a model-level insight (see constants.MODEL).
    insight_level: str = MODEL

    # def get_report(self, do_format=True) -> str:
    #     divider = "-" * 40
    #     report_lines = [
    #         f"{bold('Package Name:', do_format)} {self.package_name}",
    #         f"{bold('Unique ID:', do_format)} {self.unique_id}",
    #         f"{bold('File Path:', do_format)} {self.original_file_path}",
    #         f"{underline('Insight Details:', do_format)}",
    #         f"  {bold('Name:', do_format)} {self.insight.name}",
    #         f"  {bold('Severity:', do_format)} {get_severity_color(self.severity)}{self.severity.value}{reset_color(do_format) }",
    #         f"  {bold('Message:', do_format)} {self.insight.message}",
    #         f"  {bold('Recommendation:', do_format)} {self.insight.recommendation}",
    #         f"  {bold('Reason to Flag:', do_format)} {self.insight.reason_to_flag}",
    #         divider,
    #     ]
    #     return "\n".join(report_lines)
42
+
43
+
44
class DBTProjectInsightResponse(DBTInsightResponse):
    """Insight response scoped to the whole dbt project, aggregating several results."""

    package_name: str
    # Marks this response as a project-level insight (see constants.PROJECT).
    insight_level: str = PROJECT
    # Project-level responses carry a list of results instead of a single one.
    insights: List[DBTInsightResult]
    # NOTE(review): presumably overrides the base class's single `insight` field
    # with None because `insights` is used instead — confirm against InsightResponse.
    insight: Optional[DBTInsightResult] = None
    #
    # def get_report(self, do_format=True) -> str:
    #     divider = "-" * 40
    #     severity_color = get_severity_color(self.severity)
    #     report_lines = [
    #         f"Package Name: {self.package_name}",
    #         f"Insight Level: {self.insight_level}",
    #         divider,
    #     ]
    #
    #     for insight in self.insights:
    #         report_lines.extend(
    #             [
    #                 f"Insight Name: {insight.name}",
    #                 f"Type: {insight.type}",
    #                 f"Severity: {severity_color}{self.severity.value}{reset_color()}",
    #                 f"Message: {insight.message}",
    #                 f"Recommendation: {insight.recommendation}",
    #                 f"Reason to Flag: {insight.reason_to_flag}",
    #                 divider,
    #             ]
    #         )
    #
    #     return "\n".join(report_lines)
@@ -0,0 +1,33 @@
1
+ from abc import abstractmethod
2
+ from typing import Any
3
+ from typing import Dict
4
+ from typing import Optional
5
+ from typing import Tuple
6
+
7
+ from datapilot.core.insights.schema import Severity
8
+ from datapilot.core.platforms.dbt.insights.base import DBTInsight
9
+
10
+
11
+ class DBTStructureInsight(DBTInsight):
12
+ NAME = "DBTStructureInsight"
13
+ TYPE = "structure"
14
+ DEFAULT_SEVERITY = Severity.WARNING
15
+
16
+ def __init__(self, config: Optional[Dict[str, Any]] = None, *args, **kwargs):
17
+ self.config = config or {}
18
+ super().__init__(*args, **kwargs)
19
+
20
+ @abstractmethod
21
+ def generate(self, *args, **kwargs) -> dict:
22
+ pass
23
+
24
+ @classmethod
25
+ def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
26
+ """
27
+ Check if all required data is available for the insight to run.
28
+ :param has_manifest: A boolean indicating if manifest is available.
29
+ :return: A boolean indicating if all required data is available.
30
+ """
31
+ if not has_manifest:
32
+ return False, "manifest is required for insight to run."
33
+ return True, ""