altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,85 @@
1
+ from collections import defaultdict
2
+ from typing import List
3
+
4
+ from datapilot.core.insights.utils import get_severity
5
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTProjectInsightResponse
8
+ from datapilot.core.platforms.dbt.utils import get_table_name_from_source
9
+ from datapilot.utils.formatting.utils import numbered_list
10
+
11
+
12
class DBTDuplicateSources(DBTModellingInsight):
    """
    Flag database locations that are declared by more than one dbt source node.

    Sources are grouped by the table name returned from ``get_table_name_from_source``;
    every table name shared by two or more source IDs produces one insight result.
    Ref: https://github.com/dbt-labs/dbt-project-evaluator/blob/main/models/marts/dag/fct_duplicate_sources.sql
    """

    NAME = "Duplicate sources"
    ALIAS = "Duplicate_Sources"
    DESCRIPTION = "Duplicate sources should be avoided."
    REASON_TO_FLAG = (
        "Having multiple source nodes pointing to the same database location can lead to an inaccurate "
        "representation of data lineage and potential confusion in data management."
    )
    FAILURE_MESSAGE = (
        "Duplicate source nodes detected: Multiple source nodes are referencing the same database object. "
        "Database location {source_table} is referenced by:\n {source_nodes_list}"
    )
    RECOMMENDATION = (
        "Consolidate the duplicate source nodes so that each database location has only a single source definition "
        "in your dbt project. This will help maintain clear and accurate data lineage."
    )

    def _build_failure_result(self, source_table: str, source_ids: List[str]) -> DBTInsightResult:
        """
        Describe one database location that several source nodes point at.

        :param source_table: Table name shared by the duplicate sources.
        :param source_ids: IDs of every source node that resolves to ``source_table``.
        :return: A DBTInsightResult carrying the formatted message, the recommendation and
            the table name / source IDs as metadata.
        """
        self.logger.debug(f"Building failure result for source table {source_table}")

        message = self.FAILURE_MESSAGE.format(
            source_table=source_table,
            source_nodes_list=numbered_list(source_ids),
        )
        # RECOMMENDATION has no placeholders, so format() returns it unchanged.
        recommendation = self.RECOMMENDATION.format(source_table=source_table)
        metadata = {
            "source_table": source_table,
            "source_ids": source_ids,
        }

        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            reason_to_flag=self.REASON_TO_FLAG,
            message=message,
            recommendation=recommendation,
            metadata=metadata,
        )

    def generate(self, *args, **kwargs) -> List[DBTProjectInsightResponse]:
        """
        Look for table names that are referenced by more than one source in the project.

        :return: A single-element list holding one project-level response with all duplicate
            findings, or an empty list when every table name maps to exactly one source.
        """
        self.logger.debug(f"Generating insights for DBTDuplicateSources for project {self.project_name}")

        # Group source IDs under the table name each source resolves to.
        ids_by_table = defaultdict(list)
        for source_id, source in self.sources.items():
            ids_by_table[get_table_name_from_source(source)].append(source_id)

        self.logger.debug(f"source_table_to_id_map: {ids_by_table}")

        # Only tables claimed by two or more sources count as duplicates.
        insight_results = [
            self._build_failure_result(table, ids)
            for table, ids in ids_by_table.items()
            if len(ids) > 1
        ]

        if not insight_results:
            self.logger.debug("No duplicate sources found")
            return []

        self.logger.debug("Duplicate source models found")
        response = DBTProjectInsightResponse(
            package_name=self.project_name,
            insights=insight_results,
            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
        )
        return [response]
@@ -0,0 +1,80 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.constants import SQL
5
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
8
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
9
+ from datapilot.core.platforms.dbt.utils import get_hard_coded_references
10
+ from datapilot.utils.formatting.utils import numbered_list
11
+
12
+
13
class DBTHardCodedReferences(DBTModellingInsight):
    """
    Report SQL models whose raw code contains hard-coded table references.

    Detection itself is delegated to ``get_hard_coded_references``; this class only
    selects the nodes to inspect and wraps the findings in insight responses.
    """

    NAME = "Hard coded references"
    ALIAS = "hard_coded_references"
    DESCRIPTION = "Models should not have hard-coded references to tables"
    REASON_TO_FLAG = (
        "Hard-coded references in SQL prevent easy identification and tracking of data lineage, "
        "and can lead to issues in maintainability and scalability of the data models."
    )
    # NOTE(review): not read anywhere in this class; kept because it is a public attribute.
    SOURCE_FANOUT_THRESHOLD = 1  # Default threshold, can be overridden as needed
    FAILURE_MESSAGE = (
        "Model `{model_unique_id}` contains hard-coded references, which may obscure data lineage. "
        "Detected hard-coded references: \n{hard_coded_references}"
    )
    RECOMMENDATION = (
        "Replace hard-coded references in `{model_unique_id}` with dbt sources or model references to "
        "improve clarity and maintainability of data lineage."
    )

    def _build_failure_result(self, model_unique_id: str, hard_coded_references: List[str]) -> DBTInsightResult:
        """
        Build the insight result for one model that contains hard-coded references.

        :param model_unique_id: Unique ID of the offending model.
        :param hard_coded_references: The references found in the model's raw code.
        :return: A DBTInsightResult with the formatted message, recommendation and metadata.
        """
        metadata = {
            "model": model_unique_id,
            "hard_coded_references": hard_coded_references,
        }
        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            message=self.FAILURE_MESSAGE.format(
                model_unique_id=model_unique_id,
                hard_coded_references=numbered_list(hard_coded_references),
            ),
            recommendation=self.RECOMMENDATION.format(model_unique_id=model_unique_id),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata=metadata,
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Scan every non-skipped SQL model and collect those with hard-coded references.

        :return: One DBTModelInsightResponse per model in which references were found.
        """
        insights = []

        for node in self.nodes.values():
            node_id = node.unique_id

            # The skip check runs first, so it also applies to (and logs for) non-model nodes.
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type != AltimateResourceType.model:
                continue

            # Only models that have raw code and are written in SQL are analysed.
            raw_code = node.raw_code
            if (not raw_code) or node.language != SQL:
                continue

            references = get_hard_coded_references(raw_code)
            if not references:
                continue

            result = self._build_failure_result(
                model_unique_id=node_id,
                hard_coded_references=references,
            )
            response = DBTModelInsightResponse(
                unique_id=node_id,
                package_name=node.package_name,
                path=node.path,
                original_file_path=node.original_file_path,
                insight=result,
                severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
            )
            insights.append(response)

        return insights
@@ -0,0 +1,79 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+
8
+
9
class DBTRejoiningOfUpstreamConcepts(DBTModellingInsight):
    """
    Find "rejoining" patterns in the DAG: a parent has two direct children, one of which
    is also a direct child of the other, and that intermediate child has exactly one
    downstream dependency.
    """

    NAME = "Rejoining of upstream Concepts"
    ALIAS = "rejoining_upstream_concepts"
    DESCRIPTION = (
        "Detects scenarios where a parent's direct child is also a direct child of another one "
        "of the parent's direct children."
    )
    REASON_TO_FLAG = (
        "Flagged to identify cases where a parent model has a direct child that is also a direct child "
        "of another one of the parent's direct children. Such patterns can suggest loops or redundancies in the DAG."
    )
    FAILURE_MESSAGE = (
        "Model `{child}` has a rejoining upstream concept with parent model `{parent_model}` "
        "and downstream child: `{downstream_child}`. This may indicate a loop or redundancy in the DAG."
    )
    RECOMMENDATION = (
        "Review and potentially refactor the model relationships in `{child}`,"
        " `{parent_model}`, and `{downstream_child}` to simplify the DAG and "
        "avoid unnecessary complexity or potential loops."
    )

    def _build_failure_result(self, child: str, parent_model: str, children_list: List[str]) -> DBTInsightResult:
        """
        Build the insight result for one intermediate model that rejoins its parent.

        :param child: ID of the intermediate model.
        :param parent_model: ID of the parent shared by ``child`` and its downstream child.
        :param children_list: Children of ``child``; only the first entry appears in the texts.
        :return: A DBTInsightResult with message, recommendation and metadata.
        """
        # Both texts take the same three placeholders.
        placeholders = {
            "child": child,
            "parent_model": parent_model,
            "downstream_child": children_list[0],
        }
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=self.FAILURE_MESSAGE.format(**placeholders),
            recommendation=self.RECOMMENDATION.format(**placeholders),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": parent_model,
                "children": children_list,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Walk every (parent, child) edge of ``children_map`` and report rejoining children.

        :return: One DBTModelInsightResponse per flagged (parent, child) pair, attributed to
            the child node.
        """
        insights = []

        for parent_model, children in self.children_map.items():
            for child in children:
                grandchildren = self.children_map[child]
                siblings = self.children_map[parent_model]

                # The child must feed exactly one node, and that node must also be a
                # direct child of the parent.
                has_single_child = len(grandchildren) == 1
                rejoins_parent = any(sibling in grandchildren for sibling in siblings)
                if not (rejoins_parent and has_single_child):
                    continue

                insight_result = self._build_failure_result(
                    child=child,
                    parent_model=parent_model,
                    children_list=list(grandchildren),
                )

                child_node = self.get_node(child)
                child_id = child_node.unique_id
                if self.should_skip_model(child_id):
                    self.logger.debug(f"Skipping model {child_id} as it is not enabled for selected models")
                    continue

                insights.append(
                    DBTModelInsightResponse(
                        unique_id=child_id,
                        package_name=child_node.package_name,
                        path=child_node.path,
                        original_file_path=child_node.original_file_path,
                        insight=insight_result,
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,126 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class DBTModelFanout(DBTModellingInsight):
    """
    DBTModelFanout identifies parent models in a dbt project with more than a specified number
    of direct leaf children, indicating a high model fanout.

    A leaf child is a direct child that has no children of its own and is not a test,
    analysis or metric node.
    """

    NAME = "Model fanout analysis"
    ALIAS = "model_fanout"
    DESCRIPTION = "Identifies parent models with an unusually high number of children. "
    REASON_TO_FLAG = (
        "Flagged to highlight parent models with an unusually high number of leaf children. This can suggest areas "
        "in the data pipeline where complexity is increased and transformations might be optimized."
    )
    FANOUT_THRESHOLD = 3  # Default threshold, used when the check config does not set one
    FAILURE_MESSAGE = (
        "Model `{parent_model_unique_id}` has `{leaf_children}` leaf children, "
        "exceeding the fanout threshold of `{fanout_threshold}`. This level of fanout may lead to increased complexity."
    )
    RECOMMENDATION = (
        "Consider reviewing and restructuring `{parent_model_unique_id}` to simplify its dependencies. "
        "Reducing the number of leaf children can lead to a more streamlined and maintainable data pipeline."
    )

    # Key under which the threshold is looked up in the check configuration.
    FANOUT_THRESHOLD_STR = "max_fanout"

    def _build_failure_result(
        self,
        parent_model_unique_id: str,
        leaf_children: List[str],
        fanout_threshold: int,
    ) -> DBTInsightResult:
        """
        Build the insight result for one model whose leaf-children count exceeds the threshold.

        :param parent_model_unique_id: Unique ID of the model being flagged.
        :param leaf_children: IDs of the model's direct leaf children.
        :param fanout_threshold: The threshold that was exceeded (reported in the message).
        :return: A DBTInsightResult with message, recommendation and metadata.
        """
        leaf_count = len(leaf_children)
        self.logger.debug(f"Found {leaf_count} leaf children for {parent_model_unique_id}")

        failure_message = self.FAILURE_MESSAGE.format(
            parent_model_unique_id=parent_model_unique_id,
            leaf_children=leaf_count,
            fanout_threshold=fanout_threshold,
        )
        recommendation = self.RECOMMENDATION.format(
            parent_model_unique_id=parent_model_unique_id,
        )

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": parent_model_unique_id,
                "leaf_children_count": leaf_count,
                "leaf_children": leaf_children,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Flag every non-skipped model whose number of direct leaf children exceeds the threshold.

        The threshold is read from the check config under ``max_fanout`` and falls back to
        ``FANOUT_THRESHOLD`` only when no value is configured.

        :return: One DBTModelInsightResponse per model with too many leaf children.
        """
        # Compare against None rather than relying on truthiness: an explicitly configured
        # threshold of 0 is a valid integer and must not be replaced by the default.
        # NOTE(review): assumes get_check_config returns None for an unset key - confirm.
        configured_threshold = self.get_check_config(self.FANOUT_THRESHOLD_STR)
        fanout_threshold = self.FANOUT_THRESHOLD if configured_threshold is None else configured_threshold

        insights = []
        self.logger.debug(f"Checking for models with fanout greater than {fanout_threshold}")
        for parent, children_set in self.children_map.items():
            if self.should_skip_model(parent):
                self.logger.debug(f"Skipping model {parent} as it is not enabled for selected models")
                continue

            node = self.get_node(parent)
            if node.resource_type != AltimateResourceType.model:
                continue

            # Leaf children: no children of their own, and not a test/analysis/metric node.
            leaf_children = [
                child
                for child in children_set
                if len(self.children_map[child]) == 0
                and self.get_node(child).resource_type
                not in [
                    AltimateResourceType.test,
                    AltimateResourceType.analysis,
                    AltimateResourceType.metric,
                ]
            ]

            if len(leaf_children) > fanout_threshold:
                insight_result = self._build_failure_result(parent, leaf_children, fanout_threshold)
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=parent,
                        package_name=node.package_name,
                        path=node.path,
                        original_file_path=node.original_file_path,
                        insight=insight_result,
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        self.logger.debug(f"Found {len(insights)} models with high fanout")
        return insights

    @classmethod
    def get_config_schema(cls):
        """
        Extend the base config schema with the ``max_fanout`` setting of this insight.

        :return: The configuration schema for the model fanout insight.
        """
        config_schema = super().get_config_schema()

        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.FANOUT_THRESHOLD_STR: {
                    "type": "integer",
                    "description": "The maximum number of direct leaf children a model can have before being flagged.",
                    "default": cls.FANOUT_THRESHOLD,
                },
            },
            "required": [cls.FANOUT_THRESHOLD_STR],
        }
        return config_schema
@@ -0,0 +1,83 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+ from datapilot.utils.formatting.utils import numbered_list
9
+
10
+
11
class DBTModelsMultipleSourcesJoined(DBTModellingInsight):
    """
    Report models that depend directly on two or more source nodes.
    """

    NAME = "Multiple sources joined"
    ALIAS = "multiple_sources_joined"
    DESCRIPTION = "Models should not directly join multiple sources."
    REASON_TO_FLAG = (
        "Best practice is to have a single staging model per source and use this staging model as a "
        "dependency for downstream models. Directly joining multiple sources in a single model can "
        "lead to data management complexities and inconsistencies."
    )
    FAILURE_MESSAGE = (
        "Model `{model_id}` directly uses multiple sources, which may complicate data management and lineage tracking. "
        "Detected sources: \n{sources_list}"
    )
    RECOMMENDATION = (
        "Consider refactoring `{model_id}` to reference a single source or "
        "intermediate models that consolidate these sources. This approach simplifies data lineage"
        " and improves maintainability."
    )

    def _build_failure_result(self, model_id: str, source_dependencies: List[str]) -> DBTInsightResult:
        """
        Build the insight result for one model that reads from several sources.

        :param model_id: Unique ID of the model being flagged.
        :param source_dependencies: IDs of the source nodes the model depends on directly.
        :return: A DBTInsightResult with message, recommendation and metadata.
        """
        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=self.FAILURE_MESSAGE.format(
                model_id=model_id,
                sources_list=numbered_list(source_dependencies),
            ),
            recommendation=self.RECOMMENDATION.format(model_id=model_id),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": model_id,
                "source_dependencies": source_dependencies,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Check the direct dependencies of every non-skipped model for multiple sources.

        :return: One DBTModelInsightResponse per model that depends on more than one source.
        """
        self.logger.debug(f"Generating insights for DBTModelsMultipleSourcesJoined for project {self.manifest.get_package()}")

        insights = []

        for node_id, node in self.nodes.items():
            # The skip check runs before the resource-type filter, so it logs for any node kind.
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if node.resource_type != AltimateResourceType.model:
                continue

            # Keep only those direct dependencies that resolve to source nodes.
            source_dependencies = []
            for dependency_id in node.depends_on.nodes:
                if self.get_node(dependency_id).resource_type == AltimateResourceType.source:
                    source_dependencies.append(dependency_id)

            if len(source_dependencies) <= 1:
                continue

            self.logger.debug(f"Model {node_id} references multiple sources")
            result = self._build_failure_result(node_id, source_dependencies)
            response = DBTModelInsightResponse(
                unique_id=node_id,
                package_name=node.package_name,
                path=node.path,
                original_file_path=node.original_file_path,
                insight=result,
                severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
            )
            insights.append(response)

        return insights
@@ -0,0 +1,82 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class DBTRootModel(DBTModellingInsight):
    """
    DBTRootModel identifies models in a dbt project that have no direct parents,
    i.e. models whose ``depends_on.nodes`` is empty and which therefore cannot be
    traced back to a declared source or model.
    """

    NAME = "Root model traceability"
    ALIAS = "root_model"
    DESCRIPTION = "Identifies models in a dbt project with 0 direct parents, meaning these models cannot be traced back to a declared source or model."
    REASON_TO_FLAG = (
        "Best Practice is to ensure all models can be traced back to a source or another model in the project. "
        "Root models with no direct parents can lead to challenges in tracking data lineage and understanding"
        " the overall data model."
    )
    FAILURE_MESSAGE = (
        "Model `{current_model_unique_id}` is identified as a root model with no direct parents. "
        "This can hinder traceability and clarity in the data model."
    )
    RECOMMENDATION = (
        "Ensure that model `{current_model_unique_id}` is appropriately linked to a source or another model "
        "within the dbt project. This linkage is crucial for maintaining clear data lineage and project coherence."
    )

    def _build_failure_result(self, current_model_unique_id: str) -> DBTInsightResult:
        """
        Build the insight result for a model that has no direct parents.

        :param current_model_unique_id: Unique ID of the root model.
        :return: A DBTInsightResult with the failure message, recommendation and metadata.
        """
        self.logger.debug(f"Building failure result for root model {current_model_unique_id}")

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=self.FAILURE_MESSAGE.format(current_model_unique_id=current_model_unique_id),
            recommendation=self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id),
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"model": current_model_unique_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Collect every non-skipped model whose list of parent nodes is empty.

        :return: One DBTModelInsightResponse per root model found.
        """
        self.logger.debug(f"Generating insights for DBTRootModels for project {self.project_name}")
        insights = []

        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue

            # Only models are considered; depends_on is read only after this check passes.
            if node.resource_type != AltimateResourceType.model:
                continue
            if node.depends_on.nodes:
                continue

            self.logger.debug(f"Found root model {node_id} with no direct parents")
            result = self._build_failure_result(node.unique_id)
            response = DBTModelInsightResponse(
                unique_id=node_id,
                package_name=node.package_name,
                path=node.path,
                original_file_path=node.original_file_path,
                insight=result,
                severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
            )
            insights.append(response)

        self.logger.debug(f"Found {len(insights)} root models")
        return insights
@@ -0,0 +1,102 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
8
+
9
+
10
class DBTSourceFanout(DBTModellingInsight):
    """
    DBTSourceFanout identifies instances where a source is the direct parent of multiple resources in the DAG.

    A source is flagged when its number of direct children is strictly greater than the
    threshold read from the ``max_fanout`` check option, falling back to
    ``SOURCE_FANOUT_THRESHOLD`` when that option is not set.
    """

    NAME = "Source fanout analysis"
    ALIAS = "source_fanout"
    DESCRIPTION = "Identifies sources with a high number of direct children."
    REASON_TO_FLAG = (
        "Identifying sources with high fanout can indicate areas where the data model might be overly complex "
        "or dependent on a single source. Such dependencies can introduce risks and "
        "complicate maintenance and scalability."
    )
    SOURCE_FANOUT_THRESHOLD = 1  # Default threshold, can be overridden as needed
    FAILURE_MESSAGE = (
        "Source `{source_unique_id}` has `{children_count}` direct children, "
        "exceeding the fanout threshold of `{fanout_threshold}`. This level of fanout may lead to increased complexity."
    )
    RECOMMENDATION = (
        "Review the source `{source_unique_id}` to identify opportunities to reduce its direct dependencies. "
        "This can help in simplifying the data model and reducing the risk associated with high source reliance."
    )
    # Name of the check option that overrides SOURCE_FANOUT_THRESHOLD.
    SOURCE_FANOUT_THRESHOLD_STR = "max_fanout"

    def _build_failure_result(self, source_unique_id: str, children_count: int, fanout_threshold: int) -> DBTInsightResult:
        """
        Build the failure result for a source whose direct children exceed the threshold.

        :param source_unique_id: Unique ID of the source being flagged.
        :param children_count: Number of direct children of the source.
        :param fanout_threshold: Threshold the source was compared against.
        :return: A DBTInsightResult carrying the failure message, recommendation and metadata.
        """
        failure_message = self.FAILURE_MESSAGE.format(
            source_unique_id=source_unique_id,
            children_count=children_count,
            fanout_threshold=fanout_threshold,
        )
        recommendation = self.RECOMMENDATION.format(source_unique_id=source_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "source": source_unique_id,
                "direct_children_count": children_count,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """
        Produce one insight response for every source with too many direct children.

        Iterates ``children_map``, ignores entries rejected by ``should_skip_model`` and
        entries that are not sources, and flags sources whose child count is strictly
        greater than the fanout threshold.

        :return: A list of DBTModelInsightResponse objects (possibly empty).
        """
        # Use an explicit None check instead of `or`: a configured threshold of 0 is falsy and
        # would otherwise be silently replaced by the default.
        # NOTE(review): assumes get_check_config returns None when the option is unset - confirm.
        configured_threshold = self.get_check_config(self.SOURCE_FANOUT_THRESHOLD_STR)
        fanout_threshold = self.SOURCE_FANOUT_THRESHOLD if configured_threshold is None else configured_threshold
        insights = []

        for node_id, children_set in self.children_map.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue

            node = self.get_node(node_id)
            if node.resource_type != AltimateResourceType.source:
                continue

            children_count = len(children_set)
            if children_count <= fanout_threshold:
                continue

            insight_result = self._build_failure_result(
                source_unique_id=node_id,
                children_count=children_count,
                fanout_threshold=fanout_threshold,
            )
            insights.append(
                DBTModelInsightResponse(
                    unique_id=node_id,
                    package_name=node.package_name,
                    path=node.path,
                    original_file_path=node.original_file_path,
                    insight=insight_result,
                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                )
            )

        self.logger.debug(f"Found {len(insights)} sources exceeding the fanout threshold")
        return insights

    @classmethod
    def get_config_schema(cls):
        """
        Extend the base configuration schema with the ``max_fanout`` option.

        :return: The configuration schema for the source fanout insight.
        """
        config_schema = super().get_config_schema()

        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.SOURCE_FANOUT_THRESHOLD_STR: {
                    "type": "integer",
                    "description": "The maximum number of direct children a source can have before being flagged.",
                    "default": cls.SOURCE_FANOUT_THRESHOLD,
                },
            },
            "required": [cls.SOURCE_FANOUT_THRESHOLD_STR],
        }
        return config_schema