altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,103 @@
1
+ from typing import ClassVar
2
+ from typing import List
3
+
4
+ from datapilot.config.utils import get_regex_configuration
5
+ from datapilot.core.insights.utils import get_severity
6
+ from datapilot.core.platforms.dbt.constants import INTERMEDIATE
7
+ from datapilot.core.platforms.dbt.constants import MART
8
+ from datapilot.core.platforms.dbt.constants import STAGING
9
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
10
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
11
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
12
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
13
+ from datapilot.core.platforms.dbt.utils import classify_model_type
14
+ from datapilot.schemas.constants import CONFIG_METRICS
15
+ from datapilot.utils.formatting.utils import numbered_list
16
+
17
+
18
class DBTStagingModelsDependentOnDownstreamModels(DBTModellingInsight):
    """
    Identifies staging models in a dbt project that depend on downstream
    (mart/intermediate) models instead of sources or raw data models.
    """

    NAME = "Staging models dependency check"
    ALIAS = "staging_models_dependency"
    DESCRIPTION = "Staging models should not depend on downstream models."
    REASON_TO_FLAG = (
        "Best practice is for staging models to depend on source or raw data models, not on downstream models. "
        "Dependencies in the wrong direction can lead to complications in data processing and lineage tracing."
    )
    FAILURE_MESSAGE = (
        "Staging model `{current_model_unique_id}` has dependencies on downstream models, "
        "which is against best practices: \n{downstream_dependencies}"
    )
    RECOMMENDATION = (
        "Refactor the staging model `{current_model_unique_id}` to ensure it depends on source or raw data models. "
        "This will align the model with best practices, enhancing data flow clarity and lineage tracing."
    )
    # Config key under which users can override the downstream model types.
    DOWNSTREAM_MODEL_TYPES_STR = "downstream_model_types"
    DOWNSTREAM_MODEL_TYPES: ClassVar[List[str]] = [MART, INTERMEDIATE]

    def _build_failure_result(self, current_model_unique_id: str, downstream_dependencies: List[str]) -> DBTInsightResult:
        """Build the insight result describing one offending staging model.

        :param current_model_unique_id: unique id of the staging model being flagged.
        :param downstream_dependencies: unique ids of the downstream models it depends on.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure = self.FAILURE_MESSAGE.format(
            current_model_unique_id=current_model_unique_id,
            downstream_dependencies=numbered_list(downstream_dependencies),
        )
        recommendation = self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": current_model_unique_id,
                "downstream_dependencies": downstream_dependencies,
            },
        )

    def _get_downstream_models(self) -> List[str]:
        """Return the model types considered "downstream" for this check."""
        metrics_config = self.config.get(CONFIG_METRICS, {})
        metric_config = metrics_config.get(self.ALIAS, {})

        # Return the configured downstream model types or the class default if not specified.
        # (The previous comment incorrectly referred to a "fanout threshold".)
        return metric_config.get(self.DOWNSTREAM_MODEL_TYPES_STR, self.DOWNSTREAM_MODEL_TYPES)

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Scan all nodes and flag staging models that depend on downstream model types.

        :return: one DBTModelInsightResponse per offending staging model.
        """
        insights = []
        downstream_models = self._get_downstream_models()
        regex_configuration = get_regex_configuration(self.config)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if (
                node.resource_type == AltimateResourceType.model
                and classify_model_type(node.name, node.original_file_path, regex_configuration) == STAGING
            ):
                downstream_dependencies = []
                for dependent_node_id in node.depends_on.nodes:
                    # Look the dependency up once instead of twice (the original
                    # called self.get_node() for both .name and .original_file_path).
                    dependent_node = self.get_node(dependent_node_id)
                    if (
                        classify_model_type(
                            dependent_node.name,
                            dependent_node.original_file_path,
                            regex_configuration,
                        )
                        in downstream_models
                    ):
                        downstream_dependencies.append(dependent_node_id)

                if downstream_dependencies:
                    insight_result = self._build_failure_result(node_id, downstream_dependencies)
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.path,
                            original_file_path=node.original_file_path,
                            insight=insight_result,
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights
@@ -0,0 +1,89 @@
1
+ from typing import List
2
+
3
+ from datapilot.config.utils import get_regex_configuration
4
+ from datapilot.core.insights.utils import get_severity
5
+ from datapilot.core.platforms.dbt.constants import STAGING
6
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
8
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
9
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
10
+ from datapilot.core.platforms.dbt.utils import classify_model_type
11
+ from datapilot.utils.formatting.utils import numbered_list
12
+
13
+
14
class DBTStagingModelsDependentOnStagingModels(DBTModellingInsight):
    """
    Identifies staging models in a dbt project that directly depend on
    other staging models instead of sources or raw data models.
    """

    NAME = "Staging models dependency on staging Models"
    ALIAS = "staging_models_on_staging"
    DESCRIPTION = "Staging models should not directly depend on other staging models."
    REASON_TO_FLAG = (
        "Best practice is for staging models to depend on source or raw data models, not on other staging models. "
        "Dependencies among staging models can lead to complicated data flows and hinder data lineage tracking."
    )
    FAILURE_MESSAGE = (
        "Staging model `{current_model_unique_id}` has dependencies on other staging models, "
        "which is against best practices: \n{downstream_dependencies}"
    )
    RECOMMENDATION = (
        "Refactor staging model `{current_model_unique_id}` to ensure it depends on source or raw data models, "
        "not on other staging models. This realignment with best practices promotes clear and effective data flow."
    )

    def _build_failure_result(self, current_model_unique_id: str, downstream_dependencies: List[str]) -> DBTInsightResult:
        """Build the insight result describing one offending staging model.

        :param current_model_unique_id: unique id of the staging model being flagged.
        :param downstream_dependencies: unique ids of the staging models it depends on.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure = self.FAILURE_MESSAGE.format(
            current_model_unique_id=current_model_unique_id,
            downstream_dependencies=numbered_list(downstream_dependencies),
        )
        recommendation = self.RECOMMENDATION.format(current_model_unique_id=current_model_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "model": current_model_unique_id,
                "downstream_dependencies": downstream_dependencies,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Scan all nodes and flag staging models that depend on other staging models.

        :return: one DBTModelInsightResponse per offending staging model.
        """
        insights = []
        regex_configuration = get_regex_configuration(self.config)
        for node_id, node in self.nodes.items():
            if self.should_skip_model(node_id):
                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
                continue
            if (
                node.resource_type == AltimateResourceType.model
                and classify_model_type(node.name, node.original_file_path, regex_configuration) == STAGING
            ):
                downstream_dependencies = []
                for dependent_node_id in node.depends_on.nodes:
                    # Look the dependency up once instead of twice (the original
                    # called self.get_node() for both .name and .original_file_path).
                    dependent_node = self.get_node(dependent_node_id)
                    if (
                        classify_model_type(
                            dependent_node.name,
                            dependent_node.original_file_path,
                            regex_configuration,
                        )
                        == STAGING
                    ):
                        downstream_dependencies.append(dependent_node_id)

                if downstream_dependencies:
                    insight_result = self._build_failure_result(node_id, downstream_dependencies)
                    insights.append(
                        DBTModelInsightResponse(
                            unique_id=node_id,
                            package_name=node.package_name,
                            path=node.path,
                            original_file_path=node.original_file_path,
                            insight=insight_result,
                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                        )
                    )

        return insights
@@ -0,0 +1,59 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.modelling.base import DBTModellingInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
7
+
8
+
9
class DBTUnusedSources(DBTModellingInsight):
    """
    Identifies sources in a dbt project that are not referenced by any model.
    """

    NAME = "Unused sources detection"
    ALIAS = "unused_sources"
    DESCRIPTION = "Detects sources in the dbt project that are not being referenced by any models."
    REASON_TO_FLAG = (
        "Unused sources, either defined in YML but not used in any model or leftover from deprecated models, "
        "represent unnecessary complexity in the project. It's important to keep the dbt project lean and relevant."
    )
    FAILURE_MESSAGE = "Source `{source_unique_id}` is not being referenced by any model, indicating it is unused."
    RECOMMENDATION = (
        "Review the source `{source_unique_id}`. Consider removing it or integrating it into the project "
        "if it's needed. Keeping only relevant sources in the project reduces complexity and improves maintainability."
    )

    def _build_failure_result(self, source_unique_id: str) -> DBTInsightResult:
        """Build the insight result for a single unused source.

        :param source_unique_id: unique id of the unused source.
        :return: populated DBTInsightResult with message, recommendation and metadata.
        """
        failure_message = self.FAILURE_MESSAGE.format(source_unique_id=source_unique_id)
        recommendation = self.RECOMMENDATION.format(source_unique_id=source_unique_id)

        return DBTInsightResult(
            type=self.TYPE,
            name=self.NAME,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={"source": source_unique_id},
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Report every source that has no children in the project's dependency graph.

        :return: one DBTModelInsightResponse per unused source.
        """
        insights = []
        for source_id, source in self.sources.items():
            if self.should_skip_model(source_id):
                self.logger.debug(f"Skipping model {source_id} as it is not enabled for selected models")
                continue
            # Idiom fix: test membership on the mapping directly (no `.keys()` needed).
            if source_id not in self.children_map:
                insight_result = self._build_failure_result(source_id)
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=source_id,
                        package_name=source.package_name,
                        path=source.path,
                        original_file_path=source.original_file_path,
                        insight=insight_result,
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,26 @@
1
+ from abc import abstractmethod
2
+ from typing import Tuple
3
+
4
+ from datapilot.core.platforms.dbt.insights.base import DBTInsight
5
+
6
+
7
class DBTPerformanceInsight(DBTInsight):
    """Base class for performance-category dbt insights."""

    TYPE = "Performance"

    def __init__(self, *args, **kwargs):
        # Pass everything straight through to the DBTInsight base class.
        super().__init__(*args, **kwargs)

    @abstractmethod
    def generate(self, *args, **kwargs) -> dict:
        """Produce the insight results; implemented by concrete subclasses."""

    @classmethod
    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
        """
        Check whether the insight has everything it needs to run.

        :param has_manifest: whether a manifest artifact is available.
        :return: (ok, reason) — reason is empty when ok.
        """
        if has_manifest:
            return True, ""
        return False, "manifest is required for insight to run."
@@ -0,0 +1,92 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.insights.performance.base import DBTPerformanceInsight
5
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTProjectInsightResponse
7
+ from datapilot.utils.formatting.utils import numbered_list
8
+
9
+
10
class DBTChainViewLinking(DBTPerformanceInsight):
    """
    Flags long chains of view/ephemeral models in the dbt project; models built
    on top of such chains can suffer long runtimes.
    """

    CHAIN_LENGTH_STR = "chain_length"
    NAME = "Chain view linking"
    ALIAS = "chain_view_linking"
    CHAIN_LENGTH = 4  # Default chain length, can be adjusted as needed
    DESCRIPTION = "Checks for long chains of view/ephemeral models in the dbt project. Long chains can lead to slow computation "
    REASON_TO_FLAG = (
        "Long runtime can occur for a model when it is built on top of a long chain of 'non-physically-materialized'"
        " models. Identifying these chains is crucial to optimize performance and reduce computation overhead."
    )
    FAILURE_MESSAGE = (
        "Detected {number_of_chains} chains of views/ephemeral models in your dbt project that are at least {"
        "chain_length} models long. Chains of concern: \n{chain_views}"
    )
    RECOMMENDATION = (
        "Consider altering the materialization strategy of some key upstream models to 'table' or 'incremental'. "
        "This change can reduce computation time, minimize in-memory data processing, and "
        "prevent excessive nesting of views."
    )

    def _build_failure_result(
        self,
        chain_views: List[List[str]],
        chain_length: int = CHAIN_LENGTH,
    ) -> DBTInsightResult:
        """Build the insight result listing each offending chain as `a -> b -> c`.

        :param chain_views: each element is one chain of model unique ids.
        :param chain_length: the threshold reported in the failure message.
        :return: populated DBTInsightResult.
        """
        # Chains are reversed before joining — presumably stored downstream-first;
        # TODO confirm against find_long_chains.
        chains = [" -> ".join(chain_view[::-1]) for chain_view in chain_views]
        failure_message = self.FAILURE_MESSAGE.format(
            number_of_chains=len(chains),
            chain_length=chain_length,
            chain_views=numbered_list(chains),
        )

        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            message=failure_message,
            recommendation=self.RECOMMENDATION,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "chains": chains,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTProjectInsightResponse]:
        """Find view/ephemeral chains of at least the configured length and report them.

        :return: a single project-level insight response, or [] if no chains found.
        """
        chain_length = self.get_check_config(self.CHAIN_LENGTH_STR) or self.CHAIN_LENGTH
        chain_views = self.find_long_chains(chain_length)

        if chain_views:
            # Bug fix: pass the effective chain_length so the failure message reports
            # the configured threshold instead of always the class default.
            insight_result = self._build_failure_result(chain_views, chain_length)

            return [
                DBTProjectInsightResponse(
                    package_name=self.project_name,
                    insights=[insight_result],
                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                )
            ]
        return []

    @classmethod
    def get_config_schema(cls):
        """
        :return: The configuration schema for this insight (adds chain_length).
        """
        config_schema = super().get_config_schema()

        config_schema["config"] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                cls.CHAIN_LENGTH_STR: {
                    "type": "integer",
                    "description": "The maximum length of the chain of views to be considered.",
                    "default": cls.CHAIN_LENGTH,
                },
            },
            "required": [cls.CHAIN_LENGTH_STR],
        }
        return config_schema
@@ -0,0 +1,104 @@
1
+ from typing import List
2
+
3
+ from datapilot.core.insights.utils import get_severity
4
+ from datapilot.core.platforms.dbt.constants import SOURCE
5
+ from datapilot.core.platforms.dbt.insights.performance.base import DBTPerformanceInsight
6
+ from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
7
+ from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
8
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
9
+ from datapilot.utils.formatting.utils import numbered_list
10
+
11
+
12
class DBTExposureParentMaterialization(DBTPerformanceInsight):
    """
    Flags exposures whose parents are raw sources or models that are not
    materialized as 'table' or 'incremental'.
    """

    NAME = "Exposure parent materialization check"
    ALIAS = "exposure_parent_bad_materialization"
    DESCRIPTION = "Exposures should depend on transformed data models or metrics, not raw untransformed sources. "
    REASON_TO_FLAG = (
        "Exposures should depend on transformed data models or metrics, not raw untransformed sources. "
        "Moreover, parent models of exposures, being heavily used in downstream systems, "
        "should be materialized efficiently to ensure performance when queried."
    )
    FAILURE_MESSAGE = (
        "Exposure `{exposure_unique_id}` has parent models with suboptimal materialization types. "
        "This could impact performance and clarity in downstream systems."
    )
    RECOMMENDATION = (
        "Review the parent models of exposure `{exposure_unique_id}`. If using sources, "
        "consider transforming the raw data into a model first. If parent models are views or ephemerals,"
        " evaluate materializing them as tables to enhance query performance."
    )

    def _build_failure_result(
        self,
        exposure_unique_id: str,
        source_parents: List[str],
        bad_materializations: List[str],
    ) -> DBTInsightResult:
        """Build the insight result for one exposure with problematic parents.

        :param exposure_unique_id: unique id of the exposure.
        :param source_parents: parent ids that are raw sources.
        :param bad_materializations: parent model ids not materialized as table/incremental.
        :return: populated DBTInsightResult.
        """
        failure_message = self.FAILURE_MESSAGE.format(
            exposure_unique_id=exposure_unique_id,
        )

        failure_message += f" It has some source models as it's parents:\n {numbered_list(source_parents)}" if source_parents else ""

        failure_message += (
            f" The following parent models are not materialized as table " f"or incremental :\n {numbered_list(bad_materializations)}"
            if bad_materializations
            else ""
        )

        recommendation = self.RECOMMENDATION.format(
            exposure_unique_id=exposure_unique_id,
        )

        return DBTInsightResult(
            name=self.NAME,
            type=self.TYPE,
            message=failure_message,
            recommendation=recommendation,
            reason_to_flag=self.REASON_TO_FLAG,
            metadata={
                "exposure_unique_id": exposure_unique_id,
                "source_parents": source_parents,
                "bad_materialization_parents": bad_materializations,
            },
        )

    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
        """Flag exposures whose parents are sources or badly materialized models.

        :return: one DBTModelInsightResponse per offending exposure.
        """
        insights = []

        for exposure_id, exposure in self.exposures.items():
            if self.should_skip_model(exposure_id):
                self.logger.debug(f"Skipping model {exposure_id} as it is not enabled for selected models")
                continue
            bad_materializations = []
            source_parents = []
            for parent_model in exposure.depends_on.nodes:
                if parent_model.split(".")[0] == SOURCE:
                    source_parents.append(parent_model)
                    continue
                node = self.nodes.get(parent_model)
                # Bug fix: guard against a missing parent node *before* touching
                # node.config — the original dereferenced node.config prior to the
                # `if node` check and could raise AttributeError for unknown parents.
                if node is None or node.resource_type != AltimateResourceType.model:
                    continue
                materialization = node.config.materialized if node.config else "not defined"
                if materialization not in ["table", "incremental"]:
                    bad_materializations.append(parent_model)

            if source_parents or bad_materializations:
                insights.append(
                    DBTModelInsightResponse(
                        unique_id=exposure_id,
                        package_name=exposure.package_name,
                        path=exposure.path,
                        original_file_path=exposure.original_file_path,
                        insight=self._build_failure_result(
                            exposure_unique_id=exposure.unique_id,
                            source_parents=source_parents,
                            bad_materializations=bad_materializations,
                        ),
                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
                    )
                )

        return insights
@@ -0,0 +1,72 @@
1
+ from typing import List
2
+ from typing import Optional
3
+
4
+ from datapilot.core.insights.schema import InsightResponse
5
+ from datapilot.core.insights.schema import InsightResult
6
+ from datapilot.core.platforms.dbt.constants import MODEL
7
+ from datapilot.core.platforms.dbt.constants import PROJECT
8
+
9
+ # from src.utils.formatting.utils import get_severity_color, reset_color, bold, underline
10
+
11
+
12
class DBTInsightResult(InsightResult):
    """dbt-specific insight result; currently identical to the generic InsightResult."""

    pass
14
+
15
+
16
class DBTInsightResponse(InsightResponse):
    """dbt-specific insight response; currently identical to the generic InsightResponse."""

    pass
18
+
19
+
20
class DBTModelInsightResponse(DBTInsightResponse):
    """Insight response scoped to a single dbt model (node-level finding)."""

    # Unique id of the node the insight applies to.
    unique_id: str
    # dbt package the node belongs to.
    package_name: str
    path: str
    original_file_path: str
    # Marks this response as a model-level insight (see constants.MODEL).
    insight_level: str = MODEL

    # def get_report(self, do_format=True) -> str:
    #     divider = "-" * 40
    #     report_lines = [
    #         f"{bold('Package Name:', do_format)} {self.package_name}",
    #         f"{bold('Unique ID:', do_format)} {self.unique_id}",
    #         f"{bold('File Path:', do_format)} {self.original_file_path}",
    #         f"{underline('Insight Details:', do_format)}",
    #         f"  {bold('Name:', do_format)} {self.insight.name}",
    #         f"  {bold('Severity:', do_format)} {get_severity_color(self.severity)}{self.severity.value}{reset_color(do_format) }",
    #         f"  {bold('Message:', do_format)} {self.insight.message}",
    #         f"  {bold('Recommendation:', do_format)} {self.insight.recommendation}",
    #         f"  {bold('Reason to Flag:', do_format)} {self.insight.reason_to_flag}",
    #         divider,
    #     ]
    #     return "\n".join(report_lines)
42
+
43
+
44
class DBTProjectInsightResponse(DBTInsightResponse):
    """Insight response scoped to the whole dbt project, aggregating several results."""

    package_name: str
    # Marks this response as a project-level insight (see constants.PROJECT).
    insight_level: str = PROJECT
    # Project-level responses carry a list of results instead of a single one.
    insights: List[DBTInsightResult]
    # NOTE(review): presumably overrides the base class's single `insight` field
    # with None because `insights` is used instead — confirm against InsightResponse.
    insight: Optional[DBTInsightResult] = None
    #
    # def get_report(self, do_format=True) -> str:
    #     divider = "-" * 40
    #     severity_color = get_severity_color(self.severity)
    #     report_lines = [
    #         f"Package Name: {self.package_name}",
    #         f"Insight Level: {self.insight_level}",
    #         divider,
    #     ]
    #
    #     for insight in self.insights:
    #         report_lines.extend(
    #             [
    #                 f"Insight Name: {insight.name}",
    #                 f"Type: {insight.type}",
    #                 f"Severity: {severity_color}{self.severity.value}{reset_color()}",
    #                 f"Message: {insight.message}",
    #                 f"Recommendation: {insight.recommendation}",
    #                 f"Reason to Flag: {insight.reason_to_flag}",
    #                 divider,
    #             ]
    #         )
    #
    #     return "\n".join(report_lines)
@@ -0,0 +1,33 @@
1
+ from abc import abstractmethod
2
+ from typing import Any
3
+ from typing import Dict
4
+ from typing import Optional
5
+ from typing import Tuple
6
+
7
+ from datapilot.core.insights.schema import Severity
8
+ from datapilot.core.platforms.dbt.insights.base import DBTInsight
9
+
10
+
11
+ class DBTStructureInsight(DBTInsight):
12
+ NAME = "DBTStructureInsight"
13
+ TYPE = "structure"
14
+ DEFAULT_SEVERITY = Severity.WARNING
15
+
16
+ def __init__(self, config: Optional[Dict[str, Any]] = None, *args, **kwargs):
17
+ self.config = config or {}
18
+ super().__init__(*args, **kwargs)
19
+
20
+ @abstractmethod
21
+ def generate(self, *args, **kwargs) -> dict:
22
+ pass
23
+
24
+ @classmethod
25
+ def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
26
+ """
27
+ Check if all required data is available for the insight to run.
28
+ :param has_manifest: A boolean indicating if manifest is available.
29
+ :return: A boolean indicating if all required data is available.
30
+ """
31
+ if not has_manifest:
32
+ return False, "manifest is required for insight to run."
33
+ return True, ""