PyPI - altimate-datapilot-cli - Versions diffs - 0.0.8__py3-none-any.whl - Mend

altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139) hide show

datapilot/core/platforms/dbt/insights/checks/check_source_tags.py ADDED Viewed

@@ -0,0 +1,76 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class CheckSourceTags(ChecksInsight):
+    """
+    Flags dbt sources that carry tags outside the configured list of valid tags.
+    The valid tags are read from the check configuration under the ``tags`` key.
+    """
+    NAME = "Source has tags"
+    ALIAS = "check_source_tags"
+    DESCRIPTION = "The source has only valid tags from the provided list."
+    REASON_TO_FLAG = "The source has tags that are not in the valid tags list"
+    TESTS_STR = "tags"
+    def _build_failure_result(
+        self,
+        node_id: str,
+        tags: List[str],
+    ) -> DBTInsightResult:
+        """
+        Build the failure result for a source whose tags are not all in the provided tag list.
+        :param node_id: Unique ID of the source being reported.
+        :param tags: The tags of the source that are not in the provided tag list.
+        :return: An instance of DBTInsightResult containing failure message and recommendation.
+        """
+        failure_message = f"The source:{node_id}'s tags: {tags} are not in the provided tag list: {self.tag_list}\n"
+        recommendation = "Update the source's tags to adhere to the provided tag list."
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"tags": tags, "source_id": node_id},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate one insight response for every source that has at least one tag
+        outside the tag list read from the check configuration.
+        :return: A list of DBTModelInsightResponse objects, one per offending source.
+        """
+        insights = []
+        # Stored on the instance because valid_tag and _build_failure_result read it.
+        self.tag_list = self.get_check_config(self.TESTS_STR)
+        for node_id, node in self.sources.items():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping source {node_id} as it is not enabled for selected models")
+                continue
+            if node.resource_type != AltimateResourceType.source:
+                continue
+            invalid_tags = self.valid_tag(node.tags)
+            if not invalid_tags:
+                continue
+            insights.append(
+                DBTModelInsightResponse(
+                    unique_id=node_id,
+                    package_name=node.package_name,
+                    original_file_path=node.original_file_path,
+                    path=node.original_file_path,
+                    insight=self._build_failure_result(node_id, invalid_tags),
+                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                )
+            )
+        return insights
+    def valid_tag(self, tags: List[str]) -> List[str]:
+        """
+        Collect the tags of a source that are not in the provided tag list.
+        :param tags: The tags declared on the source.
+        :return: The tags that are not in the provided tag list; empty when every tag is
+            valid or when no tag list is configured.
+        """
+        if not self.tag_list:
+            # Nothing to validate against. Returning True here used to be read by the
+            # caller as a non-empty list of invalid tags, flagging every source.
+            return []
+        return [tag for tag in (tags or []) if tag not in self.tag_list]

datapilot/core/platforms/dbt/insights/dbt_test/__init__.py ADDED Viewed

File without changes

datapilot/core/platforms/dbt/insights/dbt_test/base.py ADDED Viewed

@@ -0,0 +1,23 @@
+from abc import abstractmethod
+from typing import Tuple
+from datapilot.core.platforms.dbt.insights.base import DBTInsight
+class DBTTestInsight(DBTInsight):
+    """
+    Base class for insights whose TYPE is "Test"; subclasses implement ``generate``.
+    """
+    TYPE = "Test"
+    @abstractmethod
+    def generate(self, *args, **kwargs) -> dict:
+        """
+        Produce the insight output. Must be overridden by subclasses.
+        """
+    @classmethod
+    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
+        """
+        Check if all required data is available for the insight to run.
+        :param has_manifest: A boolean indicating if manifest is available.
+        :return: A tuple of (is data available, reason); the reason is empty on success.
+        """
+        if has_manifest:
+            return True, ""
+        return False, "manifest is required for insight to run."

datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py ADDED Viewed

@@ -0,0 +1,130 @@
+from typing import Dict
+from typing import List
+from typing import Optional
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.constants import GENERIC
+from datapilot.core.platforms.dbt.insights.dbt_test.base import DBTTestInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class MissingPrimaryKeyTests(DBTTestInsight):
+    """
+    This class identifies DBT models that are missing primary key tests.
+    A model counts as covered when a single column has both a ``not_null`` and a ``unique``
+    test, or when a ``unique_combination_of_columns`` test without a ``column_name``
+    argument is attached to the model.
+    """
+    # Bucket used for tests that have no "column_name" keyword argument.
+    _ALL_TESTS_KEY = "_all_tests"
+    NOT_NULL = "not_null"
+    UNIQUE = "unique"
+    UNIQUE_COMBINATION_OF_COLUMNS = "unique_combination_of_columns"
+    NAME = "Missing primary key tests"
+    ALIAS = "missing_primary_key_tests"
+    DESCRIPTION = "Checks if the model has a primary key test. "
+    REASON_TO_FLAG = (
+        "dbt tests play a crucial role in asserting data correctness. The absence of primary key tests can increase "
+        "the risk of data integrity issues, affecting project reliability and scalability."
+    )
+    FAILURE_MESSAGE = (
+        "dbt model `{model_unique_id}` does not have a primary key test. " "This omission may lead to data integrity challenges."
+    )
+    RECOMMENDATION = (
+        "To address this, apply a uniqueness test and a not-null test to the column representing the model's grain. "
+        "For models with unique combinations of columns, consider adding a surrogate key and "
+        "applying these tests to that column. You can refer to dbt_utils for a surrogate_key macro"
+        " and unique_combination_of_columns test."
+    )
+    def _build_failure_result(self, model_unique_id: str) -> DBTInsightResult:
+        """
+        Constructs a failure result for a given model.
+        :param model_unique_id: Unique ID of the model being evaluated.
+        :return: An instance of DBTInsightResult containing failure details.
+        """
+        self.logger.debug(f"Building failure result for model {model_unique_id}")
+        failure = self.FAILURE_MESSAGE.format(model_unique_id=model_unique_id)
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure,
+            recommendation=self.RECOMMENDATION,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"model_unique_id": model_unique_id},
+        )
+    def _has_primary_key_test(self, column_tests: Optional[Dict[str, List]]) -> bool:
+        """
+        Checks if the given column tests include a primary key test.
+        The mapping passed in is only read; it is never modified.
+        :param column_tests: Mapping of column name (or the model-level bucket) to test names.
+        :return: True if primary key test exists, False otherwise.
+        """
+        self.logger.debug("Checking for primary key tests")
+        if not column_tests:
+            return False
+        if self.UNIQUE_COMBINATION_OF_COLUMNS in column_tests.get(self._ALL_TESTS_KEY, []):
+            return True
+        # Skip the model-level bucket instead of popping it, so the caller's dict stays intact.
+        return any(
+            self.NOT_NULL in tests and self.UNIQUE in tests
+            for column, tests in column_tests.items()
+            if column != self._ALL_TESTS_KEY
+        )
+    def _get_nodes_which_need_tests(self) -> List[str]:
+        """
+        List the nodes that are expected to have a primary key test.
+        :return: IDs of nodes that are models and belong to the project.
+        """
+        return [
+            node_id
+            for node_id, node in self.nodes.items()
+            if self.check_part_of_project(node.package_name) and node.resource_type == AltimateResourceType.model
+        ]
+    def _get_nodes_with_tests(self, tests) -> Dict[str, Dict[str, List]]:
+        """
+        Group test names by the node they depend on and by the column they target.
+        :param tests: Mapping of test ID to test node.
+        :return: ``{node_id: {column_name or _ALL_TESTS_KEY: [test names]}}``.
+        """
+        nodes_with_tests = {}
+        for test in tests.values():
+            node_ids = test.depends_on.nodes or []
+            if not node_ids:
+                continue
+            # Column and test name do not depend on the node, so resolve them once per test.
+            column = test.test_metadata.kwargs.get("column_name")
+            key = column if column else self._ALL_TESTS_KEY
+            test_name = test.test_metadata.name
+            for node_id in node_ids:
+                nodes_with_tests.setdefault(node_id, {}).setdefault(key, []).append(test_name)
+        return nodes_with_tests
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generates an insight for each project model that lacks a primary key test.
+        :return: A list of DBTModelInsightResponse objects, one per model without a primary key test.
+        """
+        self.logger.debug("Generating insights for DBT models")
+        tests = self.manifest.get_tests(GENERIC)
+        nodes_which_need_tests = self._get_nodes_which_need_tests()
+        nodes_which_have_test = self._get_nodes_with_tests(tests)
+        insights = []
+        for node_id in nodes_which_need_tests:
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            if self._has_primary_key_test(nodes_which_have_test.get(node_id)):
+                continue
+            node = self.get_node(node_id)
+            self.logger.debug(f"Adding insight for model {node_id}")
+            insights.append(
+                DBTModelInsightResponse(
+                    unique_id=node_id,
+                    package_name=node.package_name,
+                    path=node.original_file_path,
+                    original_file_path=node.original_file_path,
+                    insight=self._build_failure_result(node_id),
+                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                )
+            )
+        self.logger.debug("Completed generating insights")
+        return insights

datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py ADDED Viewed

@@ -0,0 +1,118 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.constants import SINGULAR
+from datapilot.core.platforms.dbt.insights.dbt_test.base import DBTTestInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTProjectInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class DBTTestCoverage(DBTTestInsight):
+    """
+    This class flags a dbt project whose test coverage is below a specified threshold.
+    Coverage is the percentage of project models that at least one project test depends on.
+    """
+    NAME = "Low test coverage in dbt models"
+    ALIAS = "dbt_low_test_coverage"
+    DESCRIPTION = "Checks if the project test coverage is below the minimum threshold. "
+    REASON_TO_FLAG = (
+        "dbt models should have a minimum test coverage percentage to ensure the reliability and accuracy "
+        "of data transformations. Low test coverage can lead to data quality issues."
+    )
+    FAILURE_MESSAGE = (
+        "The test coverage {coverage_percent}% is below the minimum threshold"
+        " of {min_coverage_percent}%. Insufficient test coverage can impact data integrity and transformation accuracy."
+    )
+    RECOMMENDATION = (
+        "To address this issue, review and increase the number and variety of tests applied to your model to "
+        "improve its test coverage. Consider adding different types of tests such as uniqueness, not_null, "
+        "and referential integrity tests to ensure data quality and accuracy."
+    )
+    MIN_COVERAGE_PERCENT = 100
+    MIN_COVERAGE_PERCENT_STR = "min_test_coverage_percent"
+    def _build_failure_result(self, coverage: float, min_coverage=MIN_COVERAGE_PERCENT) -> DBTInsightResult:
+        """
+        Constructs a failure result for a project with low test coverage.
+        :param coverage: The calculated test coverage percentage.
+        :param min_coverage: The minimum required test coverage percentage.
+        :return: An instance of DBTInsightResult containing failure details.
+        """
+        self.logger.debug(f"CALCULATED COVERAGE: {coverage}")
+        failure = self.FAILURE_MESSAGE.format(min_coverage_percent=min_coverage, coverage_percent=coverage)
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure,
+            recommendation=self.RECOMMENDATION,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"min_coverage_percent": min_coverage, "coverage": coverage},
+        )
+    def _calculate_coverage(self) -> float:
+        """
+        Calculate the percentage of project models that have at least one project test.
+        :return: Test coverage percentage, rounded to the nearest integer; 100 when the
+            project has no models.
+        """
+        project_models = {
+            node.unique_id
+            for node in self.nodes.values()
+            if node.resource_type == AltimateResourceType.model and self.check_part_of_project(node.package_name)
+        }
+        if not project_models:
+            return 100
+        tested_nodes = set()
+        for test in self.tests.values():
+            # Singular tests are counted through the nodes they depend on, like any other
+            # test; previously one singular test reported 100% for the whole project.
+            if test.package_name != self.project_name:
+                continue
+            if test.depends_on:
+                tested_nodes.update(test.depends_on.nodes or [])
+        # Only project models count: tests may also depend on other node kinds, which
+        # would otherwise push the percentage above 100.
+        return round((len(tested_nodes & project_models) / len(project_models)) * 100)
+    def generate(self, *args, **kwargs) -> List[DBTProjectInsightResponse]:
+        """
+        Generates a project-level insight when test coverage is below the minimum threshold.
+        :return: A list with one DBTProjectInsightResponse when coverage is too low, else an empty list.
+        """
+        self.logger.debug("Generating test coverage insights for DBT models")
+        configured = self.get_check_config(self.MIN_COVERAGE_PERCENT_STR)
+        # Honour any configured number, including 0; `configured or default` would turn
+        # an explicit 0 into the default threshold.
+        is_number = isinstance(configured, (int, float)) and not isinstance(configured, bool)
+        min_coverage = configured if is_number else self.MIN_COVERAGE_PERCENT
+        coverage = self._calculate_coverage()
+        insights = []
+        if coverage < min_coverage:
+            insights.append(
+                DBTProjectInsightResponse(
+                    package_name=self.project_name,
+                    insights=[self._build_failure_result(coverage, min_coverage)],
+                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                )
+            )
+        self.logger.debug("Completed generating test coverage insights")
+        return insights
+    @classmethod
+    def get_config_schema(cls):
+        """
+        Extend the base configuration schema with the minimum-coverage setting.
+        :return: The configuration schema for the test coverage insight.
+        """
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.MIN_COVERAGE_PERCENT_STR: {
+                    "type": "integer",
+                    "description": "The minimum test coverage percentage required for the models in the project",
+                    "default": cls.MIN_COVERAGE_PERCENT,
+                },
+            },
+            "required": [cls.MIN_COVERAGE_PERCENT_STR],
+        }
+        return config_schema

datapilot/core/platforms/dbt/insights/governance/__init__.py ADDED Viewed

File without changes

datapilot/core/platforms/dbt/insights/governance/base.py ADDED Viewed

@@ -0,0 +1,23 @@
+from abc import abstractmethod
+from typing import Tuple
+from datapilot.core.platforms.dbt.insights.base import DBTInsight
+class DBTGovernanceInsight(DBTInsight):
+    """
+    Base class for insights whose TYPE is "governance"; subclasses implement ``generate``.
+    """
+    TYPE = "governance"
+    @abstractmethod
+    def generate(self, *args, **kwargs) -> dict:
+        """
+        Produce the insight output. Must be overridden by subclasses.
+        """
+    @classmethod
+    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
+        """
+        Check if all required data is available for the insight to run.
+        :param has_manifest: A boolean indicating if manifest is available.
+        :return: A tuple of (is data available, reason); the reason is empty on success.
+        """
+        if has_manifest:
+            return True, ""
+        return False, "manifest is required for insight to run."

datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py ADDED Viewed

@@ -0,0 +1,130 @@
+from typing import ClassVar
+from typing import List
+from typing import Tuple
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.governance.base import DBTGovernanceInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
+from datapilot.utils.formatting.utils import numbered_list
+class DBTDocumentationStaleColumns(DBTGovernanceInsight):
+    """
+    DBTDocumentationStaleColumns identifies columns that have been documented but are no longer present in the model.
+    Documented columns come from the manifest node; present columns come from the catalog schema.
+    Column names are compared in lower case.
+    """
+    NAME = "Documentation of stale columns"
+    ALIAS = "documentation_on_stale_columns"
+    DESCRIPTION = (
+        "Identify columns that have been documented but are no longer present in the model. "
+        "This insight helps in maintaining accurate and up-to-date documentation."
+    )
+    REASON_TO_FLAG = (
+        "A column has been documented but is no longer present in the model/database. "
+        "This discrepancy can cause confusion and mislead users of the dbt project."
+    )
+    FAILURE_MESSAGE = (
+        "The following documented columns are no longer present in the model `{model_unique_id}`:\n{stale_columns}. "
+        "This inconsistency can lead to confusion regarding the model's current structure."
+    )
+    RECOMMENDATION = (
+        "Review and update the documentation for model `{model_unique_id}`. Remove documentation entries for columns "
+        "that are no longer present to maintain clarity and accuracy in the project documentation."
+    )
+    FILES_REQUIRED: ClassVar = ["Manifest", "Catalog"]
+    def __init__(self, catalog_wrapper: BaseCatalogWrapper, *args, **kwargs):
+        """
+        :param catalog_wrapper: Catalog wrapper whose ``get_schema()`` is used to look up model columns.
+        """
+        self.catalog = catalog_wrapper
+        super().__init__(*args, **kwargs)
+    def _build_failure_result(self, model_unique_id: str, columns: List[str]) -> DBTInsightResult:
+        """
+        Build the failure result for a model that has documented columns missing from the catalog.
+        :param model_unique_id: Unique ID of the current model being evaluated.
+        :param columns: List of columns that are documented but no longer present in the model.
+        :return: An instance of DBTInsightResult containing failure message and recommendation.
+        """
+        self.logger.debug(f"Building failure result for model {model_unique_id} with stale columns {columns}")
+        failure = self.FAILURE_MESSAGE.format(
+            stale_columns=numbered_list(columns),
+            model_unique_id=model_unique_id,
+        )
+        recommendation = self.RECOMMENDATION.format(model_unique_id=model_unique_id)
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"stale_columns": columns, "model_unique_id": model_unique_id},
+        )
+    def _get_columns_documented(self, node_id) -> List[str]:
+        """
+        Get the list of columns that are documented for a given node.
+        :param node_id: The unique ID of the node.
+        :return: Lower-cased names of the columns that have a non-empty description.
+        """
+        return [
+            column_name.lower()
+            for column_name, column_node in self.get_node(node_id).columns.items()
+            if column_node.description
+        ]
+    def _get_columns_in_model(self, node_id) -> List[str]:
+        """
+        Get the list of columns the catalog reports for a given node.
+        :param node_id: The unique ID of the node.
+        :return: Lower-cased column names, or an empty list when the node is not in the catalog schema.
+        """
+        # Fetch the schema once rather than once for the membership test and once for the lookup.
+        schema = self.catalog.get_schema()
+        if node_id not in schema:
+            return []
+        return [column_name.lower() for column_name in schema[node_id]]
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate an insight response for each model that documents columns the catalog does not contain.
+        :return: A list of DBTModelInsightResponse objects, one per model with stale columns.
+        """
+        insights = []
+        for node_id, node in self.nodes.items():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            if node.resource_type != AltimateResourceType.model:
+                continue
+            columns_documented = self._get_columns_documented(node_id)
+            # NOTE(review): a model absent from the catalog yields no columns here, so all
+            # of its documented columns are reported as stale - confirm this is intended.
+            db_columns = self._get_columns_in_model(node_id)
+            # Sorted so the message and metadata are stable across runs (set order is not).
+            columns_stale = sorted(set(columns_documented) - set(db_columns))
+            if not columns_stale:
+                continue
+            insights.append(
+                DBTModelInsightResponse(
+                    unique_id=node_id,
+                    package_name=node.package_name,
+                    path=node.original_file_path,
+                    original_file_path=node.original_file_path,
+                    insight=self._build_failure_result(node_id, columns_stale),
+                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                )
+            )
+        return insights
+    @classmethod
+    def has_all_required_data(cls, has_manifest: bool, has_catalog: bool, **kwargs) -> Tuple[bool, str]:
+        """
+        Check if both the manifest and the catalog are available for the insight to run.
+        :param has_manifest: A boolean indicating if manifest is available.
+        :param has_catalog: A boolean indicating if catalog is available.
+        :return: A tuple of (is data available, reason); the reason is empty on success.
+        """
+        if not has_manifest:
+            return False, "manifest is required for insight to run."
+        if not has_catalog:
+            return False, "catalog is required for insight to run."
+        return True, ""
+    @classmethod
+    def requires_catalog(cls) -> bool:
+        """
+        :return: Always True; this insight reads the catalog.
+        """
+        return True

datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py ADDED Viewed

@@ -0,0 +1,90 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.governance.base import DBTGovernanceInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateAccess
+from datapilot.utils.formatting.utils import numbered_list
+class DBTExposureDependentOnPrivateModels(DBTGovernanceInsight):
+    """
+    DBTExposureDependentOnPrivateModels identifies exposures that are dependent on private models.
+    """
+    NAME = "Exposures dependent on private models"
+    ALIAS = "exposures_dependent_on_private_models"
+    DESCRIPTION = "Identify exposures that are dependent on private models. "
+    REASON_TO_FLAG = (
+        "Exposures illustrate how and where data is consumed in downstream tools. These tools should utilize "
+        "data from public, trusted, and contracted sources to ensure data reliability and integrity."
+    )
+    FAILURE_MESSAGE = (
+        "Exposure `{exposure_unique_id}` is dependent on private models, which may not be ideal for "
+        "downstream consumption:\n`{private_models}`."
+    )
+    RECOMMENDATION = (
+        "Consider revising the yml file to ensure that the models your exposures depend on are fully "
+        "exposed and public. While this rule flags non-public models, it is also recommended to document"
+        " and formalize contracts for these public models for best practices."
+    )
+    def _build_failure_result(self, exposure_unique_id: str, private_models: List[str]) -> DBTInsightResult:
+        """
+        Build the failure result for an exposure that depends on private models.
+        :param exposure_unique_id: Unique ID of the exposure being evaluated.
+        :param private_models: IDs of the private models the exposure depends on.
+        :return: An instance of DBTInsightResult containing failure message and recommendation.
+        """
+        self.logger.debug(f"Building failure result exposure {exposure_unique_id} depends on private models {private_models}")
+        failure = self.FAILURE_MESSAGE.format(
+            exposure_unique_id=exposure_unique_id,
+            private_models=numbered_list(private_models),
+        )
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure,
+            recommendation=self.RECOMMENDATION,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"exposure": exposure_unique_id, "private_models": private_models},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate an insight response for each exposure that depends on at least one private model.
+        :return: A list of DBTModelInsightResponse objects, one per offending exposure.
+        """
+        if not self.exposures:
+            self.logger.debug(f"No exposures found in project {self.project_name}")
+            return []
+        insights = []
+        for exposure_id, exposure in self.exposures.items():
+            if self.should_skip_model(exposure_id):
+                self.logger.debug(f"Skipping model {exposure_id} as it is not enabled for selected models")
+                continue
+            self.logger.debug(f"Checking exposure {exposure_id}")
+            depends_on = exposure.depends_on
+            dependency_ids = (depends_on.nodes if depends_on else None) or []
+            # getattr guards dependencies that carry no `access` attribute or resolve to None.
+            # NOTE(review): assumes get_node does not raise for such IDs - confirm.
+            private_models = [
+                dependency_id
+                for dependency_id in dependency_ids
+                if getattr(self.get_node(dependency_id), "access", None) == AltimateAccess.private
+            ]
+            if not private_models:
+                continue
+            insight_result = self._build_failure_result(exposure_unique_id=exposure_id, private_models=private_models)
+            insights.append(
+                DBTModelInsightResponse(
+                    unique_id=exposure_id,
+                    package_name=exposure.package_name,
+                    path=exposure.original_file_path,
+                    original_file_path=exposure.original_file_path,
+                    insight=insight_result,
+                    severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                )
+            )
+        return insights