PyPI - altimate-datapilot-cli - Versions diffs - 0.0.8__py3-none-any.whl - Mend

altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139) hide show

datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py ADDED Viewed

@@ -0,0 +1,129 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.constants import VIEW
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+class CheckModelMaterializationByChilds(ChecksInsight):
+    NAME = "Model materialization by children"
+    ALIAS = "check_model_materialization_by_childs"
+    DESCRIPTION = "Fewer children than threshold ideally should be view or ephemeral, more or equal should be table or incremental."
+    REASON_TO_FLAG = "The model is flagged due to inappropriate materialization: models with child counts above the threshold require robust and efficient data processing, hence they should be materialized as tables or incrementals for optimized query performance and data management."
+    THRESHOLD_CHILDS_STR = "threshold_childs"
+    def _build_failure_result_view_materialization(
+        self,
+        node_id: str,
+        nr_childs: int,
+        threshold_childs: int,
+        model_materialization: str,
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a model's materialization is view and has less child models than the threshold.
+        """
+        failure_message = f"The model:{node_id} has {nr_childs} childs, but the materialization is {model_materialization}.\n"
+        recommendation = "Consider changing the materialization to table or incremental."
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"threshold_childs": threshold_childs, "nr_childs": nr_childs, "model_materialization": model_materialization},
+        )
+    def _build_failure_result_not_view_materialization(
+        self,
+        node_id: str,
+        nr_childs: int,
+        threshold_childs: int,
+        model_materialization: str,
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a model's materialization is not view and has more or equal child models than the threshold.
+        """
+        failure_message = f"The model:{node_id} has {nr_childs} childs, but the materialization is {model_materialization}.\n"
+        recommendation = "Consider changing the materialization to view or ephemeral."
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"threshold_childs": threshold_childs, "nr_childs": nr_childs, "model_materialization": model_materialization},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate a list of InsightResponse objects for each model in the DBT project,
+        Checks the model materialization by a given threshold of child models.
+        All models with less child models then the treshold should be materialized as views (or ephemerals),
+        all the rest as tables or incrementals.
+        threshold_childs: Threshold from which onwards the materialization should be changed.
+        threshold_childs will be taken from the config file.
+        """
+        insights = []
+        threshold_childs = self.get_check_config(self.THRESHOLD_CHILDS_STR)
+        if not threshold_childs:
+            self.logger.info(f"Threshold childs are not provided in the configuration file for the insight {self.ALIAS}")
+            return insights
+        for node_id, node in self.nodes.items():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            nr_childs = len(self.children_map.get(node_id, []))
+            model_materialization = node.config.materialized
+            if nr_childs > threshold_childs and model_materialization == VIEW:
+                insights.append(
+                    DBTModelInsightResponse(
+                        unique_id=node_id,
+                        package_name=node.package_name,
+                        path=node.original_file_path,
+                        original_file_path=node.original_file_path,
+                        insight=self._build_failure_result_view_materialization(
+                            node_id, nr_childs, threshold_childs, model_materialization
+                        ),
+                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                    )
+                )
+            elif nr_childs <= threshold_childs and model_materialization != VIEW:
+                insights.append(
+                    DBTModelInsightResponse(
+                        unique_id=node_id,
+                        package_name=node.package_name,
+                        path=node.original_file_path,
+                        original_file_path=node.original_file_path,
+                        insight=self._build_failure_result_not_view_materialization(
+                            node_id, nr_childs, threshold_childs, model_materialization
+                        ),
+                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                    )
+                )
+        return insights
+    @classmethod
+    def get_config_schema(cls):
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.THRESHOLD_CHILDS_STR: {
+                    "type": "integer",
+                    "description": "Threshold from which onwards the materialization should be changed.",
+                    "default": 5,
+                },
+            },
+        }
+        return config_schema

datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py ADDED Viewed

@@ -0,0 +1,132 @@
+import re
+from typing import Dict
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+from datapilot.utils.utils import is_superset_path
+class CheckModelNameContract(ChecksInsight):
+    NAME = "Valid Mmdel name by folder"
+    ALIAS = "model_name_by_folder"
+    DESCRIPTION = (
+        "Check that model name abides to a contract (similar to check-column-name-contract). A contract consists of a regex pattern."
+    )
+    REASON_TO_FLAG = "Model naming convention is not as expected"
+    DEFAULT_PATTERN_STR = "default_pattern"
+    PATTERNS_LIST_STR = "patterns"
+    PATTERN_STR = "pattern"
+    FOLDER_STR = "folder"
+    def _build_failure_result(
+        self,
+        node_id: str,
+        failure: Dict[str, str],
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a column has a different name that doesn't match the contract.
+        :return: An instance of InsightResult containing failure message and recommendation.
+        """
+        model_name = failure.get("model_name")
+        model_path = failure.get("model_path")
+        expected_pattern = failure.get("pattern")
+        failure_message = (
+            f"The model:{node_id} with name {model_name} in {model_path} does not match the contract pattern: {expected_pattern}."
+        )
+        recommendation = (
+            "Update the model name to adhere to the contract. "
+            "Consistent model naming conventions provide valuable context and aids in data understanding and collaboration."
+        )
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"model_unique_id": node_id, **failure},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate a list of InsightResponse objects for each model in the DBT project,
+        identifying models with model name that matches a certain regex pattern.
+        """
+        insights = []
+        self.default_pattern = self.get_check_config(self.DEFAULT_PATTERN_STR)
+        pattern_configs = self.get_check_config(self.PATTERNS_LIST_STR)
+        if not pattern_configs:
+            self.logger.debug(f"Model name contract not found in insight config for {self.ALIAS}. Skipping insight.")
+            return []
+        self.patterns = {
+            pattern.get(self.FOLDER_STR): pattern.get(self.PATTERN_STR)
+            for pattern in pattern_configs
+            if pattern.get(self.PATTERN_STR) and pattern.get(self.FOLDER_STR)
+        }
+        for node_id, node in self.nodes.items():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            if node.resource_type == AltimateResourceType.model:
+                failure = self._check_model_name_contract(node_id)
+                if failure:
+                    insights.append(
+                        DBTModelInsightResponse(
+                            unique_id=node_id,
+                            package_name=node.package_name,
+                            path=node.original_file_path,
+                            original_file_path=node.original_file_path,
+                            insight=self._build_failure_result(node_id, failure),
+                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                        )
+                    )
+        return insights
+    def _check_model_name_contract(self, model_unique_id: str) -> bool:
+        """
+        Check if the model name abides to the contract.
+        """
+        model_name = self.get_node(model_unique_id).name
+        model_path = self.get_node(model_unique_id).original_file_path
+        for folder, pattern in self.patterns.items():
+            if is_superset_path(folder, model_path):
+                if re.match(pattern, model_name, re.IGNORECASE) is None:
+                    return {"pattern": pattern, "model_name": model_name, "model_path": model_path}
+        return {}
+    @classmethod
+    def get_config_schema(cls):
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.DEFAULT_PATTERN_STR: {
+                    "type": "string",
+                    "description": "The regex pattern to check the model name against",
+                    "default": "^[a-z_]+$",
+                },
+                cls.PATTERNS_LIST_STR: {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            cls.PATTERN_STR: {"type": "string", "description": "The regex pattern to check the model name against"},
+                            cls.FOLDER_STR: {"type": "string", "description": "The folder to apply the pattern to."},
+                        },
+                        "required": [cls.PATTERN_STR, cls.FOLDER_STR],
+                    },
+                    "description": "A list of regex patterns to check the model name against. Each pattern is applied to the folder specified. If no pattern is found for the folder, the default pattern is used.",
+                    "default": [],
+                },
+            },
+            "required": [cls.DEFAULT_PATTERN_STR, cls.PATTERNS_LIST_STR],
+        }
+        config_schema["files_required"] = cls.FILES_REQUIRED
+        return config_schema

datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py ADDED Viewed

@@ -0,0 +1,135 @@
+from typing import List
+from typing import Optional
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class CheckModelParentsAndChilds(ChecksInsight):
+    NAME = "Model has specific number of parents or/and childs"
+    ALIAS = "check_model_parents_and_childs"
+    DESCRIPTION = "Ensures the model has a specific number (max/min) of parents or/and childs."
+    REASON_TO_FLAG = (
+        "Models with a specific number of parents or/and childs can lead to confusion and hinder effective data "
+        "modeling and analysis. It's important to have consistent model relationships."
+    )
+    MIN_PARENTS_STR = "min_parents"
+    MAX_PARENTS_STR = "max_parents"
+    MIN_CHILDS_STR = "min_children"
+    MAX_CHILDS_STR = "max_children"
+    def _build_failure_result(
+        self,
+        node_id: str,
+        failure_message: str,
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a column has specific number (max/min) of parents or/and childs.
+        :return: An instance of InsightResult containing failure message and recommendation.
+        """
+        recommendation = (
+            "Update the model to adhere to have the required number of parents or childs."
+            "Models not following the required number of parents or childs can lead to confusion and hinder effective data "
+        )
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={
+                "min_parents": self.min_parents,
+                "max_parents": self.max_parents,
+                "min_childs": self.min_childs,
+                "max_childs": self.max_childs,
+                "model_unique_id": node_id,
+            },
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate a list of InsightResponse objects for each model in the DBT project,
+        ensures that the model has a specific number (max/min) of parents or/and childs.
+        The parent and child numbers are defined in the config file.
+        The parent and corresponding child information is present in self.children_map
+        """
+        insights = []
+        self.min_parents = self.get_check_config(self.MIN_PARENTS_STR) or 1
+        self.max_parents = self.get_check_config(self.MAX_PARENTS_STR)
+        self.min_childs = self.get_check_config(self.MIN_CHILDS_STR) or 0
+        self.max_childs = self.get_check_config(self.MAX_CHILDS_STR)
+        if not self.max_childs and not self.max_parents:
+            self.logger.info(
+                "max_children and max_parents are required values in the configuration. Please provide the required values. Skipping the insight."
+            )
+            return insights
+        for node_id, node in self.nodes.items():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+            if node.resource_type == AltimateResourceType.model:
+                failure_message = self._check_model_parents_and_childs(node_id)
+                if failure_message:
+                    insights.append(
+                        DBTModelInsightResponse(
+                            unique_id=node_id,
+                            package_name=node.package_name,
+                            path=node.original_file_path,
+                            original_file_path=node.original_file_path,
+                            insight=self._build_failure_result(node_id, failure_message),
+                            severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                        )
+                    )
+        return insights
+    def _check_model_parents_and_childs(self, model_unique_id: str) -> Optional[str]:
+        """
+        Check if the model has a specific number (max/min) of parents or/and childs.
+        """
+        children = self.children_map.get(model_unique_id, [])
+        node = self.get_node(model_unique_id)
+        parents = node.depends_on.nodes
+        message = ""
+        if len(parents) < self.min_parents or len(parents) > self.max_parents:
+            message += f"The model:{model_unique_id} doesn't have the required number of parents.\n Min parents: {self.min_parents}, Max parents: {self.max_parents}. It has f{len(parents)} parents\n"
+        if len(children) < self.min_childs or len(children) > self.max_childs:
+            message += f"The model:{model_unique_id} doesn't have the required number of childs.\n Min childs: {self.min_childs}, Max childs: {self.max_childs}. It has f{len(children)} childs\n"
+        return message
+    @classmethod
+    def get_config_schema(cls):
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.MAX_CHILDS_STR: {"type": "integer", "description": "The maximum number of childs a model can have.", "default": 3},
+                cls.MIN_CHILDS_STR: {
+                    "type": "integer",
+                    "description": "The minimum number of childs a model can have.",
+                    "default": 0,
+                },
+                cls.MAX_PARENTS_STR: {
+                    "type": "integer",
+                    "description": "The maximum number of parents a model can have.",
+                    "default": 3,
+                },
+                cls.MIN_PARENTS_STR: {
+                    "type": "integer",
+                    "description": "The minimum number of parents a model can have.",
+                    "default": 0,
+                },
+            },
+            "required": [cls.MAX_CHILDS_STR, cls.MAX_PARENTS_STR],
+        }
+        config_schema["files_required"] = cls.FILES_REQUIRED
+        return config_schema

datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py ADDED Viewed

@@ -0,0 +1,109 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class CheckModelParentsDatabase(ChecksInsight):
+    NAME = "Check model parents database"
+    ALIAS = "check_model_parents_database"
+    DESCRIPTION = "Ensures the parent models or sources are from certain database."
+    REASON_TO_FLAG = "The model has a different database as parent model or source."
+    WHITELIST_STR = "whitelist"
+    BLACKLIST_STR = "blacklist"
+    def _build_failure_result(
+        self,
+        node_id: str,
+        parent_database: str,
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a model's parent database is not whitelist or in blacklist.
+        """
+        failure_message = f"The model:{node_id}'s parent model's database is not in whitelist or blacklisted:\n"
+        recommendation = "Update the parent model's database to adhere to the whitelist or remove the model from the blacklist."
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"parent_database": parent_database},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate a list of InsightResponse objects for each model in the DBT project,
+        ensures the parent models or sources are from certain database.
+        The whitelist and blacklist of databases are defined in the config file.
+        """
+        insights = []
+        self.whitelist = self.get_check_config(self.WHITELIST_STR)
+        self.blacklist = self.get_check_config(self.BLACKLIST_STR) or []
+        for node_id in self.nodes.keys():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            parent_database = self._check_model_parents_database(node_id)
+            if parent_database:
+                insights.append(
+                    DBTModelInsightResponse(
+                        unique_id=node_id,
+                        package_name=self.nodes[node_id].package_name,
+                        path=self.nodes[node_id].original_file_path,
+                        original_file_path=self.nodes[node_id].original_file_path,
+                        insight=self._build_failure_result(node_id, parent_database),
+                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                    )
+                )
+        return insights
+    def _check_model_parents_database(self, model_unique_id: str) -> bool:
+        """
+        Check if the parent models or sources are from certain database.
+        """
+        model = self.get_node(model_unique_id)
+        if model.resource_type == AltimateResourceType.model:
+            for parent in getattr(model.depends_on, "nodes", []):
+                parent_model = self.get_node(parent)
+                if not parent_model:
+                    continue
+                if parent_model.resource_type not in [AltimateResourceType.model, AltimateResourceType.source]:
+                    continue
+                if self.whitelist and (parent_model.database not in self.whitelist):
+                    return parent_model.database
+                if self.blacklist and (parent_model.database in self.blacklist):
+                    return parent_model.database
+        return None
+    @classmethod
+    def get_config_schema(cls):
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.WHITELIST_STR: {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "List of databases that are allowed as parent models or sources.",
+                },
+                cls.BLACKLIST_STR: {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "List of databases that are not allowed as parent models or sources.",
+                },
+            },
+        }
+        return config_schema

datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py ADDED Viewed

@@ -0,0 +1,109 @@
+from typing import List
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult
+from datapilot.core.platforms.dbt.insights.schema import DBTModelInsightResponse
+from datapilot.core.platforms.dbt.schemas.manifest import AltimateResourceType
+class CheckModelParentsSchema(ChecksInsight):
+    NAME = "Model Parents are from an allowed list of schemas"
+    ALIAS = "check_model_parents_schema"
+    DESCRIPTION = "Ensures the parent models or sources are from certain schema."
+    REASON_TO_FLAG = "The model has a different schema as parent model or source."
+    WHITELIST_STR = "whitelist"
+    BLACKLIST_STR = "blacklist"
+    def _build_failure_result(
+        self,
+        node_id: str,
+        parent_schema: str,
+    ) -> DBTInsightResult:
+        """
+        Build failure result for the insight if a model's parent schema is not whitelist or in blacklist.
+        """
+        failure_message = f"The model:{node_id}'s parent model's schema is not in whitelist or blacklisted:\n"
+        recommendation = "Update the parent model's schema to adhere to the whitelist or remove the model from the blacklist."
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=failure_message,
+            recommendation=recommendation,
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"parent_schema": parent_schema},
+        )
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Generate a list of InsightResponse objects for each model in the DBT project,
+        ensures the parent models or sources are from certain schema.
+        The whitelist and blacklist of schemas are defined in the config file.
+        """
+        insights = []
+        self.whitelist = self.get_check_config(self.WHITELIST_STR)
+        self.blacklist = self.get_check_config(self.BLACKLIST_STR) or []
+        for node_id in self.nodes.keys():
+            if self.should_skip_model(node_id):
+                self.logger.debug(f"Skipping model {node_id} as it is not enabled for selected models")
+                continue
+            parent_schema = self._check_model_parents_schema(node_id)
+            if parent_schema:
+                insights.append(
+                    DBTModelInsightResponse(
+                        unique_id=node_id,
+                        package_name=self.nodes[node_id].package_name,
+                        path=self.nodes[node_id].original_file_path,
+                        original_file_path=self.nodes[node_id].original_file_path,
+                        insight=self._build_failure_result(node_id, parent_schema),
+                        severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                    )
+                )
+        return insights
+    def _check_model_parents_schema(self, model_unique_id: str) -> bool:
+        """
+        Check if the parent models or sources are from certain schema.
+        """
+        model = self.get_node(model_unique_id)
+        if model.resource_type == AltimateResourceType.model:
+            for parent in getattr(model.depends_on, "nodes", []):
+                parent_model = self.get_node(parent)
+                if not parent_model:
+                    continue
+                if parent_model.resource_type not in [AltimateResourceType.model, AltimateResourceType.source]:
+                    continue
+                if self.whitelist and (parent_model.schema_name not in self.whitelist):
+                    return parent_model.schema_name
+                if self.blacklist and (parent_model.schema_name in self.blacklist):
+                    return parent_model.schema_name
+        return None
+    @classmethod
+    def get_config_schema(cls):
+        config_schema = super().get_config_schema()
+        config_schema["config"] = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": {
+                cls.WHITELIST_STR: {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "List of schemas that are allowed as parent models or sources.",
+                },
+                cls.BLACKLIST_STR: {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "List of schemas that are not allowed as parent models or sources.",
+                },
+            },
+        }
+        return config_schema