gooddata-pipelines 1.50.0__py3-none-any.whl → 1.50.1.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gooddata-pipelines might be problematic.
- gooddata_pipelines/__init__.py +18 -0
- gooddata_pipelines/api/gooddata_api.py +55 -0
- gooddata_pipelines/backup_and_restore/backup_manager.py +36 -62
- gooddata_pipelines/backup_and_restore/constants.py +3 -7
- gooddata_pipelines/backup_and_restore/models/storage.py +4 -5
- gooddata_pipelines/ldm_extension/__init__.py +1 -0
- gooddata_pipelines/ldm_extension/input_processor.py +286 -0
- gooddata_pipelines/ldm_extension/input_validator.py +185 -0
- gooddata_pipelines/ldm_extension/ldm_extension_manager.py +283 -0
- gooddata_pipelines/ldm_extension/models/__init__.py +1 -0
- gooddata_pipelines/ldm_extension/models/aliases.py +9 -0
- gooddata_pipelines/ldm_extension/models/analytical_object.py +33 -0
- gooddata_pipelines/ldm_extension/models/custom_data_object.py +90 -0
- gooddata_pipelines/provisioning/entities/users/models/users.py +10 -1
- gooddata_pipelines/provisioning/entities/users/users.py +38 -0
- gooddata_pipelines/provisioning/provisioning.py +2 -3
- gooddata_pipelines/utils/__init__.py +9 -0
- gooddata_pipelines/utils/rate_limiter.py +64 -0
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/METADATA +11 -3
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/RECORD +22 -12
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/WHEEL +0 -0
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/licenses/LICENSE.txt +0 -0
gooddata_pipelines/ldm_extension/input_validator.py
@@ -0,0 +1,185 @@
# (C) 2025 GoodData Corporation
"""Module for validating custom fields input data.

This module is responsible for validating custom fields input data, checking
row-level and aggregated constraints.
"""

from collections import Counter
from typing import Any, TypeVar

from pydantic import BaseModel

from gooddata_pipelines.ldm_extension.models.aliases import (
    DatasetId,
    WorkspaceId,
)
from gooddata_pipelines.ldm_extension.models.custom_data_object import (
    CustomDataset,
    CustomDatasetDefinition,
    CustomFieldDefinition,
    CustomFieldType,
)


class LdmExtensionDataValidator:
    ModelT = TypeVar("ModelT", bound=BaseModel)

    def validate(
        self,
        dataset_definitions: list[CustomDatasetDefinition],
        field_definitions: list[CustomFieldDefinition],
    ) -> dict[WorkspaceId, dict[DatasetId, CustomDataset]]:
        """Validate dataset and field definitions.

        Validates the dataset definitions and field definitions by using Pydantic
        models to check row-level constraints, then aggregates the definitions
        per workspace while checking integrity at the aggregated level, i.e.,
        uniqueness of identifier combinations at the workspace level.

        Args:
            dataset_definitions (list[CustomDatasetDefinition]): List of dataset definitions to validate.
            field_definitions (list[CustomFieldDefinition]): List of field definitions to validate.
        Returns:
            dict[WorkspaceId, dict[DatasetId, CustomDataset]]:
                Dictionary of validated dataset definitions per workspace,
                where each dataset contains its custom fields:
                ```python
                {
                    "workspace_id_1": {
                        "dataset_id_1": CustomDataset(...),
                        "dataset_id_2": CustomDataset(...),
                    },
                    ...
                }
                ```
        """

        # First, validate the dataset definitions and aggregate them per workspace.
        validated_data = self._validate_dataset_definitions(dataset_definitions)

        # Then validate the field definitions and connect them to the datasets.
        validated_data = self._validate_field_definitions(
            validated_data, field_definitions
        )

        return validated_data

    def _validate_dataset_definitions(
        self,
        dataset_definitions: list[CustomDatasetDefinition],
    ) -> dict[WorkspaceId, dict[DatasetId, CustomDataset]]:
        self._check_dataset_combinations(dataset_definitions)

        validated_definitions: dict[
            WorkspaceId, dict[DatasetId, CustomDataset]
        ] = {}
        for definition in dataset_definitions:
            validated_definitions.setdefault(definition.workspace_id, {})[
                definition.dataset_id
            ] = CustomDataset(definition=definition, custom_fields=[])

        return validated_definitions

    def _check_dataset_combinations(
        self, dataset_definitions: list[CustomDatasetDefinition]
    ) -> None:
        """Check integrity of the provided dataset definitions.

        Validation criteria:
            - workspace_id + dataset_id must be unique across all dataset definitions.

        Args:
            dataset_definitions (list[CustomDatasetDefinition]): List of dataset definitions to check.
        Raises:
            ValueError: If there are duplicate dataset definitions based on workspace_id and dataset_id.
        """
        workspace_dataset_combinations = [
            (definition.workspace_id, definition.dataset_id)
            for definition in dataset_definitions
        ]
        if len(workspace_dataset_combinations) != len(
            set(workspace_dataset_combinations)
        ):
            duplicates = self._get_duplicates(workspace_dataset_combinations)
            raise ValueError(
                "Duplicate dataset definitions found in the raw dataset "
                + f"definitions (workspace_id, dataset_id): {duplicates}"
            )

    @staticmethod
    def _get_duplicates(list_to_check: list[Any]) -> list[Any]:
        """Get duplicates from a list.

        Args:
            list_to_check (list[Any]): List of items to check for duplicates.
        Returns:
            list[Any]: List of duplicate items.
        """
        counts = Counter(list_to_check)
        return [item for item, count in counts.items() if count > 1]

    def _check_field_combinations(
        self, field_definitions: list[CustomFieldDefinition]
    ) -> None:
        """Check integrity of the provided field definitions.

        Validation criteria (per workspace):
            - unique workspace_id + cf_id combinations (only for attribute and fact custom_field_type)
            - there is no row with the same dataset_id and cf_id (only for date custom_field_type)

        Args:
            field_definitions (list[CustomFieldDefinition]): List of field definitions to check.
        Raises:
            ValueError: If there are duplicate field definitions based on workspace_id and cf_id.
        """
        workspace_field_combinations: set[tuple[str, str]] = set()
        dataset_field_combinations: set[tuple[str, str]] = set()

        for field in field_definitions:
            if field.custom_field_type in [
                CustomFieldType.ATTRIBUTE,
                CustomFieldType.FACT,
            ]:
                combination = (field.workspace_id, field.custom_field_id)
                if combination in workspace_field_combinations:
                    raise ValueError(
                        f"Duplicate custom field found for workspace {field.workspace_id} "
                        + f"with field ID {field.custom_field_id}"
                    )
                workspace_field_combinations.add(combination)

            elif field.custom_field_type == CustomFieldType.DATE:
                combination = (field.dataset_id, field.custom_field_id)
                if combination in dataset_field_combinations:
                    raise ValueError(
                        f"Duplicate custom field found for dataset {field.dataset_id} "
                        + f"with field ID {field.custom_field_id}"
                    )
                dataset_field_combinations.add(combination)

    def _validate_field_definitions(
        self,
        validated_definitions: dict[
            WorkspaceId, dict[DatasetId, CustomDataset]
        ],
        field_definitions: list[CustomFieldDefinition],
    ) -> dict[WorkspaceId, dict[DatasetId, CustomDataset]]:
        """Validates custom field definitions and connects them to the datasets.

        Args:
            validated_definitions (dict[WorkspaceId, dict[DatasetId, CustomDataset]]):
                Dictionary of validated dataset definitions per workspace.
            field_definitions (list[CustomFieldDefinition]): List of field definitions to validate.
        Returns:
            dict[WorkspaceId, dict[DatasetId, CustomDataset]]:
                Updated dictionary of validated dataset definitions with custom fields added.
        """
        self._check_field_combinations(field_definitions)

        for field_definition in field_definitions:
            validated_definitions[field_definition.workspace_id][
                field_definition.dataset_id
            ].custom_fields.append(field_definition)

        return validated_definitions
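For orientation, here is a minimal sketch of how this validator is driven. It assumes the CustomDatasetDefinition and CustomFieldDefinition models from gooddata_pipelines.ldm_extension.models.custom_data_object (shown further below in this diff); the workspace, dataset, and column values are illustrative placeholders, not real objects.

from gooddata_pipelines.ldm_extension.input_validator import LdmExtensionDataValidator
from gooddata_pipelines.ldm_extension.models.custom_data_object import (
    ColumnDataType,
    CustomDatasetDefinition,
    CustomFieldDefinition,
    CustomFieldType,
)

# Illustrative values only.
dataset = CustomDatasetDefinition(
    workspace_id="ws_1",
    dataset_id="budget",
    dataset_name="Budget",
    dataset_datasource_id="snowflake_ds",
    dataset_source_table="budget_table",
    dataset_source_sql=None,
    parent_dataset_reference="customers",
    parent_dataset_reference_attribute_id="customer_id",
    dataset_reference_source_column="customer_id",
    dataset_reference_source_column_data_type=ColumnDataType.STRING,
    workspace_data_filter_id="wdf_client",
    workspace_data_filter_column_name="client_id",
)
field = CustomFieldDefinition(
    workspace_id="ws_1",
    dataset_id="budget",
    custom_field_id="budget_amount",
    custom_field_name="Budget Amount",
    custom_field_type=CustomFieldType.FACT,
    custom_field_source_column="amount",
    custom_field_source_column_data_type=ColumnDataType.NUMERIC,
)

validated = LdmExtensionDataValidator().validate([dataset], [field])
# {"ws_1": {"budget": CustomDataset(definition=..., custom_fields=[...])}}

The resulting mapping is keyed by workspace ID and then dataset ID, matching the structure documented in validate().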
gooddata_pipelines/ldm_extension/ldm_extension_manager.py
@@ -0,0 +1,283 @@
# (C) 2025 GoodData Corporation
"""Module orchestrating the custom fields logic."""

from pathlib import Path

from gooddata_sdk.sdk import GoodDataSdk
from gooddata_sdk.utils import PROFILES_FILE_PATH, profile_content

from gooddata_pipelines.api import GoodDataApi
from gooddata_pipelines.ldm_extension.input_processor import (
    LdmExtensionDataProcessor,
)
from gooddata_pipelines.ldm_extension.input_validator import (
    LdmExtensionDataValidator,
)
from gooddata_pipelines.ldm_extension.models.aliases import (
    DatasetId,
    WorkspaceId,
)
from gooddata_pipelines.ldm_extension.models.analytical_object import (
    AnalyticalObject,
    AnalyticalObjects,
)
from gooddata_pipelines.ldm_extension.models.custom_data_object import (
    CustomDataset,
    CustomDatasetDefinition,
    CustomFieldDefinition,
)
from gooddata_pipelines.logger.logger import LogObserver


class LdmExtensionManager:
    """Manager for creating custom datasets and fields in GoodData workspaces."""

    INDENT = " " * 2

    @classmethod
    def create(cls, host: str, token: str) -> "LdmExtensionManager":
        return cls(host=host, token=token)

    @classmethod
    def create_from_profile(
        cls,
        profile: str = "default",
        profiles_path: Path = PROFILES_FILE_PATH,
    ) -> "LdmExtensionManager":
        """Creates a provisioner instance using a GoodData profile file."""
        content = profile_content(profile, profiles_path)
        return cls(host=content["host"], token=content["token"])

    def __init__(self, host: str, token: str):
        self._validator = LdmExtensionDataValidator()
        self._processor = LdmExtensionDataProcessor()
        self._sdk = GoodDataSdk.create(host_=host, token_=token)
        self._api = GoodDataApi(host=host, token=token)
        self.logger = LogObserver()

    def _get_objects_with_invalid_relations(
        self, workspace_id: str
    ) -> list[AnalyticalObject]:
        """Check for invalid references in the workspace's analytical objects.

        This method checks whether the references in the workspace's analytical
        objects are valid and returns the analytical objects that have invalid
        references.

        Args:
            workspace_id (str): The ID of the workspace to check.

        Returns:
            list[AnalyticalObject]: List of analytical objects with invalid references.
        """

        analytical_objects: list[AnalyticalObject] = (
            self._get_analytical_objects(workspace_id=workspace_id)
        )

        objects_with_invalid_relations = [
            obj
            for obj in analytical_objects
            if not obj.attributes.are_relations_valid
        ]
        return objects_with_invalid_relations

    def _get_analytical_objects(
        self, workspace_id: str
    ) -> list[AnalyticalObject]:
        """Get analytical objects in the workspace.

        This method retrieves all analytical objects (metrics, visualizations,
        dashboards) in the specified workspace and returns them as a list.

        Args:
            workspace_id (str): The ID of the workspace to retrieve objects from.

        Returns:
            list[AnalyticalObject]: List of analytical objects in the workspace.
        """
        metrics_response = self._api.get_all_metrics(workspace_id)
        visualizations_response = self._api.get_all_visualization_objects(
            workspace_id
        )
        dashboards_response = self._api.get_all_dashboards(workspace_id)

        self._api.raise_if_response_not_ok(
            metrics_response,
            visualizations_response,
            dashboards_response,
        )
        metrics = AnalyticalObjects(**metrics_response.json())
        visualizations = AnalyticalObjects(**visualizations_response.json())
        dashboards = AnalyticalObjects(**dashboards_response.json())

        return metrics.data + visualizations.data + dashboards.data

    @staticmethod
    def _new_ldm_does_not_invalidate_relations(
        current_invalid_relations: list[AnalyticalObject],
        new_invalid_relations: list[AnalyticalObject],
    ) -> bool:
        """Check that the new LDM does not invalidate any new relations.

        This method compares the lists of analytical objects containing invalid
        relations. It creates sets of object IDs for each list and compares them.

        If the set of new invalid relations is a subset of the set of current
        invalid relations (that is, before the changes to the LDM), the new LDM
        does not invalidate any new relations and `True` is returned.

        If the set of new invalid relations is not a subset of the current one,
        the new LDM invalidates new relations and `False` is returned.

        Args:
            current_invalid_relations (list[AnalyticalObject]): The current (before
                changes to LDM) invalid relations.
            new_invalid_relations (list[AnalyticalObject]): The new (after changes to
                LDM) invalid relations.

        Returns:
            bool: True if the new LDM does not invalidate any relations, False otherwise.
        """
        # Create a set of IDs for each group, then compare those sets.
        set_current_invalid_relations = {
            obj.id for obj in current_invalid_relations
        }
        set_new_invalid_relations = {obj.id for obj in new_invalid_relations}

        # If the set of new invalid relations is a subset of the current one,
        # the LDM update did not invalidate anything new.
        return set_new_invalid_relations.issubset(set_current_invalid_relations)

    def _process_with_relations_check(
        self,
        validated_data: dict[WorkspaceId, dict[DatasetId, CustomDataset]],
    ) -> None:
        """Check whether relations of analytical objects are valid before and after
        updating the LDM in the GoodData workspace.
        """
        # Iterate through the workspaces.
        for workspace_id, datasets in validated_data.items():
            self.logger.info(f"⚙️ Processing workspace {workspace_id}...")
            # Get the current workspace layout.
            current_layout = (
                self._sdk.catalog_workspace.get_declarative_workspace(
                    workspace_id
                )
            )
            # Get the objects with invalid relations from the current workspace state.
            current_invalid_relations = (
                self._get_objects_with_invalid_relations(
                    workspace_id=workspace_id
                )
            )

            # Put the LDM with custom datasets into the GoodData workspace.
            self._sdk.catalog_workspace_content.put_declarative_ldm(
                workspace_id=workspace_id,
                ldm=self._processor.datasets_to_ldm(datasets),
            )

            # Get the objects with invalid relations from the new workspace state.
            new_invalid_relations = self._get_objects_with_invalid_relations(
                workspace_id=workspace_id
            )

            if self._new_ldm_does_not_invalidate_relations(
                current_invalid_relations, new_invalid_relations
            ):
                self._log_success_message(workspace_id)
                continue

            self.logger.error(
                f"❌ Difference in invalid relations found in workspace {workspace_id}."
            )
            self._log_diff_invalid_relations(
                current_invalid_relations, new_invalid_relations
            )

            self.logger.info(
                f"{self.INDENT}⚠️ Reverting the workspace layout to the original state."
            )
            # Put the original workspace layout back into the workspace.
            try:
                self._sdk.catalog_workspace.put_declarative_workspace(
                    workspace_id=workspace_id, workspace=current_layout
                )
            except Exception as e:
                self.logger.error(
                    f"Failed to revert workspace layout in {workspace_id}: {e}"
                )

    def _log_diff_invalid_relations(
        self,
        current_invalid_relations: list[AnalyticalObject],
        new_invalid_relations: list[AnalyticalObject],
    ) -> None:
        """Logs objects with newly invalid relations.

        Objects which previously did not have invalid relations, but do so after
        updating the LDM, are logged.
        """
        # TODO: test!
        diff_to_log: list[str] = []
        for obj in new_invalid_relations:
            if obj not in current_invalid_relations:
                diff_to_log.append(
                    f"{self.INDENT}∙ {obj.id} ({obj.type}) {obj.attributes.title}"
                )
        joined_diff_to_log = "\n".join(diff_to_log)
        error_message = f"{self.INDENT}Objects with newly invalidated relations:\n{joined_diff_to_log}"

        self.logger.error(error_message)

    def _process_without_relations_check(
        self,
        validated_data: dict[WorkspaceId, dict[DatasetId, CustomDataset]],
    ) -> None:
        """Update the LDM in the GoodData workspace without checking relations."""
        for workspace_id, datasets in validated_data.items():
            # Put the LDM with custom datasets into the GoodData workspace.
            self._sdk.catalog_workspace_content.put_declarative_ldm(
                workspace_id=workspace_id,
                ldm=self._processor.datasets_to_ldm(datasets),
            )
            self._log_success_message(workspace_id)

    def _log_success_message(self, workspace_id: str) -> None:
        """Log a success message after updating the workspace LDM."""
        self.logger.info(f"✅ LDM in {workspace_id} updated successfully.")

    def process(
        self,
        custom_datasets: list[CustomDatasetDefinition],
        custom_fields: list[CustomFieldDefinition],
        check_relations: bool = True,
    ) -> None:
        """Create custom datasets and fields in GoodData workspaces.

        Creates custom datasets and fields to extend the Logical Data Model (LDM)
        in GoodData workspaces based on the provided raw data definitions. The raw
        data is validated by Pydantic models (CustomDatasetDefinition and
        CustomFieldDefinition). The defined datasets and fields are then uploaded
        to GoodData Cloud.

        Args:
            custom_datasets (list[CustomDatasetDefinition]): List of custom dataset definitions.
            custom_fields (list[CustomFieldDefinition]): List of custom field definitions.
            check_relations (bool): If True, checks for invalid relations in the workspace
                after updating the LDM. If the number of invalid relations increases,
                the LDM will be reverted to its previous state. If False, the check
                is skipped and the LDM is updated directly. Defaults to True.

        Raises:
            ValueError: If there are validation errors in the dataset or field definitions.
        """
        # Validate raw data and aggregate the custom field and dataset
        # definitions per workspace.
        validated_data: dict[WorkspaceId, dict[DatasetId, CustomDataset]] = (
            self._validator.validate(custom_datasets, custom_fields)
        )

        if check_relations:
            # Process the validated data with the relations check.
            self._process_with_relations_check(validated_data)
        else:
            self._process_without_relations_check(validated_data)
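For context, a short sketch of how the new manager is intended to be driven end to end. The host and token are placeholders, and the definition lists could be the ones built in the validator sketch above; process() is the public entry point added in this hunk.

from gooddata_pipelines.ldm_extension.ldm_extension_manager import LdmExtensionManager

# Placeholder credentials; in practice these come from the environment
# or from a GoodData profiles file via create_from_profile().
manager = LdmExtensionManager.create(
    host="https://example.gooddata.cloud",
    token="<api-token>",
)

# custom_datasets / custom_fields are lists of CustomDatasetDefinition /
# CustomFieldDefinition, e.g. [dataset] and [field] from the sketch above.
manager.process(
    custom_datasets=[dataset],
    custom_fields=[field],
    check_relations=True,  # revert the LDM if the update invalidates new relations
)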
gooddata_pipelines/ldm_extension/models/__init__.py
@@ -0,0 +1 @@
# (C) 2025 GoodData Corporation
gooddata_pipelines/ldm_extension/models/analytical_object.py
@@ -0,0 +1,33 @@
# (C) 2025 GoodData Corporation
"""This module defines the AnalyticalObjects Pydantic model.

The model is used to represent the features of analytical objects that matter
for checking the validity of references.
"""

from pydantic import BaseModel, Field


class Attributes(BaseModel):
    title: str
    are_relations_valid: bool = Field(alias="areRelationsValid")


class AnalyticalObject(BaseModel):
    id: str
    type: str
    attributes: Attributes


class AnalyticalObjects(BaseModel):
    """Simplified model of the response obtained from the GoodData API when
    querying analytical objects.

    This model represents analytical objects such as metrics, visualizations,
    and dashboards in a simplified manner, for the purpose of checking the
    validity of references of these objects.

    This is not a complete schema of the analytical objects!
    """

    data: list[AnalyticalObject]
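A small illustration of how these models are consumed (it mirrors _get_objects_with_invalid_relations in the manager above); the payload is an invented fragment shaped like the API response, not captured output.

from gooddata_pipelines.ldm_extension.models.analytical_object import AnalyticalObjects

payload = {
    "data": [
        {
            "id": "revenue",
            "type": "metric",
            "attributes": {"title": "Revenue", "areRelationsValid": True},
        },
        {
            "id": "orders_by_region",
            "type": "visualizationObject",
            "attributes": {"title": "Orders by region", "areRelationsValid": False},
        },
    ]
}

objects = AnalyticalObjects(**payload)
# Keep only the objects whose relations are broken, as the manager does.
invalid = [obj for obj in objects.data if not obj.attributes.are_relations_valid]
# -> the "orders_by_region" visualization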
gooddata_pipelines/ldm_extension/models/custom_data_object.py
@@ -0,0 +1,90 @@
# (C) 2025 GoodData Corporation
"""This module defines enums and models used to represent the input data.

Models defined here are used to validate and structure the input data before
further processing.
"""

from enum import Enum

from pydantic import BaseModel, model_validator


class CustomFieldType(str, Enum):
    """GoodData field types."""

    # NOTE: Start using StrEnum with Python 3.11
    ATTRIBUTE = "attribute"
    FACT = "fact"
    DATE = "date"


class ColumnDataType(str, Enum):
    """Supported data types."""

    # NOTE: Start using StrEnum with Python 3.11
    INT = "INT"
    STRING = "STRING"
    DATE = "DATE"
    NUMERIC = "NUMERIC"
    TIMESTAMP = "TIMESTAMP"
    TIMESTAMP_TZ = "TIMESTAMP_TZ"
    BOOLEAN = "BOOLEAN"


class CustomFieldDefinition(BaseModel):
    """Input model for custom field definition."""

    workspace_id: str
    dataset_id: str
    custom_field_id: str
    custom_field_name: str
    custom_field_type: CustomFieldType
    custom_field_source_column: str
    custom_field_source_column_data_type: ColumnDataType

    @model_validator(mode="after")
    def check_ids_not_equal(self) -> "CustomFieldDefinition":
        """Check that the custom field ID is not the same as the dataset ID."""
        if self.custom_field_id == self.dataset_id:
            raise ValueError(
                f"Custom field ID {self.custom_field_id} cannot be the same as dataset ID {self.dataset_id}"
            )
        return self


class CustomDatasetDefinition(BaseModel):
    """Input model for custom dataset definition."""

    workspace_id: str
    dataset_id: str
    dataset_name: str
    dataset_datasource_id: str
    dataset_source_table: str | None
    dataset_source_sql: str | None
    parent_dataset_reference: str
    parent_dataset_reference_attribute_id: str
    dataset_reference_source_column: str
    dataset_reference_source_column_data_type: ColumnDataType
    workspace_data_filter_id: str
    workspace_data_filter_column_name: str

    @model_validator(mode="after")
    def check_source(self) -> "CustomDatasetDefinition":
        """Exactly one of dataset_source_table or dataset_source_sql must be provided."""
        if not (self.dataset_source_table or self.dataset_source_sql):
            raise ValueError(
                "One of dataset_source_table and dataset_source_sql must be provided"
            )
        if self.dataset_source_table and self.dataset_source_sql:
            raise ValueError(
                "Only one of dataset_source_table and dataset_source_sql can be provided"
            )
        return self


class CustomDataset(BaseModel):
    """Custom dataset with its definition and custom fields."""

    definition: CustomDatasetDefinition
    custom_fields: list[CustomFieldDefinition]
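The two model validators encode the row-level rules that LdmExtensionDataValidator relies on. A quick illustration, with made-up values, of what check_source rejects:

from pydantic import ValidationError

from gooddata_pipelines.ldm_extension.models.custom_data_object import (
    ColumnDataType,
    CustomDatasetDefinition,
)

try:
    CustomDatasetDefinition(
        workspace_id="ws_1",
        dataset_id="budget",
        dataset_name="Budget",
        dataset_datasource_id="snowflake_ds",
        dataset_source_table="budget_table",        # both a table...
        dataset_source_sql="SELECT * FROM budget",  # ...and a SQL source
        parent_dataset_reference="customers",
        parent_dataset_reference_attribute_id="customer_id",
        dataset_reference_source_column="customer_id",
        dataset_reference_source_column_data_type=ColumnDataType.STRING,
        workspace_data_filter_id="wdf_client",
        workspace_data_filter_column_name="client_id",
    )
except ValidationError as error:
    # check_source rejects rows that define both (or neither) source.
    print(error)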
gooddata_pipelines/provisioning/entities/users/models/users.py
@@ -3,7 +3,16 @@
 from typing import Any
 
 from gooddata_sdk.catalog.user.entity_model.user import CatalogUser
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
+
+
+class UserProfile(BaseModel):
+    """Minimal model of api/v1/profile response.
+
+    Does not contain all fields from the response.
+    """
+
+    user_id: str = Field(alias="userId")
 
 
 class BaseUser(BaseModel):