gooddata-pipelines 1.50.0__py3-none-any.whl → 1.50.1.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gooddata-pipelines might be problematic.
- gooddata_pipelines/__init__.py +18 -0
- gooddata_pipelines/api/gooddata_api.py +55 -0
- gooddata_pipelines/backup_and_restore/backup_manager.py +36 -62
- gooddata_pipelines/backup_and_restore/constants.py +3 -7
- gooddata_pipelines/backup_and_restore/models/storage.py +4 -5
- gooddata_pipelines/ldm_extension/__init__.py +1 -0
- gooddata_pipelines/ldm_extension/input_processor.py +286 -0
- gooddata_pipelines/ldm_extension/input_validator.py +185 -0
- gooddata_pipelines/ldm_extension/ldm_extension_manager.py +283 -0
- gooddata_pipelines/ldm_extension/models/__init__.py +1 -0
- gooddata_pipelines/ldm_extension/models/aliases.py +9 -0
- gooddata_pipelines/ldm_extension/models/analytical_object.py +33 -0
- gooddata_pipelines/ldm_extension/models/custom_data_object.py +90 -0
- gooddata_pipelines/provisioning/entities/users/models/users.py +10 -1
- gooddata_pipelines/provisioning/entities/users/users.py +38 -0
- gooddata_pipelines/provisioning/provisioning.py +2 -3
- gooddata_pipelines/utils/__init__.py +9 -0
- gooddata_pipelines/utils/rate_limiter.py +64 -0
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/METADATA +11 -3
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/RECORD +22 -12
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/WHEEL +0 -0
- {gooddata_pipelines-1.50.0.dist-info → gooddata_pipelines-1.50.1.dev2.dist-info}/licenses/LICENSE.txt +0 -0
gooddata_pipelines/__init__.py
CHANGED
@@ -6,11 +6,22 @@ from ._version import __version__
 from .backup_and_restore.backup_manager import BackupManager
 from .backup_and_restore.models.storage import (
     BackupRestoreConfig,
+    LocalStorageConfig,
+    S3StorageConfig,
     StorageType,
 )
 from .backup_and_restore.storage.local_storage import LocalStorage
 from .backup_and_restore.storage.s3_storage import S3Storage

+# -------- LDM Extension --------
+from .ldm_extension.ldm_extension_manager import LdmExtensionManager
+from .ldm_extension.models.custom_data_object import (
+    ColumnDataType,
+    CustomDatasetDefinition,
+    CustomFieldDefinition,
+    CustomFieldType,
+)
+
 # -------- Provisioning --------
 from .provisioning.entities.user_data_filters.models.udf_models import (
     UserDataFilterFullLoad,
@@ -51,6 +62,8 @@ __all__ = [
     "UserIncrementalLoad",
     "UserGroupIncrementalLoad",
     "PermissionFullLoad",
+    "LocalStorageConfig",
+    "S3StorageConfig",
     "PermissionIncrementalLoad",
     "UserFullLoad",
     "UserGroupFullLoad",
@@ -61,5 +74,10 @@ __all__ = [
     "UserDataFilterProvisioner",
     "UserDataFilterFullLoad",
     "EntityType",
+    "LdmExtensionManager",
+    "CustomDatasetDefinition",
+    "CustomFieldDefinition",
+    "ColumnDataType",
+    "CustomFieldType",
     "__version__",
 ]
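All of the newly exported names are importable from the package root. A minimal import check against the `__all__` additions above (assuming the 1.50.1.dev2 wheel is installed):

from gooddata_pipelines import (
    ColumnDataType,
    CustomDatasetDefinition,
    CustomFieldDefinition,
    CustomFieldType,
    LdmExtensionManager,
    LocalStorageConfig,
    S3StorageConfig,
)

# The LDM extension manager and both storage configs now sit in the public API
# surface alongside the existing backup/restore and provisioning exports.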
gooddata_pipelines/api/gooddata_api.py
CHANGED
@@ -174,6 +174,49 @@ class ApiMethods:
         )
         return self._get(endpoint)

+    def get_all_metrics(self, workspace_id: str) -> requests.Response:
+        """Get all metrics from the specified workspace.
+
+        Args:
+            workspace_id (str): The ID of the workspace to retrieve metrics from.
+        Returns:
+            requests.Response: The response containing the metrics.
+        """
+        endpoint = f"/entities/workspaces/{workspace_id}/metrics"
+        headers = {**self.headers, "X-GDC-VALIDATE-RELATIONS": "true"}
+        return self._get(endpoint, headers=headers)
+
+    def get_all_visualization_objects(
+        self, workspace_id: str
+    ) -> requests.Response:
+        """Get all visualizations from the specified workspace.
+
+        Args:
+            workspace_id (str): The ID of the workspace to retrieve visualizations from.
+        Returns:
+            requests.Response: The response containing the visualizations.
+        """
+        endpoint = f"/entities/workspaces/{workspace_id}/visualizationObjects"
+        headers = {**self.headers, "X-GDC-VALIDATE-RELATIONS": "true"}
+        return self._get(endpoint, headers=headers)
+
+    def get_all_dashboards(self, workspace_id: str) -> requests.Response:
+        """Get all dashboards from the specified workspace.
+
+        Args:
+            workspace_id (str): The ID of the workspace to retrieve dashboards from.
+        Returns:
+            requests.Response: The response containing the dashboards.
+        """
+        endpoint = f"/entities/workspaces/{workspace_id}/analyticalDashboards"
+        headers = {**self.headers, "X-GDC-VALIDATE-RELATIONS": "true"}
+        return self._get(endpoint, headers=headers)
+
+    def get_profile(self) -> requests.Response:
+        """Returns organization and current user information."""
+        endpoint = "/profile"
+        return self._get(endpoint)
+
     def _get(
         self, endpoint: str, headers: dict[str, str] | None = None
     ) -> requests.Response:
@@ -253,3 +296,15 @@ class ApiMethods:
         url = self._get_url(endpoint)

         return requests.delete(url, headers=self.headers, timeout=TIMEOUT)
+
+    @staticmethod
+    def raise_if_response_not_ok(*responses: requests.Response) -> None:
+        """Check if responses from API calls are OK.
+
+        Raises ValueError if any response is not OK (status code not 2xx).
+        """
+        for response in responses:
+            if not response.ok:
+                raise ValueError(
+                    f"Request to {response.url} failed with status code {response.status_code}: {response.text}"
+                )
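The new read-only endpoints send the `X-GDC-VALIDATE-RELATIONS: true` header and return raw `requests.Response` objects, leaving status handling to the caller; `raise_if_response_not_ok` covers that in a single call. A hedged usage sketch, where `api` is assumed to be an already initialized client exposing the `ApiMethods` above and `collect_analytics` is a hypothetical helper:

import requests


def collect_analytics(api, workspace_id: str) -> dict[str, requests.Response]:
    """Fetch metrics, visualizations and dashboards, failing fast on any non-2xx response.

    `api` is assumed to be an initialized client exposing the ApiMethods shown above.
    """
    responses = {
        "metrics": api.get_all_metrics(workspace_id),
        "visualizations": api.get_all_visualization_objects(workspace_id),
        "dashboards": api.get_all_dashboards(workspace_id),
    }
    # Raises ValueError with the failing URL, status code and body if any call was not 2xx.
    api.raise_if_response_not_ok(*responses.values())
    return responses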
gooddata_pipelines/backup_and_restore/backup_manager.py
CHANGED
@@ -4,10 +4,8 @@ import json
 import os
 import shutil
 import tempfile
-import threading
 import time
 import traceback
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Type
@@ -39,6 +37,7 @@ from gooddata_pipelines.backup_and_restore.storage.s3_storage import (
     S3Storage,
 )
 from gooddata_pipelines.logger import LogObserver
+from gooddata_pipelines.utils.rate_limiter import RateLimiter


 @dataclass
@@ -60,6 +59,10 @@ class BackupManager:

         self.loader = BackupInputProcessor(self._api, self.config.api_page_size)

+        self._api_rate_limiter = RateLimiter(
+            calls_per_second=self.config.api_calls_per_second,
+        )
+
     @classmethod
     def create(
         cls: Type["BackupManager"],
@@ -95,11 +98,12 @@ class BackupManager:

     def get_user_data_filters(self, ws_id: str) -> dict:
         """Returns the user data filters for the specified workspace."""
-
-
-
-
-
+        with self._api_rate_limiter:
+            response: requests.Response = self._api.get_user_data_filters(ws_id)
+        if response.ok:
+            return response.json()
+        else:
+            raise RuntimeError(f"{response.status_code}: {response.text}")

     def _store_user_data_filters(
         self,
@@ -144,14 +148,17 @@ class BackupManager:

     def _get_automations_from_api(self, workspace_id: str) -> Any:
         """Returns automations for the workspace as JSON."""
-
-
-
-        else:
-            raise RuntimeError(
-                f"Failed to get automations for {workspace_id}. "
-                + f"{response.status_code}: {response.text}"
+        with self._api_rate_limiter:
+            response: requests.Response = self._api.get_automations(
+                workspace_id
             )
+        if response.ok:
+            return response.json()
+        else:
+            raise RuntimeError(
+                f"Failed to get automations for {workspace_id}. "
+                + f"{response.status_code}: {response.text}"
+            )

     def _store_automations(self, export_path: Path, workspace_id: str) -> None:
         """Stores the automations in the specified export path."""
@@ -183,7 +190,8 @@ class BackupManager:
     ) -> None:
         """Stores the filter views in the specified export path."""
         # Get the filter views YAML files from the API
-        self.
+        with self._api_rate_limiter:
+            self._api.store_declarative_filter_views(workspace_id, export_path)

         # Move filter views to the subfolder containing the analytics model
         self._move_folder(
@@ -231,7 +239,10 @@ class BackupManager:
         # the SDK. That way we could save and package all the declarations
         # directly instead of reorganizing the folder structures. That should
         # be more transparent/readable and possibly safer for threading
-        self.
+        with self._api_rate_limiter:
+            self._api.store_declarative_workspace(
+                workspace_id, export_path
+            )
         self.store_declarative_filter_views(export_path, workspace_id)
         self._store_automations(export_path, workspace_id)

@@ -291,7 +302,6 @@ class BackupManager:
     def _process_batch(
         self,
         batch: BackupBatch,
-        stop_event: threading.Event,
         retry_count: int = 0,
     ) -> None:
         """Processes a single batch of workspaces for backup.
@@ -299,10 +309,6 @@ class BackupManager:
         and retry with exponential backoff up to BackupSettings.MAX_RETRIES.
         The base wait time is defined by BackupSettings.RETRY_DELAY.
         """
-        if stop_event.is_set():
-            # If the stop_event flag is set, return. This will terminate the thread
-            return
-
         try:
             with tempfile.TemporaryDirectory() as tmpdir:
                 self._get_workspace_export(tmpdir, batch.list_of_ids)
@@ -314,10 +320,7 @@ class BackupManager:
                 self.storage.export(tmpdir, self.org_id)

         except Exception as e:
-            if
-                return
-
-            elif retry_count < BackupSettings.MAX_RETRIES:
+            if retry_count < BackupSettings.MAX_RETRIES:
                 # Retry with exponential backoff until MAX_RETRIES
                 next_retry = retry_count + 1
                 wait_time = BackupSettings.RETRY_DELAY**next_retry
@@ -328,52 +331,23 @@ class BackupManager:
                 )

                 time.sleep(wait_time)
-                self._process_batch(batch,
+                self._process_batch(batch, next_retry)
             else:
                 # If the batch fails after MAX_RETRIES, raise the error
                 self.logger.error(f"Batch failed: {e.__class__.__name__}: {e}")
                 raise

-    def
+    def _process_batches(
         self,
         batches: list[BackupBatch],
     ) -> None:
         """
-        Processes batches
-
+        Processes batches sequentially to avoid overloading the API.
+        If any batch fails, the processing will stop.
         """
-
-
-
-
-        with ThreadPoolExecutor(
-            max_workers=self.config.max_workers
-        ) as executor:
-            # Set the futures tasks.
-            futures = []
-            for batch in batches:
-                futures.append(
-                    executor.submit(
-                        self._process_batch,
-                        batch,
-                        stop_event,
-                    )
-                )
-
-            # Process futures as they complete
-            for future in as_completed(futures):
-                try:
-                    future.result()
-                except Exception:
-                    # On failure, set the flag to True - signal running processes to stop
-                    stop_event.set()
-
-                    # Cancel unstarted threads
-                    for f in futures:
-                        if not f.done():
-                            f.cancel()
-
-                    raise
+        for i, batch in enumerate(batches, 1):
+            self.logger.info(f"Processing batch {i}/{len(batches)}...")
+            self._process_batch(batch)

     def backup_workspaces(
         self,
@@ -440,7 +414,7 @@ class BackupManager:
                 f"Exporting {len(workspaces_to_export)} workspaces in {len(batches)} batches."
             )

-            self.
+            self._process_batches(batches)

             self.logger.info("Backup completed")
         except Exception as e:
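Every call site above now acquires `self._api_rate_limiter` before touching the API, and batches run sequentially instead of through a thread pool. The new `gooddata_pipelines/utils/rate_limiter.py` itself is not shown in this diff, so the following is only an illustrative sketch of a `calls_per_second` context-manager limiter compatible with those call sites, not the shipped implementation:

import threading
import time


class RateLimiter:
    """Illustrative sketch: ensure calls start at most `calls_per_second` per second."""

    def __init__(self, calls_per_second: float) -> None:
        self._min_interval = 1.0 / calls_per_second
        self._lock = threading.Lock()
        self._last_call = 0.0

    def __enter__(self) -> "RateLimiter":
        # Sleep long enough that consecutive entries are at least
        # 1 / calls_per_second seconds apart, then record the start time.
        with self._lock:
            wait = self._min_interval - (time.monotonic() - self._last_call)
            if wait > 0:
                time.sleep(wait)
            self._last_call = time.monotonic()
        return self

    def __exit__(self, *exc_info: object) -> None:
        # Nothing to release; the delay happens on entry.
        return None

Delaying on entry keeps the interval between consecutive API calls at or above 1 / calls_per_second even if such a limiter were shared across threads.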
gooddata_pipelines/backup_and_restore/constants.py
CHANGED
@@ -21,19 +21,15 @@ class DirNames:
     UDF = "user_data_filters"


-@dataclass(frozen=True)
-class ConcurrencyDefaults:
-    MAX_WORKERS = 1
-    DEFAULT_BATCH_SIZE = 100
-
-
 @dataclass(frozen=True)
 class ApiDefaults:
     DEFAULT_PAGE_SIZE = 100
+    DEFAULT_BATCH_SIZE = 100
+    DEFAULT_API_CALLS_PER_SECOND = 1.0


 @dataclass(frozen=True)
-class BackupSettings(
+class BackupSettings(ApiDefaults):
     MAX_RETRIES = 3
     RETRY_DELAY = 5  # seconds
     TIMESTAMP_SDK_FOLDER = (
gooddata_pipelines/backup_and_restore/models/storage.py
CHANGED
@@ -83,14 +83,13 @@ class BackupRestoreConfig(BaseModel):
             description="Batch size must be greater than 0",
         ),
     ] = Field(default=BackupSettings.DEFAULT_BATCH_SIZE)
-
-
+    api_calls_per_second: Annotated[
+        float,
         Field(
             gt=0,
-
-            description="Max workers must be greater than 0 and less than 3",
+            description="Maximum API calls per second (rate limiting)",
         ),
-    ] = Field(default=BackupSettings.
+    ] = Field(default=BackupSettings.DEFAULT_API_CALLS_PER_SECOND)

     @classmethod
     def from_yaml(cls, conf_path: str) -> "BackupRestoreConfig":
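The old max-workers knob is gone from the configuration; the setting that remains is `api_calls_per_second`, validated as a positive float and defaulting to `DEFAULT_API_CALLS_PER_SECOND` (1.0). A short, hedged loading sketch, where `backup_config.yaml` is a placeholder path and any other required keys in that file are assumed to be unchanged from 1.50.0:

from gooddata_pipelines import BackupRestoreConfig

# from_yaml is the existing loader classmethod shown above; the new
# api_calls_per_second key is optional and falls back to 1.0 when omitted.
config = BackupRestoreConfig.from_yaml("backup_config.yaml")
print(config.api_calls_per_second)  # 1.0 unless overridden in the YAML file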
gooddata_pipelines/ldm_extension/__init__.py
ADDED
@@ -0,0 +1 @@
+# (C) 2025 GoodData Corporation
gooddata_pipelines/ldm_extension/input_processor.py
ADDED
@@ -0,0 +1,286 @@
+# (C) 2025 GoodData Corporation
+"""Module for processing validated custom datasets and fields data.
+
+This module is responsible for converting validated custom datasets and fields
+into objects defined in the GoodData Python SDK.
+"""
+
+from gooddata_sdk.catalog.identifier import (
+    CatalogDatasetWorkspaceDataFilterIdentifier,
+    CatalogGrainIdentifier,
+    CatalogReferenceIdentifier,
+)
+from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.data_filter_references import (
+    CatalogDeclarativeWorkspaceDataFilterReferences,
+)
+from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.dataset.dataset import (
+    CatalogDataSourceTableIdentifier,
+    CatalogDeclarativeAttribute,
+    CatalogDeclarativeDataset,
+    CatalogDeclarativeDatasetSql,
+    CatalogDeclarativeFact,
+    CatalogDeclarativeReference,
+    CatalogDeclarativeReferenceSource,
+    CatalogDeclarativeWorkspaceDataFilterColumn,
+)
+from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.date_dataset.date_dataset import (
+    CatalogDeclarativeDateDataset,
+    CatalogGranularitiesFormatting,
+)
+from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.ldm import (
+    CatalogDeclarativeLdm,
+    CatalogDeclarativeModel,
+)
+
+from gooddata_pipelines.ldm_extension.models.aliases import DatasetId
+from gooddata_pipelines.ldm_extension.models.custom_data_object import (
+    ColumnDataType,
+    CustomDataset,
+    CustomFieldDefinition,
+    CustomFieldType,
+)
+
+
+class LdmExtensionDataProcessor:
+    """Create GoodData LDM from validated custom datasets and fields."""
+
+    DATE_GRANULARITIES: list[str] = [
+        "MINUTE",
+        "HOUR",
+        "DAY",
+        "WEEK",
+        "MONTH",
+        "QUARTER",
+        "YEAR",
+        "MINUTE_OF_HOUR",
+        "HOUR_OF_DAY",
+        "DAY_OF_WEEK",
+        "DAY_OF_MONTH",
+        "DAY_OF_YEAR",
+        "WEEK_OF_YEAR",
+        "MONTH_OF_YEAR",
+        "QUARTER_OF_YEAR",
+    ]
+
+    @staticmethod
+    def _attribute_from_field(
+        dataset_name: str,
+        custom_field: CustomFieldDefinition,
+    ) -> CatalogDeclarativeAttribute:
+        """Assign a declarative attribute from a custom field definition."""
+        return CatalogDeclarativeAttribute(
+            id=custom_field.custom_field_id,
+            title=custom_field.custom_field_name,
+            source_column=custom_field.custom_field_source_column,
+            labels=[],
+            source_column_data_type=custom_field.custom_field_source_column_data_type.value,
+            tags=[dataset_name],
+        )
+
+    @staticmethod
+    def _fact_from_field(
+        dataset_name: str,
+        custom_field: CustomFieldDefinition,
+    ) -> CatalogDeclarativeFact:
+        """Assign a declarative fact from a custom field definition."""
+        return CatalogDeclarativeFact(
+            id=custom_field.custom_field_id,
+            title=custom_field.custom_field_name,
+            source_column=custom_field.custom_field_source_column,
+            source_column_data_type=custom_field.custom_field_source_column_data_type.value,
+            tags=[dataset_name],
+        )
+
+    def _date_from_field(
+        self,
+        dataset_name: str,
+        custom_field: CustomFieldDefinition,
+    ) -> CatalogDeclarativeDateDataset:
+        """Assign a declarative date dataset from a custom field definition."""
+
+        return CatalogDeclarativeDateDataset(
+            id=custom_field.custom_field_id,
+            title=custom_field.custom_field_name,
+            granularities_formatting=CatalogGranularitiesFormatting(
+                title_base="",
+                title_pattern="%titleBase - %granularityTitle",
+            ),
+            granularities=self.DATE_GRANULARITIES,
+            tags=[dataset_name],
+        )
+
+    @staticmethod
+    def _date_ref_from_field(
+        custom_field: CustomFieldDefinition,
+    ) -> CatalogDeclarativeReference:
+        """Create a date reference from a custom field definition."""
+        return CatalogDeclarativeReference(
+            identifier=CatalogReferenceIdentifier(
+                id=custom_field.custom_field_id
+            ),
+            multivalue=False,
+            sources=[
+                CatalogDeclarativeReferenceSource(
+                    column=custom_field.custom_field_source_column,
+                    target=CatalogGrainIdentifier(
+                        id=custom_field.custom_field_id,
+                        type=CustomFieldType.DATE.value,
+                    ),
+                    data_type=custom_field.custom_field_source_column_data_type.value,
+                )
+            ],
+        )
+
+    @staticmethod
+    def _get_sources(
+        dataset: CustomDataset,
+    ) -> tuple[
+        CatalogDataSourceTableIdentifier | None,
+        CatalogDeclarativeDatasetSql | None,
+    ]:
+        """Get the data source table and SQL from the dataset definition."""
+        # We will have either a table id or a sql statement. Let's store
+        # whatever data is available to variables and pass it to the
+        # dataset. Both can be object instances or None, but at least one
+        # should be valid as per prior validation.
+        dataset_source_table_id = (
+            CatalogDataSourceTableIdentifier(
+                id=dataset.definition.dataset_source_table,
+                data_source_id=dataset.definition.dataset_datasource_id,
+                path=[dataset.definition.dataset_source_table],
+            )
+            if dataset.definition.dataset_source_table
+            else None
+        )
+
+        dataset_sql = (
+            CatalogDeclarativeDatasetSql(
+                statement=dataset.definition.dataset_source_sql,
+                data_source_id=dataset.definition.dataset_datasource_id,
+            )
+            if dataset.definition.dataset_source_sql
+            else None
+        )
+        return dataset_source_table_id, dataset_sql
+
+    def datasets_to_ldm(
+        self, datasets: dict[DatasetId, CustomDataset]
+    ) -> CatalogDeclarativeModel:
+        """Convert validated datasets to GoodData declarative model.
+
+        Args:
+            datasets (dict[DatasetId, CustomDataset]): Dictionary of validated
+                datasets.
+        Returns:
+            CatalogDeclarativeModel: GoodData declarative model representation
+                of the datasets.
+        """
+
+        declarative_datasets: list[CatalogDeclarativeDataset] = []
+
+        # Date dimensions are not stored in a dataset, but as a separate datasets
+        # in `date_instances` object on the LDM
+        date_instances: list[CatalogDeclarativeDateDataset] = []
+
+        for dataset in datasets.values():
+            date_references: list[CatalogDeclarativeReference] = []
+            attributes: list[CatalogDeclarativeAttribute] = []
+            facts: list[CatalogDeclarativeFact] = []
+
+            # Iterate through the custom fields and create the appropriate objects
+            for custom_field in dataset.custom_fields:
+                if custom_field.custom_field_type == CustomFieldType.ATTRIBUTE:
+                    attributes.append(
+                        self._attribute_from_field(
+                            dataset.definition.dataset_name, custom_field
+                        )
+                    )
+
+                elif custom_field.custom_field_type == CustomFieldType.FACT:
+                    facts.append(
+                        self._fact_from_field(
+                            dataset.definition.dataset_name, custom_field
+                        )
+                    )
+
+                # Process date dimensions and store them to date_instances. Date
+                # dimensions are not stored in a dataset, but as a separate dataset.
+                # However, they need to be referenced in the dataset references to
+                # create the connection between the dataset and the date dimension
+                # in the GoodData Logical Data Model.
+                elif custom_field.custom_field_type == CustomFieldType.DATE:
+                    # Add the date dimension to the date_instances
+                    date_instances.append(
+                        self._date_from_field(
+                            dataset.definition.dataset_name, custom_field
+                        )
+                    )
+
+                    # Create a reference so that the date dimension is connected
+                    # to the dataset in the GoodData Logical Data Model.
+                    date_references.append(
+                        self._date_ref_from_field(custom_field)
+                    )
+
+                else:
+                    raise ValueError(
+                        f"Unsupported custom field type: {custom_field.custom_field_type}"
+                    )
+
+            # Get the data source info
+            dataset_source_table_id, dataset_sql = self._get_sources(dataset)
+
+            # Construct the declarative dataset object and append it to the list.
+            declarative_datasets.append(
+                CatalogDeclarativeDataset(
+                    id=dataset.definition.dataset_id,
+                    title=dataset.definition.dataset_name,
+                    grain=[],
+                    references=[
+                        CatalogDeclarativeReference(
+                            identifier=CatalogReferenceIdentifier(
+                                id=dataset.definition.parent_dataset_reference,
+                            ),
+                            multivalue=True,
+                            sources=[
+                                CatalogDeclarativeReferenceSource(
+                                    column=dataset.definition.dataset_reference_source_column,
+                                    data_type=dataset.definition.dataset_reference_source_column_data_type.value,
+                                    target=CatalogGrainIdentifier(
+                                        id=dataset.definition.parent_dataset_reference_attribute_id,
+                                        type=CustomFieldType.ATTRIBUTE.value,
+                                    ),
+                                )
+                            ],
+                        ),
+                    ]
+                    + date_references,
+                    description=None,
+                    attributes=attributes,
+                    facts=facts,
+                    data_source_table_id=dataset_source_table_id,
+                    sql=dataset_sql,
+                    workspace_data_filter_columns=[
+                        CatalogDeclarativeWorkspaceDataFilterColumn(
+                            name=dataset.definition.workspace_data_filter_column_name,
+                            data_type=ColumnDataType.STRING.value,
+                        )
+                    ],
+                    workspace_data_filter_references=[
+                        CatalogDeclarativeWorkspaceDataFilterReferences(
+                            filter_id=CatalogDatasetWorkspaceDataFilterIdentifier(
+                                id=dataset.definition.workspace_data_filter_id
+                            ),
+                            filter_column=dataset.definition.workspace_data_filter_column_name,
+                            filter_column_data_type=ColumnDataType.STRING.value,
+                        )
+                    ],
+                    tags=[dataset.definition.dataset_name],
+                )
+            )
+
+        # Create the Logical Data Model from the datasets and the date instances.
+        ldm = CatalogDeclarativeLdm(
+            datasets=declarative_datasets, date_instances=date_instances
+        )
+        return CatalogDeclarativeModel(ldm=ldm)