PyPI - pycarlo - Versions diffs - 0.12.24__py3-none-any.whl - Mend

pycarlo 0.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pycarlo might be problematic. Click here for more details.

Files changed (48) hide show

pycarlo/__init__.py +0 -0
pycarlo/common/__init__.py +31 -0
pycarlo/common/errors.py +31 -0
pycarlo/common/files.py +78 -0
pycarlo/common/http.py +36 -0
pycarlo/common/mcon.py +26 -0
pycarlo/common/retries.py +129 -0
pycarlo/common/settings.py +89 -0
pycarlo/common/utils.py +51 -0
pycarlo/core/__init__.py +10 -0
pycarlo/core/client.py +267 -0
pycarlo/core/endpoint.py +289 -0
pycarlo/core/operations.py +25 -0
pycarlo/core/session.py +127 -0
pycarlo/features/__init__.py +10 -0
pycarlo/features/circuit_breakers/__init__.py +3 -0
pycarlo/features/circuit_breakers/exceptions.py +10 -0
pycarlo/features/circuit_breakers/service.py +346 -0
pycarlo/features/dbt/__init__.py +3 -0
pycarlo/features/dbt/dbt_importer.py +208 -0
pycarlo/features/dbt/queries.py +31 -0
pycarlo/features/exceptions.py +18 -0
pycarlo/features/metadata/__init__.py +32 -0
pycarlo/features/metadata/asset_allow_block_list.py +22 -0
pycarlo/features/metadata/asset_filters_container.py +79 -0
pycarlo/features/metadata/base_allow_block_list.py +137 -0
pycarlo/features/metadata/metadata_allow_block_list.py +94 -0
pycarlo/features/metadata/metadata_filters_container.py +262 -0
pycarlo/features/pii/__init__.py +5 -0
pycarlo/features/pii/constants.py +3 -0
pycarlo/features/pii/pii_filterer.py +179 -0
pycarlo/features/pii/queries.py +20 -0
pycarlo/features/pii/service.py +56 -0
pycarlo/features/user/__init__.py +4 -0
pycarlo/features/user/exceptions.py +10 -0
pycarlo/features/user/models.py +9 -0
pycarlo/features/user/queries.py +13 -0
pycarlo/features/user/service.py +71 -0
pycarlo/lib/README.md +35 -0
pycarlo/lib/__init__.py +0 -0
pycarlo/lib/schema.json +210020 -0
pycarlo/lib/schema.py +82620 -0
pycarlo/lib/types.py +68 -0
pycarlo-0.12.24.dist-info/LICENSE +201 -0
pycarlo-0.12.24.dist-info/METADATA +249 -0
pycarlo-0.12.24.dist-info/RECORD +48 -0
pycarlo-0.12.24.dist-info/WHEEL +5 -0
pycarlo-0.12.24.dist-info/top_level.txt +1 -0

pycarlo/core/session.py ADDED Viewed

@@ -0,0 +1,127 @@
+import configparser
+import os
+import uuid
+from dataclasses import InitVar, dataclass, field
+from importlib.metadata import version as get_version
+from typing import Optional
+from pycarlo.common import get_logger
+from pycarlo.common.errors import InvalidConfigFileError, InvalidSessionError
+from pycarlo.common.settings import (
+    DEFAULT_CONFIG_PATH,
+    DEFAULT_MCD_API_ENDPOINT,
+    DEFAULT_MCD_API_ENDPOINT_CONFIG_KEY,
+    DEFAULT_MCD_API_ID_CONFIG_KEY,
+    DEFAULT_MCD_API_TOKEN_CONFIG_KEY,
+    DEFAULT_MCD_IGW_ENDPOINT,
+    DEFAULT_PACKAGE_NAME,
+    DEFAULT_PROFILE_NAME,
+    MCD_API_ENDPOINT,
+    MCD_DEFAULT_API_ID,
+    MCD_DEFAULT_API_TOKEN,
+    MCD_DEFAULT_PROFILE,
+    MCD_USER_ID_HEADER,
+    PROFILE_FILE_NAME,
+)
+# Module-level logger, obtained from the package's get_logger helper under this module's name.
+logger = get_logger(__name__)
+@dataclass
+class Session:
+    """
+    Creates an MC access session.
+    Auth resolution hierarchy -
+    1. Passing credentials (mcd_id & mcd_token)
+    2. Environment variables (MCD_DEFAULT_API_ID & MCD_DEFAULT_API_TOKEN)
+    3. Config-file by passing passing profile name (mcd_profile)
+    4. Config-file by setting the profile as an environment variable (MCD_DEFAULT_PROFILE)
+    5. Config-file by default profile name (default)
+    Environment vars can be mixed with passed credentials, but not the config-file profile.
+    If necessary the MC API url can be overridden by specifying an endpoint.
+    The config-file path can be set via mcd_config_path.
+    An optional scope can be set to configure the Session to use the Integration Gateway
+    REST API instead of the GraphQL API.
+    """
+    mcd_id: InitVar[Optional[str]] = None
+    mcd_token: InitVar[Optional[str]] = None
+    mcd_profile: InitVar[Optional[str]] = None
+    mcd_config_path: InitVar[str] = DEFAULT_CONFIG_PATH
+    id: str = field(init=False)
+    token: str = field(init=False)
+    session_name: str = field(init=False)
+    endpoint: str = DEFAULT_MCD_API_ENDPOINT
+    user_id: Optional[str] = MCD_USER_ID_HEADER
+    scope: Optional[str] = None
+    def __post_init__(
+        self,
+        mcd_id: Optional[str],
+        mcd_token: Optional[str],
+        mcd_profile: Optional[str],
+        mcd_config_path: str,
+    ):
+        version = get_version(DEFAULT_PACKAGE_NAME)
+        self.session_name = f"python-sdk-{version}-{uuid.uuid4()}"
+        logger.info(f"Creating named session as '{self.session_name}'.")
+        mcd_id = mcd_id or MCD_DEFAULT_API_ID
+        mcd_token = mcd_token or MCD_DEFAULT_API_TOKEN
+        if mcd_id and mcd_token:
+            self.id = mcd_id
+            self.token = mcd_token
+        elif mcd_id or mcd_token:
+            raise InvalidSessionError("Partially setting a session is not supported.")
+        else:
+            self._read_config(
+                mcd_profile=mcd_profile or MCD_DEFAULT_PROFILE or DEFAULT_PROFILE_NAME,
+                mcd_config_path=mcd_config_path,
+            )
+        if MCD_API_ENDPOINT:
+            self.endpoint = MCD_API_ENDPOINT
+        elif self.scope and self.endpoint == DEFAULT_MCD_API_ENDPOINT:
+            # if scope is set and endpoint is the default one, change it to IGW
+            self.endpoint = DEFAULT_MCD_IGW_ENDPOINT
+        session_type = "GATEWAY_API" if self.scope else "APPLICATION_API"
+        logger.info(f"Created {session_type} session with MC API ID '{self.id}'.")
+    def _read_config(self, mcd_profile: str, mcd_config_path: str) -> None:
+        """
+        Return configuration from section (profile name) if it exists.
+        """
+        config_parser = Session._get_config_parser()
+        file_path = os.path.join(mcd_config_path, PROFILE_FILE_NAME)
+        logger.info(
+            "No provided connection details. Looking up session values from "
+            f"'{mcd_profile}' in '{file_path}'."
+        )
+        try:
+            config_parser.read(file_path)
+            self.id = config_parser.get(mcd_profile, DEFAULT_MCD_API_ID_CONFIG_KEY)
+            self.token = config_parser.get(mcd_profile, DEFAULT_MCD_API_TOKEN_CONFIG_KEY)
+            self.endpoint = config_parser.get(
+                mcd_profile,
+                DEFAULT_MCD_API_ENDPOINT_CONFIG_KEY,
+                fallback=DEFAULT_MCD_API_ENDPOINT,
+            )
+        except configparser.NoSectionError:
+            raise InvalidSessionError(f"Profile '{mcd_profile}' not found in '{file_path}'.")
+        except Exception as err:
+            raise InvalidConfigFileError from err
+    @staticmethod
+    def _get_config_parser() -> configparser.ConfigParser:
+        """
+        Gets a configparser
+        """
+        return configparser.ConfigParser()

pycarlo/features/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from random import randrange
+def downtime() -> None:
+    """
+    A bit of fun. Demos SDK feature extensions.
+    """
+    if randrange(10) >= 5:
+        raise SystemExit("Bad luck.")
+    print("No data downtime found.")

pycarlo/features/circuit_breakers/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from pycarlo.features.circuit_breakers.service import CircuitBreakerService
+# Names exported by a star-import of this package.
+__all__ = ["CircuitBreakerService"]

pycarlo/features/circuit_breakers/exceptions.py ADDED Viewed

@@ -0,0 +1,10 @@
+from typing import Any
+class CircuitBreakerPipelineException(Exception):
+    pass
+class CircuitBreakerPollException(Exception):
+    def __init__(self, msg: str = "Polling timed out or contains a malformed log.", *args: Any):
+        super().__init__(msg, *args)

pycarlo/features/circuit_breakers/service.py ADDED Viewed

@@ -0,0 +1,346 @@
+import json
+import time
+from typing import Callable, Dict, List, Optional, Sequence, Union, cast
+from uuid import UUID
+from box import Box
+from pycarlo.common import get_logger
+from pycarlo.common.settings import (
+    HEADER_MCD_TELEMETRY_REASON,
+    HEADER_MCD_TELEMETRY_SERVICE,
+    RequestReason,
+)
+from pycarlo.common.utils import boxify
+from pycarlo.core import Client, Mutation, Query
+from pycarlo.features.circuit_breakers.exceptions import (
+    CircuitBreakerPipelineException,
+    CircuitBreakerPollException,
+)
+from pycarlo.lib.schema import CircuitBreakerState, SqlJobCheckpointStatus
+# Module-level logger; its info method is the default print_func of CircuitBreakerService.
+logger = get_logger(__name__)
+class CircuitBreakerService:
+    _TERM_STATES = {"PROCESSING_COMPLETE", "HAS_ERROR"}
+    def __init__(
+        self,
+        mc_client: Optional[Client] = None,
+        print_func: Callable = logger.info,
+    ):
+        """
+        Convenience methods to help with using circuit breaker rules.
+        :param mc_client: MCD client (e.g. for creating a custom session); created otherwise.
+        :param print_func: Function to use for echoing. Uses python logging by default, which
+                           requires setting MCD_VERBOSE_ERRORS.
+        """
+        self._client = mc_client or Client()
+        self._print_func = print_func
+    def trigger_and_poll(
+        self,
+        rule_uuid: Optional[Union[str, UUID]] = None,
+        namespace: Optional[str] = None,
+        rule_name: Optional[str] = None,
+        timeout_in_minutes: int = 5,
+        runtime_variables: Optional[Dict[str, str]] = None,
+    ) -> Optional[bool]:
+        """
+        Convenience method to both trigger and poll (wait) on circuit breaker rule execution.
+        :param rule_uuid: UUID of the rule (custom SQL monitor) to execute.
+        :param namespace: namespace of the rule (custom SQL monitor) to execute.
+        :param rule_name: name of the rule (custom SQL monitor) to execute.
+        :param timeout_in_minutes: Polling timeout in minutes. See poll() for details.
+        :param runtime_variables: runtime variables to use when executing the rule
+        :return: True if rule execution has breach; False otherwise. See poll() for any
+                 exceptions raised.
+        """
+        breaches = self.poll_all(
+            job_execution_uuids=self.trigger_all(
+                rule_uuid=rule_uuid,
+                namespace=namespace,
+                rule_name=rule_name,
+                runtime_variables=runtime_variables,
+            ),
+            timeout_in_minutes=timeout_in_minutes,
+        )
+        return bool(breaches > 0)
+    def trigger(
+        self,
+        rule_uuid: Optional[Union[str, UUID]] = None,
+        namespace: Optional[str] = None,
+        rule_name: Optional[str] = None,
+    ) -> str:
+        """
+        Trigger a rule to start execution with circuit breaker checkpointing.
+        :param rule_uuid: UUID of the rule (custom SQL monitor) to execute.
+        :param namespace: namespace of the rule (custom SQL monitor) to execute.
+        :param rule_name: name of the rule (custom SQL monitor) to execute.
+        :return: Job execution UUID, as a string, to be used to retrieve execution state / status.
+        """
+        mutation = Mutation()
+        if rule_uuid:
+            mutation.trigger_circuit_breaker_rule(rule_uuid=str(rule_uuid)).__fields__(
+                "job_execution_uuid"
+            )
+        elif rule_name:
+            if namespace:
+                mutation.trigger_circuit_breaker_rule(
+                    namespace=namespace, rule_name=rule_name
+                ).__fields__("job_execution_uuid")
+            else:
+                mutation.trigger_circuit_breaker_rule(rule_name=rule_name).__fields__(
+                    "job_execution_uuid"
+                )
+        else:
+            raise ValueError("rule UUID or namespace and rule name must be specified")
+        mutation_client = self._client(
+            mutation,
+            additional_headers={
+                HEADER_MCD_TELEMETRY_REASON: RequestReason.SERVICE.value,
+                HEADER_MCD_TELEMETRY_SERVICE: "circuit_breaker_service",
+            },
+        )
+        job_execution_uuid = mutation_client.trigger_circuit_breaker_rule.job_execution_uuid
+        self._print_func(
+            f"Triggered rule with ID '{rule_uuid}'. "
+            f"Received '{job_execution_uuid}' as execution ID."
+        )
+        return cast(str, job_execution_uuid)
+    def trigger_all(
+        self,
+        rule_uuid: Optional[Union[str, UUID]] = None,
+        namespace: Optional[str] = None,
+        rule_name: Optional[str] = None,
+        runtime_variables: Optional[Dict[str, str]] = None,
+    ) -> List[str]:
+        """
+        Trigger a rule to start execution with circuit breaker checkpointing.
+        This function supports rules that create multiple executions (e.g. rules with variables
+        or over multiple tables)
+        :param rule_uuid: UUID of the rule (custom SQL monitor) to execute.
+        :param namespace: namespace of the rule (custom SQL monitor) to execute.
+        :param rule_name: name of the rule (custom SQL monitor) to execute.
+        :param runtime_variables: runtime variables to use when executing the rule
+        :return: Job execution UUIDs, as strings, to be used to retrieve execution state / status.
+        """
+        mutation = Mutation()
+        runtime_variables_list: Optional[List[Dict[str, str]]]
+        if runtime_variables:
+            runtime_variables_list = [
+                {"name": key, "value": value} for key, value in runtime_variables.items()
+            ]
+        else:
+            runtime_variables_list = None
+        if rule_uuid:
+            mutation.trigger_circuit_breaker_rule_v2(
+                rule_uuid=str(rule_uuid),
+                **({"runtime_variables": runtime_variables_list} if runtime_variables else {}),
+            ).__fields__("job_execution_uuids")
+        elif rule_name:
+            if namespace:
+                mutation.trigger_circuit_breaker_rule_v2(
+                    namespace=namespace,
+                    rule_name=rule_name,
+                    **({"runtime_variables": runtime_variables_list} if runtime_variables else {}),
+                ).__fields__("job_execution_uuids")
+            else:
+                mutation.trigger_circuit_breaker_rule_v2(
+                    rule_name=rule_name,
+                    **({"runtime_variables": runtime_variables_list} if runtime_variables else {}),
+                ).__fields__("job_execution_uuids")
+        else:
+            raise ValueError("rule UUID or namespace and rule name must be specified")
+        job_execution_uuids = [
+            str(id)
+            for id in self._client(
+                mutation,
+                additional_headers={
+                    HEADER_MCD_TELEMETRY_REASON: RequestReason.SERVICE.value,
+                    HEADER_MCD_TELEMETRY_SERVICE: "circuit_breaker_service",
+                },
+            ).trigger_circuit_breaker_rule_v2.job_execution_uuids
+        ]
+        self._print_func(
+            f"Triggered rule with ID '{rule_uuid}'. "
+            f"Received {job_execution_uuids} as execution IDs."
+        )
+        return job_execution_uuids
+    def poll(
+        self,
+        job_execution_uuid: Union[str, UUID],
+        timeout_in_minutes: int = 5,
+    ) -> Optional[int]:
+        """
+        Poll status / state of an execution for a triggered rule. Polls until status is in a term
+        state or timeout.
+        :param job_execution_uuid: UUID for the job execution of a rule (custom SQL monitor).
+        :param timeout_in_minutes: Polling timeout in minutes. Note that The Data Collector Lambda
+                                   has a max timeout of 15 minutes when executing a query. Queries
+                                   that take longer to execute are not supported, so we recommend
+                                   filtering down the query output to improve performance (e.g limit
+                                   WHERE clause). If you expect a query to take the full 15 minutes
+                                   we recommend padding the timeout to 20 minutes.
+        :return: Breach count across all executions. A greater than 0 value indicates a breach.
+        :raise CircuitBreakerPipelineException: An error in executing the
+                                                rule (e.g. error in query).
+        :raise CircuitBreakerPollException: A timeout during polling or a malformed response.
+        """
+        return self.poll_all([job_execution_uuid], timeout_in_minutes=timeout_in_minutes)
+    def poll_all(
+        self,
+        job_execution_uuids: Sequence[Union[str, UUID]],
+        timeout_in_minutes: int = 5,
+    ) -> int:
+        """
+        Poll status / state of executions for a triggered rule. Polls until status is in a term
+        state or timeout.
+        :param job_execution_uuids: UUIDs for the job executions of a rule (custom SQL monitor).
+        :param timeout_in_minutes: Polling timeout in minutes. Note that The Data Collector Lambda
+                                   has a max timeout of 15 minutes when executing a query. Queries
+                                   that take longer to execute are not supported, so we recommend
+                                   filtering down the query output to improve performance (e.g limit
+                                   WHERE clause). If you expect a query to take the full 15 minutes
+                                   we recommend padding the timeout to 20 minutes.
+        :return: Breach count. A greater than 0 value indicates a breach.
+        :raise CircuitBreakerPipelineException: An error in executing the
+                                                rule (e.g. error in query).
+        :raise CircuitBreakerPollException: A timeout during polling or a malformed response.
+        """
+        logs = cast(
+            List[Box],
+            self._poll(
+                job_execution_uuids=job_execution_uuids,
+                timeout_in_minutes=timeout_in_minutes,
+            ),
+        )
+        if not logs:
+            raise CircuitBreakerPollException
+        self._print_func(
+            "Completed polling. Retrieved execution with logs "
+            f"{list(map(str, logs))} for IDs {job_execution_uuids}."
+        )
+        breaches = 0
+        has_breaches = False
+        if logs and len(logs) > 0:
+            for log in logs:
+                if log.payload.error:
+                    logs_str = "\n".join(str(log) for log in logs)
+                    raise CircuitBreakerPipelineException(
+                        f"Execution pipeline errored out. Details:\n{logs_str}"
+                    )
+                if log.payload.breach_count is not None:
+                    breaches += log.payload.breach_count
+                    has_breaches = True
+        if not has_breaches:
+            raise CircuitBreakerPollException
+        return breaches
+    @boxify(use_snakes=True, default_box_attr=None, default_box=True)
+    def _poll(
+        self,
+        job_execution_uuids: Sequence[Union[str, UUID]],
+        timeout_in_minutes: int,
+        sleep_interval_in_seconds: int = 15,
+    ) -> Optional[List[Box]]:
+        timeout_start = time.time()
+        while time.time() < timeout_start + 60 * timeout_in_minutes:
+            query = Query()
+            query.get_circuit_breaker_rule_state_v2(
+                job_execution_uuids=map(str, job_execution_uuids)
+            ).__fields__("status", "log")
+            circuit_rule_breaker_states = cast(
+                List[CircuitBreakerState],
+                self._client(
+                    query,
+                    additional_headers={
+                        HEADER_MCD_TELEMETRY_REASON: RequestReason.SERVICE.value,
+                        HEADER_MCD_TELEMETRY_SERVICE: "circuit_breaker_service",
+                    },
+                ).get_circuit_breaker_rule_state_v2,
+            )
+            aggregated_status = self._get_aggregated_status(circuit_rule_breaker_states)
+            self._print_func(
+                f"Retrieved execution with aggregated status '{aggregated_status}' for "
+                f"IDs {job_execution_uuids}."
+            )
+            if aggregated_status in self._TERM_STATES:
+                return self._get_payloads(circuit_rule_breaker_states, aggregated_status)
+            self._print_func(
+                f"Aggregated state is not terminal state for IDs {job_execution_uuids}. "
+                f"Polling again in '{sleep_interval_in_seconds}' seconds."
+            )
+            time.sleep(sleep_interval_in_seconds)
+    def _get_log_payload(self, log: str):
+        log_entries = json.loads(log)
+        log_entries.reverse()
+        for entry in log_entries:
+            if "payload" in entry:
+                return Box(entry, default_box_attr=None, default_box=True)
+        return Box()
+    def _get_payloads(
+        self,
+        states: List[CircuitBreakerState],
+        status: SqlJobCheckpointStatus,
+    ) -> List[Box]:
+        payloads = []
+        for state in states:
+            if state.status == status:
+                payloads.append(self._get_log_payload(str(state.log)))
+        return payloads
+    @staticmethod
+    def _get_aggregated_status(states: List[CircuitBreakerState]) -> SqlJobCheckpointStatus:
+        if not states:
+            return SqlJobCheckpointStatus.REGISTERED  # type: ignore
+        status_by_state = {}
+        for state in states:
+            status_by_state.setdefault(state.status, []).append(state)
+        def all_in_state(s: SqlJobCheckpointStatus):
+            return len(status_by_state.get(s, [])) == len(states)
+        return (
+            SqlJobCheckpointStatus.HAS_ERROR  # type: ignore
+            if status_by_state.get(SqlJobCheckpointStatus.HAS_ERROR)  # type: ignore
+            else SqlJobCheckpointStatus.PROCESSING_COMPLETE  # type: ignore
+            if all_in_state(SqlJobCheckpointStatus.PROCESSING_COMPLETE)  # type: ignore
+            else SqlJobCheckpointStatus.PROCESSING_START  # type: ignore
+            if status_by_state.get(SqlJobCheckpointStatus.PROCESSING_COMPLETE)  # type: ignore
+            or all_in_state(SqlJobCheckpointStatus.PROCESSING_START)  # type: ignore
+            else SqlJobCheckpointStatus.EXECUTING_COMPLETE  # type: ignore
+            if all_in_state(SqlJobCheckpointStatus.PROCESSING_COMPLETE)  # type: ignore
+            else SqlJobCheckpointStatus.EXECUTING_START  # type: ignore
+            if status_by_state.get(SqlJobCheckpointStatus.EXECUTING_COMPLETE)  # type: ignore
+            or all_in_state(SqlJobCheckpointStatus.EXECUTING_START)  # type: ignore
+            else SqlJobCheckpointStatus.REGISTERED  # type: ignore
+        )

pycarlo/features/dbt/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from pycarlo.features.dbt.dbt_importer import DbtImporter
+# Names exported by a star-import of this package.
+__all__ = ["DbtImporter"]