castor-extractor 0.22.0__py3-none-any.whl → 0.22.5__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +20 -0
- castor_extractor/utils/__init__.py +1 -0
- castor_extractor/utils/time.py +4 -0
- castor_extractor/utils/time_test.py +8 -1
- castor_extractor/visualization/looker_studio/__init__.py +6 -0
- castor_extractor/visualization/looker_studio/assets.py +6 -0
- castor_extractor/visualization/looker_studio/client/__init__.py +3 -0
- castor_extractor/visualization/looker_studio/client/admin_sdk_client.py +90 -0
- castor_extractor/visualization/looker_studio/client/client.py +37 -0
- castor_extractor/visualization/looker_studio/client/credentials.py +20 -0
- castor_extractor/visualization/looker_studio/client/endpoints.py +18 -0
- castor_extractor/visualization/looker_studio/client/enums.py +8 -0
- castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py +102 -0
- castor_extractor/visualization/looker_studio/client/pagination.py +31 -0
- castor_extractor/visualization/looker_studio/client/scopes.py +6 -0
- castor_extractor/visualization/sigma/client/client.py +64 -10
- castor_extractor/visualization/thoughtspot/assets.py +3 -1
- castor_extractor/visualization/thoughtspot/client/client.py +67 -14
- castor_extractor/visualization/thoughtspot/client/utils.py +10 -4
- castor_extractor/visualization/thoughtspot/client/utils_test.py +22 -4
- castor_extractor/warehouse/databricks/api_client.py +2 -60
- castor_extractor/warehouse/databricks/client.py +4 -47
- castor_extractor/warehouse/databricks/client_test.py +1 -35
- castor_extractor/warehouse/databricks/credentials.py +4 -6
- castor_extractor/warehouse/databricks/enums.py +15 -0
- castor_extractor/warehouse/databricks/extract.py +13 -11
- castor_extractor/warehouse/databricks/lineage.py +47 -119
- castor_extractor/warehouse/databricks/lineage_test.py +86 -31
- castor_extractor/warehouse/databricks/sql_client.py +23 -8
- castor_extractor/warehouse/databricks/types.py +0 -7
- castor_extractor/warehouse/salesforce/format.py +12 -5
- castor_extractor/warehouse/salesforce/format_test.py +22 -6
- {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/METADATA +23 -1
- {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/RECORD +37 -26
- castor_extractor/warehouse/databricks/test_constants.py +0 -79
- {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/LICENCE +0 -0
- {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/WHEEL +0 -0
- {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
@@ -1,6 +1,26 @@
 
 # Changelog
 
+## 0.22.5 - 2025-01-09
+
+* Databricks: validate and deduplicate lineage links
+
+## 0.22.4 - 2025-01-08
+
+* ThoughtSpot: extract answers
+
+## 0.22.3 - 2024-12-10
+
+* Databricks: extract lineage from system tables
+
+## 0.22.2 - 2024-12-06
+
+* Sigma: multithreading to retrieve lineage
+
+## 0.22.1 - 2024-12-05
+
+* Salesforce: deduplicate tables
+
 ## 0.22.0 - 2024-12-04
 
 * Stop supporting python3.8
castor_extractor/utils/time.py
CHANGED
@@ -63,5 +63,9 @@ def format_date(timestamp: Union[datetime, date]) -> str:
     return timestamp.strftime(ISO_FORMAT)
 
 
+def format_rfc_3339_date(timestamp: datetime) -> str:
+    return timestamp.isoformat(timespec="seconds") + "Z"
+
+
 def yesterday() -> date:
     return current_date() - timedelta(days=1)
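For orientation, a quick standalone check of the new helper (it mirrors the test added in time_test.py below; note the trailing "Z" assumes the input datetime is naive and already expressed in UTC):

from datetime import datetime

# Same body as castor_extractor.utils.time.format_rfc_3339_date
def format_rfc_3339_date(timestamp: datetime) -> str:
    return timestamp.isoformat(timespec="seconds") + "Z"

print(format_rfc_3339_date(datetime(1995, 4, 3, 2, 1)))  # 1995-04-03T02:01:00Z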
castor_extractor/utils/time_test.py
CHANGED
@@ -1,6 +1,6 @@
 from datetime import date, datetime
 
-from .time import at_midnight, date_after, timestamp_ms
+from .time import at_midnight, date_after, format_rfc_3339_date, timestamp_ms
 
 
 def test_at_midnight():
@@ -17,3 +17,10 @@ def test_timestamp_ms():
     result = timestamp_ms(dt)
     expected = 670636800000
     assert result == expected
+
+
+def test_format_rfc_3339_date():
+    dt = datetime(1995, 4, 3, 2, 1)
+    result = format_rfc_3339_date(dt)
+    expected = "1995-04-03T02:01:00Z"
+    assert result == expected
castor_extractor/visualization/looker_studio/client/admin_sdk_client.py
ADDED
@@ -0,0 +1,90 @@
+from typing import Iterator, Optional
+
+from google.oauth2.service_account import Credentials
+from googleapiclient import discovery  # type: ignore
+
+from ....utils import (
+    at_midnight,
+    current_date,
+    fetch_all_pages,
+    format_rfc_3339_date,
+    past_date,
+)
+from .credentials import LookerStudioCredentials
+from .pagination import LookerStudioPagination
+from .scopes import SCOPES
+
+USER_EMAIL_FIELD = "primaryEmail"
+
+
+class AdminSDKClient:
+    """
+    Client to call the Report API and Directory API.
+    The service account must impersonate an admin account.
+    """
+
+    def __init__(self, credentials: LookerStudioCredentials):
+        self._credentials = Credentials.from_service_account_info(
+            credentials.model_dump(),
+            scopes=SCOPES,
+            subject=credentials.admin_email,  # impersonates an admin
+        )
+        self.directory_api = discovery.build(
+            "admin", "directory_v1", credentials=self._credentials
+        )
+        self.report_api = discovery.build(
+            "admin", "reports_v1", credentials=self._credentials
+        )
+
+    def list_users(self) -> Iterator[dict]:
+        """
+        Lists all users in the domain; only the primaryEmail field is selected.
+        Note:
+        * `my_customer` is an alias to represent the account's `customerId`
+        * `domain_public` allows non-admins to list users. This is technically
+          not necessary here because an admin account is impersonated, but it
+          avoids tapping into unnecessary data & serves for future reference.
+        See
+        https://googleapis.github.io/google-api-python-client/docs/dyn/admin_directory_v1.users.html#list
+        https://developers.google.com/admin-sdk/directory/reference/rest/v1/users/list
+        https://developers.google.com/admin-sdk/directory/v1/guides/manage-users#retrieve_users_non_admin
+        https://stackoverflow.com/a/71083443/14448410
+        """
+
+        def _users(pagination_params: Optional[dict] = None) -> dict:
+            parameters = {
+                "viewType": "domain_public",
+                "customer": "my_customer",
+                "fields": f"users({USER_EMAIL_FIELD}), nextPageToken",
+                **(pagination_params or {}),
+            }
+
+            return self.directory_api.users().list(**parameters).execute()
+
+        yield from fetch_all_pages(_users, LookerStudioPagination)
+
+    def list_view_events(self) -> Iterator[dict]:
+        """
+        Lists all Data Studio View events of the past day.
+        See
+        https://googleapis.github.io/google-api-python-client/docs/dyn/admin_reports_v1.activities.html
+        https://developers.google.com/admin-sdk/reports/reference/rest/v1/activities/list
+        https://developers.google.com/admin-sdk/reports/v1/appendix/activity/data-studio#VIEW
+        """
+
+        def _activity(pagination_params: Optional[dict] = None) -> dict:
+            yesterday = format_rfc_3339_date(at_midnight(past_date(1)))
+            today = format_rfc_3339_date(at_midnight(current_date()))
+
+            parameters = {
+                "userKey": "all",
+                "applicationName": "data_studio",
+                "eventName": "VIEW",
+                "startTime": yesterday,
+                "endTime": today,
+                **(pagination_params or {}),
+            }
+
+            return self.report_api.activities().list(**parameters).execute()
+
+        yield from fetch_all_pages(_activity, LookerStudioPagination)
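For orientation, a hypothetical usage sketch of the new client; every credential value below is a placeholder, and the field names simply match the LookerStudioCredentials model added further down:

# Hypothetical usage sketch — placeholder service-account values only.
# Import paths follow the file layout shown in this diff.
from castor_extractor.visualization.looker_studio.client.admin_sdk_client import AdminSDKClient
from castor_extractor.visualization.looker_studio.client.credentials import LookerStudioCredentials

service_account_info = {
    "type": "service_account",
    "project_id": "my-project",
    "private_key_id": "abc123",
    "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
    "client_email": "extractor@my-project.iam.gserviceaccount.com",
    "client_id": "1234567890",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/extractor",
    "admin_email": "admin@example.com",  # admin user to impersonate
}

credentials = LookerStudioCredentials(**service_account_info)
client = AdminSDKClient(credentials)

for user in client.list_users():
    print(user["primaryEmail"])

for event in client.list_view_events():
    print(event)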
castor_extractor/visualization/looker_studio/client/client.py
ADDED
@@ -0,0 +1,37 @@
+from typing import Iterator
+
+from .. import LookerStudioAsset
+from .admin_sdk_client import USER_EMAIL_FIELD, AdminSDKClient
+from .credentials import LookerStudioCredentials
+from .looker_studio_api_client import LookerStudioAPIClient
+
+
+class LookerStudioClient:
+    """
+    Acts as a wrapper class to fetch Looker Studio assets, which requires
+    coordinating calls between the Admin SDK API and the Looker Studio API.
+    """
+
+    def __init__(self, credentials: LookerStudioCredentials):
+        self.admin_sdk_client = AdminSDKClient(credentials)
+        self.looker_studio_client = LookerStudioAPIClient(credentials)
+
+    def _get_assets(self) -> Iterator[dict]:
+        """
+        Extracts reports and data sources user by user.
+        """
+        users = self.admin_sdk_client.list_users()
+
+        for user in users:
+            email = user[USER_EMAIL_FIELD]
+            yield from self.looker_studio_client.fetch_user_assets(email)
+
+    def fetch(self, asset: LookerStudioAsset) -> Iterator[dict]:
+        if asset == LookerStudioAsset.VIEW_ACTIVITY:
+            yield from self.admin_sdk_client.list_view_events()
+
+        elif asset == LookerStudioAsset.ASSETS:
+            yield from self._get_assets()
+
+        else:
+            raise ValueError(f"The asset {asset}, is not supported")
castor_extractor/visualization/looker_studio/client/credentials.py
ADDED
@@ -0,0 +1,20 @@
+from pydantic import BaseModel, SecretStr, field_serializer
+
+
+class LookerStudioCredentials(BaseModel):
+    admin_email: str
+    auth_provider_x509_cert_url: str
+    auth_uri: str
+    client_email: str
+    client_id: str
+    client_x509_cert_url: str
+    private_key: SecretStr
+    private_key_id: str
+    project_id: str
+    token_uri: str
+    type: str
+
+    @field_serializer("private_key")
+    def dump_secret(self, pk):
+        """When using model_dump, show private_key value"""
+        return pk.get_secret_value()
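A short aside on why the serializer is there, as a minimal sketch of pydantic's default SecretStr behaviour (not part of the package):

from pydantic import BaseModel, SecretStr, field_serializer

class Plain(BaseModel):
    private_key: SecretStr

class Revealed(BaseModel):
    private_key: SecretStr

    @field_serializer("private_key")
    def dump_secret(self, pk):
        return pk.get_secret_value()

# By default model_dump keeps the secret wrapped and masked, which would not
# be usable by Credentials.from_service_account_info; the serializer exposes
# the raw key at dump time.
print(Plain(private_key="pk").model_dump())     # {'private_key': SecretStr('**********')}
print(Revealed(private_key="pk").model_dump())  # {'private_key': 'pk'}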
castor_extractor/visualization/looker_studio/client/endpoints.py
ADDED
@@ -0,0 +1,18 @@
+class LookerStudioAPIEndpoint:
+    BASE_PATH = "https://datastudio.googleapis.com"
+
+    @classmethod
+    def search(cls) -> str:
+        """
+        Search a user's assets.
+        See https://developers.google.com/looker-studio/integrate/api/reference/assets/search
+        """
+        return f"{cls.BASE_PATH}/v1/assets:search"
+
+    @classmethod
+    def permissions(cls, asset_name: str) -> str:
+        """
+        Get the permissions of an asset. The user must be the owner of the asset.
+        See https://developers.google.com/looker-studio/integrate/api/reference/permissions/get
+        """
+        return f"{cls.BASE_PATH}/v1/assets/{asset_name}/permissions"
castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py
ADDED
@@ -0,0 +1,102 @@
+from functools import partial
+from typing import Iterator, Optional
+
+from google.auth.transport.requests import Request
+from google.oauth2.service_account import Credentials
+
+from ....utils import (
+    APIClient,
+    BearerAuth,
+    fetch_all_pages,
+)
+from .credentials import LookerStudioCredentials
+from .endpoints import LookerStudioAPIEndpoint
+from .enums import LookerStudioAssetType
+from .pagination import LookerStudioPagination
+from .scopes import SCOPES
+
+
+class LookerStudioAPIAuth(BearerAuth):
+    def __init__(
+        self,
+        credentials: LookerStudioCredentials,
+        subject: Optional[str] = None,
+    ):
+        """
+        Instantiates the service account credentials.
+        If a `subject` email is passed, the service account will impersonate
+        that user and make requests on that user's behalf.
+        """
+        self._credentials = Credentials.from_service_account_info(
+            credentials.model_dump(), scopes=SCOPES
+        )
+        if subject:
+            self._credentials = self._credentials.with_subject(subject)
+
+    def fetch_token(self):
+        self._credentials.refresh(Request())
+        return self._credentials.token
+
+
+class LookerStudioAPIClient(APIClient):
+    def __init__(self, credentials: LookerStudioCredentials):
+        auth = LookerStudioAPIAuth(credentials=credentials)
+        super().__init__(auth=auth)
+
+        self._credentials = credentials
+
+    def _is_private_asset(self, asset_name: str) -> bool:
+        """
+        Returns True if the asset is not viewable by anyone other than the owner.
+
+        The permissions dict contains `Role: Member[]` key-value pairs and has
+        at least one key-value pair to define the asset's unique OWNER.
+        If another key is present, it means the asset was shared with
+        another person or group.
+
+        See also https://developers.google.com/looker-studio/integrate/api/reference/types#Permissions
+        """
+        data = self._get(LookerStudioAPIEndpoint.permissions(asset_name))
+        permissions = data["permissions"]
+        return len(permissions.keys()) == 1
+
+    def _user_assets(
+        self, asset_type: LookerStudioAssetType, user_email: str
+    ) -> Iterator[dict]:
+        """
+        Yields all assets of the given type, owned by the given user and visible
+        by other members.
+        """
+        request = partial(
+            self._get,
+            LookerStudioAPIEndpoint.search(),
+            params={"assetTypes": [asset_type.value]},
+        )
+        assets = fetch_all_pages(request, LookerStudioPagination)
+
+        for asset in assets:
+            asset_name = asset["name"]
+            owner = asset["owner"]
+            if owner == user_email and not self._is_private_asset(asset_name):
+                yield asset
+
+    def _impersonate_user(self, user_email: str):
+        self._auth = LookerStudioAPIAuth(
+            credentials=self._credentials, subject=user_email
+        )
+
+    def fetch_user_assets(self, user_email: str) -> Iterator[dict]:
+        """Yields assets (reports and data sources) shared by the given user."""
+        self._impersonate_user(user_email)
+
+        reports = self._user_assets(
+            asset_type=LookerStudioAssetType.REPORT,
+            user_email=user_email,
+        )
+        data_sources = self._user_assets(
+            asset_type=LookerStudioAssetType.DATA_SOURCE,
+            user_email=user_email,
+        )
+
+        yield from reports
+        yield from data_sources
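To make the _is_private_asset rule concrete, here is an illustrative permissions payload; the exact member format is an assumption, only the "single OWNER key means private" logic comes from the code above:

# Illustrative payloads only — the member string format is an assumption.
private_asset = {"permissions": {"OWNER": {"members": ["user:owner@example.com"]}}}
shared_asset = {
    "permissions": {
        "OWNER": {"members": ["user:owner@example.com"]},
        "VIEWER": {"members": ["user:analyst@example.com"]},
    }
}

def is_private(payload: dict) -> bool:
    # Same rule as LookerStudioAPIClient._is_private_asset: a single role key
    # (the owner) means the asset was never shared with anyone else.
    return len(payload["permissions"].keys()) == 1

assert is_private(private_asset)
assert not is_private(shared_asset)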
castor_extractor/visualization/looker_studio/client/pagination.py
ADDED
@@ -0,0 +1,31 @@
+from typing import Optional
+
+from pydantic import AliasChoices, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ....utils import PaginationModel
+
+NEXT_PAGE_KEY = "pageToken"
+
+
+class LookerStudioPagination(PaginationModel):
+    items: list = Field(
+        default_factory=list,
+        validation_alias=AliasChoices("items", "users", "assets"),
+    )
+    next_page_token: Optional[str] = None
+
+    model_config = ConfigDict(
+        alias_generator=to_camel,
+        populate_by_name=True,
+        from_attributes=True,
+    )
+
+    def is_last(self) -> bool:
+        return self.next_page_token is None
+
+    def next_page_payload(self) -> dict:
+        return {NEXT_PAGE_KEY: self.next_page_token}
+
+    def page_results(self) -> list:
+        return self.items
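fetch_all_pages itself is not part of this diff, but its call sites above imply a loop along these lines; this is a sketch under that assumption, not the package's actual implementation:

from typing import Callable, Iterator, Optional, Type

def fetch_all_pages_sketch(
    request: Callable[[Optional[dict]], dict],
    pagination: Type[LookerStudioPagination],
) -> Iterator[dict]:
    # Call the endpoint, validate the raw payload with the pagination model,
    # yield the page's items, then feed the next-page payload back into the
    # request until is_last() reports the final page.
    payload: Optional[dict] = None
    while True:
        page = pagination.model_validate(request(payload))
        yield from page.page_results()
        if page.is_last():
            return
        payload = page.next_page_payload()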
castor_extractor/visualization/sigma/client/client.py
CHANGED
@@ -1,9 +1,11 @@
 from collections.abc import Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from http import HTTPStatus
 from typing import Callable, Optional
 
 import requests
+from pydantic import BaseModel
 
 from ....utils import (
     APIClient,
@@ -12,6 +14,7 @@ from ....utils import (
     build_url,
     fetch_all_pages,
     handle_response,
+    retry,
 )
 from ..assets import SigmaAsset
 from .credentials import SigmaCredentials
@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
 )
 
 _AUTH_TIMEOUT_S = 60
-
+_SIGMA_TIMEOUT_S = 300
 
 _SIGMA_HEADERS = {
     "Content-Type": _CONTENT_TYPE,
@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
     max_errors=_VOLUME_IGNORED,
     status_codes=_IGNORED_ERROR_CODES,
 )
+_THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
+_RETRY_NUMBER = 1
+_RETRY_BASE_MS = 60_000
+
+
+class LineageContext(BaseModel):
+    """all info needed to build the endpoint for lineage retrieval"""
+
+    workbook_id: str
+    element_id: str
+
+
+class Lineage(BaseModel):
+    """holds response from lineage API and context used to retrieve it"""
+
+    lineage: dict
+    context: LineageContext
 
 
 class SigmaBearerAuth(BearerAuth):
@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
             host=credentials.host,
             auth=auth,
             headers=_SIGMA_HEADERS,
-            timeout=
+            timeout=_SIGMA_TIMEOUT_S,
             safe_mode=safe_mode or SIGMA_SAFE_MODE,
         )
 
@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
                 page=page, workbook_id=workbook_id
            )
 
-
+    @retry(
+        (ConnectionError,),
+        max_retries=_RETRY_NUMBER,
+        base_ms=_RETRY_BASE_MS,
+        log_exc_info=True,
+    )
+    def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+        """
+        return the lineage from API and other ids needed to characterize
+        lineage in castor
+        """
+        workbook_id = lineage_context.workbook_id
+        element_id = lineage_context.element_id
+        endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+        return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+    @staticmethod
+    def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+        """
+        Helper function to prepare context for lineage retrieval.
+        Elements without associated columns are skipped.
+        """
+        contexts: list[LineageContext] = []
        for element in elements:
-
-
-
-
+            if element.get("columns") is None:
+                continue
+
+            context = LineageContext(
+                workbook_id=element["workbook_id"],
+                element_id=element["elementId"],
            )
+            contexts.append(context)
+        return contexts
+
+    def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+        lineage_context = self._lineage_context(elements)
+
+        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+            results = executor.map(self._get_lineage, lineage_context)
+
+            for lineage in results:
                yield {
-                    **lineage,
-                    "workbook_id": workbook_id,
-                    "element_id": element_id,
+                    **lineage.lineage,
+                    "workbook_id": lineage.context.workbook_id,
+                    "element_id": lineage.context.element_id,
                }
 
    def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:
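The multithreaded fan-out above follows the standard executor.map pattern; a standalone sketch with a stand-in for the API call (the names below are illustrative, not from the package):

from concurrent.futures import ThreadPoolExecutor

def fake_get_lineage(context: dict) -> dict:
    # Stand-in for SigmaClient._get_lineage: a real call would hit the Sigma
    # lineage endpoint for one workbook element.
    return {"edges": [], **context}

contexts = [
    {"workbook_id": "wb-1", "element_id": "el-1"},
    {"workbook_id": "wb-1", "element_id": "el-2"},
]

with ThreadPoolExecutor(max_workers=10) as executor:
    # map() runs the calls concurrently but yields results in input order,
    # which is why each result can simply carry its context along with it.
    for result in executor.map(fake_get_lineage, contexts):
        print(result["workbook_id"], result["element_id"])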
castor_extractor/visualization/thoughtspot/assets.py
CHANGED
@@ -4,6 +4,8 @@ from ...types import ExternalAsset
 class ThoughtspotAsset(ExternalAsset):
     """Thoughtspot assets"""
 
+    ANSWERS = "answers"
+    ANSWER_USAGES = "answer_usages"
     LIVEBOARDS = "liveboards"
+    LIVEBOARD_USAGES = "liveboard_usages"
     LOGICAL_TABLES = "logical_tables"
-    USAGES = "usages"
castor_extractor/visualization/thoughtspot/client/client.py
CHANGED
@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
     "Content-Type": "application/json",
 }
 _METADATA_BATCH_SIZE = 100
-
+# https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+_OBJECT_USAGE_LIVEBOARD = "Object Usage"
+_ANSWER_USAGE_VIZ = "Answer Usage, by User"
+# https://docs.thoughtspot.com/cloud/latest/user-adoption
+_USER_ADOPTION_LIVEBOARD = "User Adoption"
+_LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
 # By default, no errors are ignored for the moment
 THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 
@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
     def _metadata_search(
         self,
         metadata_type: str,
+        identifier: Optional[str] = None,
     ) -> Iterator[dict]:
+        """
+        Yields assets of the given asset type, and optionally filters on a
+        specific identifier.
+        """
         offset = 0
+
         while True:
+            search_filters = {
+                "metadata": [{"type": metadata_type}],
+                "include_details": True,
+                "record_size": _METADATA_BATCH_SIZE,
+                "record_offset": offset,
+            }
+            if identifier:
+                search_filters["metadata"] = {
+                    "identifier": identifier,
+                    "type": metadata_type,
+                }
+
             metadata = self._post(
                 ThoughtspotEndpointFactory.metadata_search(),
-                data=
-                    "metadata": [{"type": metadata_type}],
-                    "include_details": True,
-                    "record_size": _METADATA_BATCH_SIZE,
-                    "record_offset": offset,
-                },
+                data=search_filters,
             )
             yield from metadata
             if len(metadata) < _METADATA_BATCH_SIZE:
                 break
             offset = offset + _METADATA_BATCH_SIZE
 
+    def _get_all_answers(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="ANSWER")
+
     def _get_all_liveboards(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LIVEBOARD")
 
@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
     def _get_all_tables(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
-    def
+    def _get_usages(
+        self,
+        liveboard_name: str,
+        visualization_name: str,
+    ) -> Iterator[dict]:
+        """
+        Yields the data of a given visualization in the given liveboard.
+        ThoughtSpot maintains two system liveboards with stats about data usage,
+        which are useful to compute view counts and popularity.
+        """
+        usage_liveboard = next(
+            self._metadata_search(
+                metadata_type="LIVEBOARD", identifier=liveboard_name
+            )
+        )
+        liveboard_id = usage_liveboard["metadata_id"]
+
         data = self._post(
             endpoint=ThoughtspotEndpointFactory.liveboard(),
             headers={"Accept": "application/octet-stream"},
             data={
-                "metadata_identifier":
+                "metadata_identifier": liveboard_id,
                 "file_format": "CSV",
-                "visualization_identifiers": [
-                    "Popular Liveboards Last 30 Days"
-                ],
+                "visualization_identifiers": [visualization_name],
             },
             handler=lambda x: x.text,
         )
         yield from usage_liveboard_reader(data)
 
-    def
+    def _get_answer_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+            visualization_name=_ANSWER_USAGE_VIZ,
+        )
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_USER_ADOPTION_LIVEBOARD,
+            visualization_name=_LIVEBOARD_USAGE_VIZ,
+        )
+
+    def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+        if asset == ThoughtspotAsset.ANSWERS:
+            yield from self._get_all_answers()
+
+        if asset == ThoughtspotAsset.ANSWER_USAGES:
+            yield from self._get_answer_usages()
+
         if asset == ThoughtspotAsset.LIVEBOARDS:
             yield from self._get_all_liveboards()
 
-        if asset == ThoughtspotAsset.
+        if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
             yield from self._get_liveboards_usages()
 
         if asset == ThoughtspotAsset.LOGICAL_TABLES: