castor-extractor 0.24.55__py3-none-any.whl → 0.25.2__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Files changed (29)
  1. CHANGELOG.md +23 -0
  2. castor_extractor/commands/extract_count.py +22 -0
  3. castor_extractor/commands/extract_powerbi.py +12 -1
  4. castor_extractor/visualization/count/__init__.py +3 -0
  5. castor_extractor/visualization/count/assets.py +11 -0
  6. castor_extractor/visualization/count/client/__init__.py +2 -0
  7. castor_extractor/visualization/count/client/client.py +50 -0
  8. castor_extractor/visualization/count/client/credentials.py +10 -0
  9. castor_extractor/visualization/count/client/queries/canvas_permissions.sql +6 -0
  10. castor_extractor/visualization/count/client/queries/canvases.sql +6 -0
  11. castor_extractor/visualization/count/client/queries/cells.sql +8 -0
  12. castor_extractor/visualization/count/client/queries/projects.sql +5 -0
  13. castor_extractor/visualization/count/client/queries/users.sql +8 -0
  14. castor_extractor/visualization/count/extract.py +54 -0
  15. castor_extractor/visualization/powerbi/client/__init__.py +1 -0
  16. castor_extractor/visualization/powerbi/client/authentication.py +20 -2
  17. castor_extractor/visualization/powerbi/client/credentials.py +9 -2
  18. castor_extractor/visualization/powerbi/extract.py +23 -3
  19. castor_extractor/visualization/sigma/assets.py +1 -1
  20. castor_extractor/visualization/sigma/client/client.py +45 -108
  21. castor_extractor/visualization/sigma/client/endpoints.py +4 -0
  22. castor_extractor/visualization/sigma/client/pagination.py +21 -1
  23. castor_extractor/visualization/sigma/client/sources_transformer.py +28 -9
  24. castor_extractor/visualization/sigma/extract.py +8 -6
  25. {castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/METADATA +26 -2
  26. {castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/RECORD +29 -17
  27. {castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/entry_points.txt +1 -0
  28. {castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/LICENCE +0 -0
  29. {castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/WHEEL +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
  # Changelog
 
+ ## 0.25.2 - 2025-09-30
+
+ * PowerBi: Support auth with private_key
+
+ ## 0.25.1 - 2025-09-29
+
+ * Sigma: catch ReadTimeouts during elements extraction
+
+ ## 0.25.0 - 2025-09-15
+
+ * Count: adding connector
+
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8
castor_extractor/commands/extract_count.py ADDED
@@ -0,0 +1,22 @@
+ from argparse import ArgumentParser
+
+ from castor_extractor.utils import parse_filled_arguments # type: ignore
+ from castor_extractor.visualization import count # type: ignore
+
+
+ def main():
+     parser = ArgumentParser()
+
+     parser.add_argument(
+         "-c",
+         "--credentials",
+         help="GCP credentials as string",
+     )
+     parser.add_argument("-o", "--output", help="Directory to write to")
+     parser.add_argument(
+         "-d",
+         "--dataset_id",
+         help="dataset id, where count info is stored for the current customer",
+     )
+
+     count.extract_all(**parse_filled_arguments(parser))
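
Note: the new castor-extract-count console script (registered in entry_points.txt at the bottom of this diff) simply forwards these flags to count.extract_all. A minimal programmatic sketch of the same call -- the credential string and dataset name below are placeholders, not values from this release:

    from castor_extractor.visualization import count

    count.extract_all(
        credentials="<gcp-service-account-json>",  # -c / --credentials
        output="/tmp/castor-output",               # -o / --output
        dataset_id="count_metadata",               # -d / --dataset_id (hypothetical name)
    )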
castor_extractor/commands/extract_powerbi.py CHANGED
@@ -9,10 +9,21 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
 
  def main():
      parser = ArgumentParser()
+     auth_group = parser.add_mutually_exclusive_group(required=True)
 
      parser.add_argument("-t", "--tenant_id", help="PowerBi tenant ID")
      parser.add_argument("-c", "--client_id", help="PowerBi client ID")
-     parser.add_argument("-s", "--secret", help="PowerBi password")
+     auth_group.add_argument(
+         "-s",
+         "--secret",
+         help="PowerBi password as a string",
+     )
+     auth_group.add_argument(
+         "-cert",
+         "--certificate",
+         help="file path to json certificate file with "
+         "keys: private_key, thumbprint, public_certificate",
+     )
      parser.add_argument(
          "-sc",
          "--scopes",
castor_extractor/visualization/count/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .assets import CountAsset
+ from .client import CountClient, CountCredentials
+ from .extract import extract_all
castor_extractor/visualization/count/assets.py ADDED
@@ -0,0 +1,11 @@
+ from ...types import ExternalAsset
+
+
+ class CountAsset(ExternalAsset):
+     """Count assets"""
+
+     CANVASES = "canvases"
+     CANVAS_PERMISSIONS = "canvas_permissions"
+     CELLS = "cells"
+     PROJECTS = "projects"
+     USERS = "users"
castor_extractor/visualization/count/client/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .client import CountClient
+ from .credentials import CountCredentials
castor_extractor/visualization/count/client/client.py ADDED
@@ -0,0 +1,50 @@
+ import logging
+ from dataclasses import asdict
+ from typing import Any, Iterator
+
+ from ....utils import load_file
+ from ....warehouse.bigquery import BigQueryClient
+ from ..assets import (
+     CountAsset,
+ )
+ from .credentials import CountCredentials
+
+ logger = logging.getLogger(__name__)
+
+ _QUERIES_FOLDER = "queries"
+
+
+ class CountClient(BigQueryClient):
+     """
+     Count.co does not currently provide an official API.
+     Instead, metadata such as dashboards, users, and queries is made available through
+     special metadata tables stored in BigQuery.
+
+     This client extends `BigQueryClient` to access and interact with those metadata tables.
+     """
+
+     def __init__(self, credentials: CountCredentials):
+         super().__init__(asdict(credentials))
+         self.project_id = credentials.project_id
+         self.dataset_id = credentials.dataset_id
+
+     def _load_query(self, asset: CountAsset) -> str:
+         query = load_file(
+             f"{_QUERIES_FOLDER}/{asset.name.lower()}.sql", __file__
+         )
+         return query.format(
+             project_id=self.project_id, dataset_id=self.dataset_id
+         )
+
+     def fetch(self, asset: CountAsset) -> Iterator[dict[str, Any]]:
+         """
+         Fetch the asset given as param, by running a BigQuery query.
+         """
+         logger.info(f"Running BigQuery query to fetch: {asset.name}")
+
+         query_str = self._load_query(asset)
+         job = self.client.query(query_str)
+         results = job.result()
+
+         for row in results:
+             yield dict(row)
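
Note: a hedged usage sketch of the client above. The credential fields are assumed to follow BigQueryCredentials plus the dataset_id added in credentials.py below; all values are placeholders:

    from castor_extractor.visualization.count import (
        CountAsset,
        CountClient,
        CountCredentials,
    )

    creds = CountCredentials(
        project_id="my-gcp-project",
        dataset_id="count_metadata",  # hypothetical dataset holding the Count tables
        # ...remaining BigQueryCredentials fields (service-account details)
    )
    client = CountClient(credentials=creds)
    for row in client.fetch(CountAsset.CANVASES):
        print(row["key"], row["title"])  # columns from canvases.sql below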
castor_extractor/visualization/count/client/credentials.py ADDED
@@ -0,0 +1,10 @@
+ from pydantic.dataclasses import dataclass
+
+ from ....warehouse.bigquery import BigQueryCredentials
+
+
+ @dataclass
+ class CountCredentials(BigQueryCredentials):
+     """Count credentials extending BigQuery credentials with additional dataset information"""
+
+     dataset_id: str
castor_extractor/visualization/count/client/queries/canvas_permissions.sql ADDED
@@ -0,0 +1,6 @@
+ SELECT
+     canvas_key,
+     type,
+     role,
+     user_key
+ FROM `{project_id}.{dataset_id}.canvas_permissions`
castor_extractor/visualization/count/client/queries/canvases.sql ADDED
@@ -0,0 +1,6 @@
+ SELECT
+     key,
+     project_key,
+     title
+ FROM `{project_id}.{dataset_id}.canvases`
+
castor_extractor/visualization/count/client/queries/cells.sql ADDED
@@ -0,0 +1,8 @@
+ SELECT
+     key,
+     canvas_key,
+     name,
+     type,
+     connection_key
+ FROM `{project_id}.{dataset_id}.cells`
+
castor_extractor/visualization/count/client/queries/projects.sql ADDED
@@ -0,0 +1,5 @@
+ SELECT
+     key,
+     name
+ FROM `{project_id}.{dataset_id}.projects`
+
castor_extractor/visualization/count/client/queries/users.sql ADDED
@@ -0,0 +1,8 @@
+ SELECT
+     key,
+     created_at,
+     name,
+     email,
+     role
+ FROM `{project_id}.{dataset_id}.users`
+
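
Note: these templates are rendered by CountClient._load_query through str.format. A quick illustration with placeholder identifiers:

    template = "SELECT key, name FROM `{project_id}.{dataset_id}.projects`"
    print(template.format(project_id="my-gcp-project", dataset_id="count_metadata"))
    # -> SELECT key, name FROM `my-gcp-project.count_metadata.projects`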
castor_extractor/visualization/count/extract.py ADDED
@@ -0,0 +1,54 @@
+ import logging
+ from typing import Iterable, Iterator, Union
+
+ from ...utils import (
+     OUTPUT_DIR,
+     current_timestamp,
+     deep_serialize,
+     from_env,
+     get_output_filename,
+     write_json,
+     write_summary,
+ )
+ from .assets import (
+     CountAsset,
+ )
+ from .client import (
+     CountClient,
+     CountCredentials,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def iterate_all_data(
+     client: CountClient,
+ ) -> Iterable[tuple[CountAsset, Union[list, Iterator, dict]]]:
+     """Iterate over the extracted data from count"""
+
+     for asset in CountAsset:
+         logger.info(f"Extracting {asset.value} from API")
+         data = client.fetch(asset)
+         yield asset, deep_serialize(data)
+
+
+ def extract_all(**kwargs) -> None:
+     """
+     Extract data from count BigQuery project
+     Store the output files locally under the given output_directory
+     """
+     _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
+     dataset_id = kwargs.get("dataset_id")
+     if not dataset_id:
+         raise ValueError("dataset_id is required")
+
+     credentials = CountCredentials(**kwargs)
+     client = CountClient(credentials=credentials)
+
+     ts = current_timestamp()
+
+     for key, data in iterate_all_data(client):
+         filename = get_output_filename(key.name.lower(), _output_directory, ts)
+         write_json(filename, list(data))
+
+     write_summary(_output_directory, ts)
castor_extractor/visualization/powerbi/client/__init__.py CHANGED
@@ -3,5 +3,6 @@ from .credentials import (
      CLIENT_APP_BASE,
      DEFAULT_SCOPE,
      REST_API_BASE_PATH,
+     PowerbiCertificate,
      PowerbiCredentials,
  )
castor_extractor/visualization/powerbi/client/authentication.py CHANGED
@@ -1,11 +1,24 @@
+ from typing import Optional, Union
+
  import msal # type: ignore
 
  from ....utils import BearerAuth
  from .constants import Keys
- from .credentials import PowerbiCredentials
+ from .credentials import PowerbiCertificate, PowerbiCredentials
  from .endpoints import PowerBiEndpointFactory
 
 
+ def _get_client_credential(
+     secret: Optional[str], certificate: Optional[PowerbiCertificate]
+ ) -> Union[str, dict]:
+     if secret:
+         return secret
+     if certificate:
+         return certificate.model_dump()
+
+     raise ValueError("Either certificate or secret must be provided.")
+
+
  class PowerBiBearerAuth(BearerAuth):
      def __init__(self, credentials: PowerbiCredentials):
          self.credentials = credentials
@@ -14,10 +27,15 @@ class PowerBiBearerAuth(BearerAuth):
              api_base=self.credentials.api_base,
          )
          authority = endpoint_factory.authority(self.credentials.tenant_id)
+
+         client_credential = _get_client_credential(
+             self.credentials.secret, self.credentials.certificate
+         )
+
          self.app = msal.ConfidentialClientApplication(
              client_id=self.credentials.client_id,
              authority=authority,
-             client_credential=self.credentials.secret,
+             client_credential=client_credential,
          )
 
      def fetch_token(self):
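
Note: when a certificate is configured, certificate.model_dump() produces a plain dict, which is the form msal.ConfidentialClientApplication accepts as client_credential for certificate-based auth. Its shape, with placeholder values:

    client_credential = {
        "private_key": "-----BEGIN PRIVATE KEY-----\n...",
        "thumbprint": "<hex-thumbprint>",
        "public_certificate": "-----BEGIN CERTIFICATE-----\n...",  # None when omitted
    }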
castor_extractor/visualization/powerbi/client/credentials.py CHANGED
@@ -1,6 +1,6 @@
  from typing import Optional
 
- from pydantic import Field, field_validator
+ from pydantic import BaseModel, field_validator
  from pydantic_settings import BaseSettings, SettingsConfigDict
 
  DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
@@ -10,6 +10,12 @@ CLIENT_APP_BASE = "https://login.microsoftonline.com"
  REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
 
 
+ class PowerbiCertificate(BaseModel):
+     public_certificate: Optional[str] = None
+     private_key: str
+     thumbprint: str
+
+
  class PowerbiCredentials(BaseSettings):
      """Class to handle PowerBI rest API permissions"""
 
@@ -21,7 +27,8 @@ class PowerbiCredentials(BaseSettings):
 
      client_id: str
      tenant_id: str
-     secret: str = Field(repr=False)
+     secret: Optional[str] = None
+     certificate: Optional[PowerbiCertificate] = None
      api_base: str = REST_API_BASE_PATH
      login_url: str = CLIENT_APP_BASE
      scopes: list[str] = [DEFAULT_SCOPE]
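
Note: a minimal construction sketch for the new certificate path, with placeholder values; secret simply stays None:

    cert = PowerbiCertificate(
        private_key="-----BEGIN PRIVATE KEY-----\n...",
        thumbprint="<certificate-thumbprint>",
    )
    creds = PowerbiCredentials(
        client_id="<app-client-id>",
        tenant_id="<azure-tenant-id>",
        certificate=cert,
    )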
castor_extractor/visualization/powerbi/extract.py CHANGED
@@ -1,6 +1,7 @@
+ import json
  import logging
  from collections.abc import Iterable
- from typing import Union
+ from typing import Optional, Union
 
  from ...utils import (
      OUTPUT_DIR,
@@ -12,11 +13,22 @@ from ...utils import (
      write_summary,
  )
  from .assets import PowerBiAsset
- from .client import PowerbiClient, PowerbiCredentials
+ from .client import PowerbiCertificate, PowerbiClient, PowerbiCredentials
 
  logger = logging.getLogger(__name__)
 
 
+ def _load_certificate(
+     certificate: Optional[str],
+ ) -> Optional[PowerbiCertificate]:
+     if not certificate:
+         return None
+
+     with open(certificate) as file:
+         cert = json.load(file)
+         return PowerbiCertificate(**cert)
+
+
  def iterate_all_data(
      client: PowerbiClient,
  ) -> Iterable[tuple[PowerBiAsset, Union[list, dict]]]:
@@ -36,7 +48,15 @@ def extract_all(**kwargs) -> None:
      Store the output files locally under the given output_directory
      """
      _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
-     creds = PowerbiCredentials(**kwargs)
+     creds = PowerbiCredentials(
+         client_id=kwargs.get("client_id"),
+         tenant_id=kwargs.get("tenant_id"),
+         secret=kwargs.get("secret"),
+         certificate=_load_certificate(kwargs.get("certificate")),
+         api_base=kwargs.get("api_base"),
+         login_url=kwargs.get("login_url"),
+         scopes=kwargs.get("scopes"),
+     )
      client = PowerbiClient(creds)
      ts = current_timestamp()
 
castor_extractor/visualization/sigma/assets.py CHANGED
@@ -5,11 +5,11 @@ class SigmaAsset(ExternalAsset):
      """Sigma assets"""
 
      DATAMODELS = "datamodels"
+     DATAMODEL_SOURCES = "datamodel_sources"
      DATASETS = "datasets"
      DATASET_SOURCES = "dataset_sources"
      ELEMENTS = "elements"
      FILES = "files"
-     LINEAGES = "lineages"
      MEMBERS = "members"
      QUERIES = "queries"
      WORKBOOKS = "workbooks"
castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,17 +1,15 @@
  import logging
  from collections.abc import Iterator
- from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from http import HTTPStatus
  from typing import Callable, Iterable, Optional
 
- from pydantic import BaseModel
+ from requests import ReadTimeout
 
  from ....utils import (
      APIClient,
      RequestSafeMode,
      fetch_all_pages,
-     retry,
  )
  from ..assets import SigmaAsset
  from .authentication import SigmaBearerAuth
@@ -55,38 +53,12 @@ SIGMA_SAFE_MODE = RequestSafeMode(
      max_errors=_VOLUME_IGNORED,
      status_codes=_IGNORED_ERROR_CODES,
  )
- SIGMA_SAFE_MODE_LINEAGE = RequestSafeMode(
-     max_errors=_VOLUME_IGNORED,
-     status_codes=(
-         *_IGNORED_ERROR_CODES,
-         HTTPStatus.FORBIDDEN,
-     ),
- )
- _THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
  _RETRY_NUMBER = 1
  _RETRY_BASE_MS = 60_000
 
 
- class LineageContext(BaseModel):
-     """all info needed to build the endpoint for lineage retrieval"""
-
-     workbook_id: str
-     element_id: str
-
-
- class Lineage(BaseModel):
-     """holds response from lineage API and context used to retrieve it"""
-
-     lineage: dict
-     context: LineageContext
-
-
  class SigmaClient(APIClient):
-     def __init__(
-         self,
-         credentials: SigmaCredentials,
-         safe_mode: Optional[RequestSafeMode] = None,
-     ):
+     def __init__(self, credentials: SigmaCredentials):
          auth = SigmaBearerAuth(
              host=credentials.host,
              token_payload=credentials.token_payload,
@@ -96,7 +68,7 @@ class SigmaClient(APIClient):
              auth=auth,
              headers=_SIGMA_HEADERS,
              timeout=_SIGMA_TIMEOUT_S,
-             safe_mode=safe_mode or SIGMA_SAFE_MODE,
+             safe_mode=SIGMA_SAFE_MODE,
          )
 
      def _get_paginated(
@@ -144,6 +116,31 @@ class SigmaClient(APIClient):
          request = self._get_paginated(endpoint=SigmaEndpointFactory.workbooks())
          yield from fetch_all_pages(request, SigmaPagination)
 
+     @staticmethod
+     def _safe_fetch_elements(
+         elements: Iterator[dict],
+         workbook_id: str,
+         page_id: str,
+     ) -> Iterator[dict]:
+         """
+         Safely iterates over elements with ReadTimeout handling. In case of
+         said error, it skips the entire rest of the page.
+         """
+         try:
+             for element in elements:
+                 if element.get("type") not in _DATA_ELEMENTS:
+                     continue
+                 yield {
+                     **element,
+                     "workbook_id": workbook_id,
+                     "page_id": page_id,
+                 }
+         except ReadTimeout:
+             logger.warning(
+                 f"ReadTimeout for page {page_id} in workbook {workbook_id}"
+             )
+             return
+
      def _get_elements_per_page(
          self, page: dict, workbook_id: str
      ) -> Iterator[dict]:
@@ -152,14 +149,7 @@ class SigmaClient(APIClient):
              SigmaEndpointFactory.elements(workbook_id, page_id)
          )
          elements = fetch_all_pages(request, SigmaPagination)
-         for element in elements:
-             if element.get("type") not in _DATA_ELEMENTS:
-                 continue
-             yield {
-                 **element,
-                 "workbook_id": workbook_id,
-                 "page_id": page_id,
-             }
+         yield from self._safe_fetch_elements(elements, workbook_id, page_id)
 
      def _get_all_elements(self, workbooks: list[dict]) -> Iterator[dict]:
          for workbook in workbooks:
@@ -175,68 +165,6 @@ class SigmaClient(APIClient):
                  page=page, workbook_id=workbook_id
              )
 
-     @retry(
-         (ConnectionError,),
-         max_retries=_RETRY_NUMBER,
-         base_ms=_RETRY_BASE_MS,
-         log_exc_info=True,
-     )
-     def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
-         """
-         return the lineage from API and other ids needed to characterize
-         lineage in castor
-         """
-         workbook_id = lineage_context.workbook_id
-         element_id = lineage_context.element_id
-         endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
-         return Lineage(lineage=self._get(endpoint), context=lineage_context)
-
-     @staticmethod
-     def _lineage_context(elements: list[dict]) -> list[LineageContext]:
-         """
-         Helper function to prepare context for lineage retrieval.
-         Elements without associated columns are skipped.
-         """
-         contexts: list[LineageContext] = []
-         for element in elements:
-             if element.get("columns") is None:
-                 continue
-
-             context = LineageContext(
-                 workbook_id=element["workbook_id"],
-                 element_id=element["elementId"],
-             )
-             contexts.append(context)
-         return contexts
-
-     def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
-         """
-         The safe mode is temporarily modified to include 403 errors.
-
-         Due to concurrency issues, we force a refresh of the token in hopes that
-         the lineage extraction takes less than the token expiration time of
-         1 hour.
-         """
-         safe_mode = self._safe_mode
-         self._safe_mode = SIGMA_SAFE_MODE_LINEAGE
-
-         lineage_context = self._lineage_context(elements)
-
-         with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
-             results = executor.map(self._get_lineage, lineage_context)
-
-         for lineage in results:
-             if not lineage.lineage:
-                 continue
-
-             yield {
-                 **lineage.lineage,
-                 "workbook_id": lineage.context.workbook_id,
-                 "element_id": lineage.context.element_id,
-             }
-
-         self._safe_mode = safe_mode
-
      @staticmethod
      def _yield_deduplicated_queries(
          queries: Iterable[dict], workbook_id: str
@@ -266,6 +194,13 @@ class SigmaClient(APIClient):
 
          yield from self._yield_deduplicated_queries(queries, workbook_id)
 
+     def _get_all_datamodel_sources(
+         self, datamodels: list[dict]
+     ) -> Iterator[dict]:
+         yield from SigmaSourcesTransformer(
+             self, table_id_key="tableId"
+         ).get_datamodel_sources(datamodels)
+
      def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
 
@@ -277,14 +212,22 @@ class SigmaClient(APIClient):
      def fetch(
          self,
          asset: SigmaAsset,
+         datamodels: Optional[list[dict]] = None,
          datasets: Optional[list[dict]] = None,
-         elements: Optional[list[dict]] = None,
          workbooks: Optional[list[dict]] = None,
      ) -> Iterator[dict]:
          """Returns the needed metadata for the queried asset"""
          if asset == SigmaAsset.DATAMODELS:
              yield from self._get_all_datamodels()
 
+         elif asset == SigmaAsset.DATAMODEL_SOURCES:
+             if datamodels is None:
+                 raise ValueError(
+                     "Missing data models to extract data model sources"
+                 )
+
+             yield from self._get_all_datamodel_sources(datamodels)
+
          elif asset == SigmaAsset.DATASETS:
              yield from self._get_all_datasets()
 
@@ -303,12 +246,6 @@ class SigmaClient(APIClient):
          elif asset == SigmaAsset.FILES:
              yield from self._get_all_files()
 
-         elif asset == SigmaAsset.LINEAGES:
-             if elements is None:
-                 raise ValueError("Missing elements to extract lineage")
-
-             yield from self._get_all_lineages(elements)
-
          elif asset == SigmaAsset.MEMBERS:
              yield from self._get_all_members()
castor_extractor/visualization/sigma/client/endpoints.py CHANGED
@@ -19,6 +19,10 @@ class SigmaEndpointFactory:
      def datamodels(cls) -> str:
          return f"v2/{cls.DATAMODELS}"
 
+     @classmethod
+     def datamodel_sources(cls, datamodel_id: str) -> str:
+         return f"v2/{cls.DATAMODELS}/{datamodel_id}/sources"
+
      @classmethod
      def datasets(cls) -> str:
          return f"v2/{cls.DATASETS}"
castor_extractor/visualization/sigma/client/pagination.py CHANGED
@@ -10,7 +10,7 @@ SIGMA_QUERIES_PAGINATION_LIMIT = 50
 
 
  class SigmaPagination(PaginationModel):
-     next_page: Optional[str] = "0"
+     next_page: Optional[str] = None
      entries: list = Field(default_factory=list)
 
      model_config = ConfigDict(
@@ -27,3 +27,23 @@ class SigmaPagination(PaginationModel):
 
      def page_results(self) -> list:
          return self.entries
+
+
+ class SigmaTokenPagination(PaginationModel):
+     next_page_token: Optional[str] = ""  # noqa: S105
+     entries: list = Field(default_factory=list)
+
+     model_config = ConfigDict(
+         alias_generator=to_camel,
+         populate_by_name=True,
+         from_attributes=True,
+     )
+
+     def is_last(self) -> bool:
+         return not self.next_page_token
+
+     def next_page_payload(self) -> dict:
+         return {"pageToken": self.next_page_token}
+
+     def page_results(self) -> list:
+         return self.entries
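
Note: a sketch of how the token model ends pagination. Construction by field name relies on populate_by_name=True; whether the PaginationModel base requires further fields is assumed away here:

    page = SigmaTokenPagination(entries=[{"id": 1}], next_page_token="abc")
    assert not page.is_last()
    assert page.next_page_payload() == {"pageToken": "abc"}

    last = SigmaTokenPagination(entries=[])
    assert last.is_last()  # the empty-string default terminates the loop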
castor_extractor/visualization/sigma/client/sources_transformer.py CHANGED
@@ -2,8 +2,9 @@ import logging
  from http import HTTPStatus
  from typing import TYPE_CHECKING, Callable, Iterator
 
- from ....utils import retry_request
+ from ....utils import fetch_all_pages, retry_request
  from .endpoints import SigmaEndpointFactory
+ from .pagination import SigmaTokenPagination
 
  if TYPE_CHECKING:
      from .client import SigmaClient
@@ -17,8 +18,11 @@ SIGMA_CONNECTION_PATH_SLEEP_MS = 30_000  # 30 seconds
  class SigmaSourcesTransformer:
      """Retrieves asset sources and enhances them with additional information."""
 
-     def __init__(self, api_client: "SigmaClient"):
+     def __init__(
+         self, api_client: "SigmaClient", table_id_key: str = "inodeId"
+     ):
          self.api_client = api_client
+         self.table_id_key = table_id_key
 
      @retry_request(
          status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
@@ -38,9 +42,9 @@ class SigmaSourcesTransformer:
          logger.info("Mapping table ids to connection and path information")
 
          unique_table_ids = {
-             source["inodeId"]
+             source[self.table_id_key]
              for asset_sources in all_sources
-             for source in asset_sources["sources"]
+             for source in asset_sources.get("sources", [])
              if source["type"] == "table"
          }
 
@@ -49,15 +53,14 @@ class SigmaSourcesTransformer:
              for table_id in unique_table_ids
          }
 
-     @staticmethod
-     def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+     def _enhance_table_source(self, source: dict, table_to_path: dict) -> dict:
          """
          Combines a single table source with its connection and path information.
          """
          if source["type"] != "table":
              return source
 
-         path_info = table_to_path.get(source["inodeId"], {})
+         path_info = table_to_path.get(source[self.table_id_key], {})
          source["connectionId"] = path_info.get("connectionId")
          source["path"] = path_info.get("path")
          return source
@@ -82,19 +85,35 @@ class SigmaSourcesTransformer:
          }
 
      def _get_all_sources(
-         self, endpoint: Callable[[str], str], asset_ids: set[str]
+         self,
+         endpoint: Callable[[str], str],
+         asset_ids: set[str],
+         with_pagination: bool = False,
      ) -> Iterator[dict]:
          """Returns transformed sources for the given assets"""
          all_sources = []
 
          for asset_id in asset_ids:
-             sources = self.api_client._get(endpoint=endpoint(asset_id))
+             endpoint_url = endpoint(asset_id)
+             if with_pagination:
+                 request = self.api_client._get_paginated(endpoint=endpoint_url)
+                 sources = list(fetch_all_pages(request, SigmaTokenPagination))
+             else:
+                 sources = self.api_client._get(endpoint=endpoint_url)
              all_sources.append({"asset_id": asset_id, "sources": sources})
 
          table_to_path = self._map_table_id_to_connection_path(all_sources)
 
          yield from self._transform_sources(all_sources, table_to_path)
 
+     def get_datamodel_sources(self, datamodels: list[dict]) -> Iterator[dict]:
+         asset_ids = {datamodel["dataModelId"] for datamodel in datamodels}
+         yield from self._get_all_sources(
+             endpoint=SigmaEndpointFactory.datamodel_sources,
+             asset_ids=asset_ids,
+             with_pagination=True,
+         )
+
      def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          asset_ids = {dataset["datasetId"] for dataset in datasets}
          yield from self._get_all_sources(
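
Note: the only behavioural difference between the dataset and datamodel paths is which key names the table id in a source record (shapes below are illustrative, not taken from the Sigma API docs):

    dataset_source = {"type": "table", "inodeId": "tbl-123"}    # default table_id_key
    datamodel_source = {"type": "table", "tableId": "tbl-123"}  # table_id_key="tableId"

    SigmaSourcesTransformer(client)                          # reads "inodeId"
    SigmaSourcesTransformer(client, table_id_key="tableId")  # reads "tableId"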
castor_extractor/visualization/sigma/extract.py CHANGED
@@ -23,8 +23,14 @@ def iterate_all_data(
      """Iterate over the extracted data from Sigma"""
 
      logger.info("Extracting DATA MODELS from API")
-     datamodels = client.fetch(SigmaAsset.DATAMODELS)
-     yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+     datamodels = list(client.fetch(SigmaAsset.DATAMODELS))
+     yield SigmaAsset.DATASETS, deep_serialize(datamodels)
+
+     logger.info("Extracting DATAMODEL SOURCES from API")
+     datamodel_sources = client.fetch(
+         SigmaAsset.DATAMODEL_SOURCES, datamodels=datamodels
+     )
+     yield SigmaAsset.DATAMODEL_SOURCES, list(deep_serialize(datamodel_sources))
 
      logger.info("Extracting DATASETS from API")
      datasets = list(client.fetch(SigmaAsset.DATASETS))
@@ -62,10 +68,6 @@ def iterate_all_data(
      elements = list(client.fetch(SigmaAsset.ELEMENTS, workbooks=workbooks))
      yield SigmaAsset.ELEMENTS, list(deep_serialize(elements))
 
-     logging.info("Extracting LINEAGES data from API")
-     lineages = client.fetch(SigmaAsset.LINEAGES, elements=elements)
-     yield SigmaAsset.LINEAGES, list(deep_serialize(lineages))
-
 
  def extract_all(**kwargs) -> None:
      """
{castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.24.55
+ Version: 0.25.2
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -16,6 +16,7 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Provides-Extra: all
  Provides-Extra: bigquery
+ Provides-Extra: count
  Provides-Extra: databricks
  Provides-Extra: dbt
  Provides-Extra: looker
@@ -57,7 +58,7 @@ Requires-Dist: setuptools (>=78.1)
  Requires-Dist: snowflake-connector-python (>=3.4.0,<4.0.0) ; extra == "snowflake" or extra == "all"
  Requires-Dist: snowflake-sqlalchemy (!=1.2.5,<2.0.0) ; extra == "snowflake" or extra == "all"
  Requires-Dist: sqlalchemy (>=1.4,<1.5)
- Requires-Dist: sqlalchemy-bigquery[bqstorage] (>=1.0.0,<=2.0.0) ; extra == "bigquery" or extra == "all"
+ Requires-Dist: sqlalchemy-bigquery[bqstorage] (>=1.0.0,<=2.0.0) ; extra == "bigquery" or extra == "count" or extra == "all"
  Requires-Dist: sqlalchemy-redshift (>=0.8.14,<0.9.0) ; extra == "redshift" or extra == "all"
  Requires-Dist: tableauserverclient (>=0.25.0,<0.26.0) ; extra == "tableau" or extra == "all"
  Requires-Dist: tqdm (>=4.0.0,<5.0.0)
@@ -215,6 +216,29 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
 
  # Changelog
 
+ ## 0.25.2 - 2025-09-30
+
+ * PowerBi: Support auth with private_key
+
+ ## 0.25.1 - 2025-09-29
+
+ * Sigma: catch ReadTimeouts during elements extraction
+
+ ## 0.25.0 - 2025-09-15
+
+ * Count: adding connector
+
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8
{castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=y8BAidkUDrMoQLEfu3LJLiqxoEUzI5hJZs4CUN_e1H0,20711
+ CHANGELOG.md,sha256=nBloUrrG3Tt7TDnWCZqsNS0x6uIBYG7TFQHoTP8Q8a8,21086
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -7,6 +7,7 @@ castor_extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
  castor_extractor/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  castor_extractor/commands/extract_bigquery.py,sha256=dU4OiYO1V0n32orvZnMh1_xtFKF_VxHNXcVsH3otY-g,1269
  castor_extractor/commands/extract_confluence.py,sha256=blYcnDqywXNKRQ1aZAD9FclhLlO7x8Y_tb0lgl85v0w,1641
+ castor_extractor/commands/extract_count.py,sha256=cITp-2UmPYjbcICvYZzxE9oWieI8NbTH1DcWxLAZxJ4,611
  castor_extractor/commands/extract_databricks.py,sha256=SVKyoa-BBUQAM6HRHf1Wdg9-tpICic2yyvXQwHcNBhA,1264
  castor_extractor/commands/extract_domo.py,sha256=jvAawUsUTHrwCn_koK6StmQr4n_b5GyvJi6uu6WS0SM,1061
  castor_extractor/commands/extract_looker.py,sha256=cySLiolLCgrREJ9d0kMrJ7P8K3efHTBTzShalWVfI3A,1214
@@ -17,7 +18,7 @@ castor_extractor/commands/extract_mode.py,sha256=Q4iO-VAKMg4zFPejhAO-foZibL5Ht3j
  castor_extractor/commands/extract_mysql.py,sha256=7AH5qMzeLTsENCOeJwtesrWg8Vo8MCEq8fx2YT74Mcw,1034
  castor_extractor/commands/extract_notion.py,sha256=uaxcF3_bT7D_-JxnIW0F7VVDphI_ZgOfQQxZzoLXo_M,504
  castor_extractor/commands/extract_postgres.py,sha256=pX0RnCPi4nw6QQ6wiAuZ_Xt3ZbDuMUG9aQKuqFgJtAU,1154
- castor_extractor/commands/extract_powerbi.py,sha256=RKkw9H2ZsbJ4xLE84bmNFUgYUjlrLmSXahQSVrQr_Bc,934
+ castor_extractor/commands/extract_powerbi.py,sha256=tM9fnQaU69zJ7E_uS1S432jprRi9WnpDJdm2NtyLjUg,1242
  castor_extractor/commands/extract_qlik.py,sha256=VBe_xFKh_nR0QSFFIncAaC8yDqBeMa6VunBAga7AeGg,891
  castor_extractor/commands/extract_redshift.py,sha256=zRBg2D_ft4GLdPSdmetRcgQVAA80DXtdRSYsQhAWIik,1334
  castor_extractor/commands/extract_salesforce.py,sha256=3j3YTmMkPAwocR-B1ozJQai0UIZPtpmAyWj-hHvdWn4,1226
@@ -160,6 +161,17 @@ castor_extractor/utils/validation.py,sha256=dRvC9SoFVecVZuLQNN3URq37yX2sBSW3-NxI
  castor_extractor/utils/validation_test.py,sha256=A7P6VmI0kYX2aGIeEN12y7LsY7Kpm8pE4bdVFhbBAMw,1184
  castor_extractor/utils/write.py,sha256=KQVWF29N766avzmSb129IUWrId5c_8BtnYhVLmU6YIs,2133
  castor_extractor/visualization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ castor_extractor/visualization/count/__init__.py,sha256=lvxGtSe3erjTYK0aPnkOyJibcsC6Q1AFchnK-hZt558,114
+ castor_extractor/visualization/count/assets.py,sha256=VZCRVDKWSu6l2lVGJS4JKOOmfCUkbS8MnJiLcAY9vqw,232
+ castor_extractor/visualization/count/client/__init__.py,sha256=YawYDutDI0sprp72jN9tKi8bbXCoc0Ij0Ev582tKjqk,74
+ castor_extractor/visualization/count/client/client.py,sha256=WgljCj8G7D0Brxa0llaeOQ2Ipd7FvtDWFoLWoPyqT9A,1523
+ castor_extractor/visualization/count/client/credentials.py,sha256=LZWvcz7p5lrgdgoIQLcxFyv4gqUBW4Jj4qDKN-VW31I,273
+ castor_extractor/visualization/count/client/queries/canvas_permissions.sql,sha256=iFmMfR0zusjxTxmYUS6p0kibZCsnHOQMbAlxaNjx-H4,108
+ castor_extractor/visualization/count/client/queries/canvases.sql,sha256=Ur5HBD9JJH0r14xIj_rwoctnds082_F931vlfcnwi_I,86
+ castor_extractor/visualization/count/client/queries/cells.sql,sha256=Kkk0jyU337PD6RPshSo_ucLl5PS7kIvJZlUnVnmJUkM,111
+ castor_extractor/visualization/count/client/queries/projects.sql,sha256=3Jem3QCVwk4wHiWRJL7cN6Vl2Yc5RZ8yC8ndvPAkaFM,68
+ castor_extractor/visualization/count/client/queries/users.sql,sha256=H0n7S7P5cCAWbgPxU32psIc1epXySzsAaQ7MQ9JrkfM,102
+ castor_extractor/visualization/count/extract.py,sha256=ZBsJ9tMxxaq1jG8qJp_OGVK3yPDNkVUsP1_3rcUMtYg,1378
  castor_extractor/visualization/domo/__init__.py,sha256=1axOCPm4RpdIyUt9LQEvlMvbOPllW8rk63h6EjVgJ0Y,111
  castor_extractor/visualization/domo/assets.py,sha256=bK1urFR2tnlWkVkkhR32mAKMoKbESNlop-CNGx-65PY,206
  castor_extractor/visualization/domo/client/__init__.py,sha256=Do0fU4B8Hhlhahcv734gnJl_ryCztfTBDea7XNCKfB8,72
@@ -236,16 +248,16 @@ castor_extractor/visualization/mode/errors.py,sha256=SKpFT2AiLOuWx2VRLyO7jbAiKcG
  castor_extractor/visualization/mode/extract.py,sha256=PmLWWjUwplQh3TNMemiGwyFdxMcKVMvumZPxSMLJAwk,1625
  castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE3FABQlvcfY2W8fuc6DEjY,197
  castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
- castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
- castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
+ castor_extractor/visualization/powerbi/client/__init__.py,sha256=rxWeAtmGsy1XYn2oIrGz5rIlxcTrzh2rl1V-MGxFOY4,175
+ castor_extractor/visualization/powerbi/client/authentication.py,sha256=1pST-w7ceqrcKSccQSJBxT4lAsLU8keceSVJro1dg8k,1516
  castor_extractor/visualization/powerbi/client/client.py,sha256=Q_WHYGFpHT4wJ6nZvJa96nBVcpUGv7E2WnyZHBftsJM,8340
  castor_extractor/visualization/powerbi/client/client_test.py,sha256=zWgfc8fOHSRn3hxiX8ujJysmNHeypIoKin9h8_h178k,6668
  castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
- castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
+ castor_extractor/visualization/powerbi/client/credentials.py,sha256=Mqb9e9jbJrawE00xvLyej1i4tFM8VNiRnA0LpfqORd0,1565
  castor_extractor/visualization/powerbi/client/credentials_test.py,sha256=TzFqxsWVQ3sXR_n0bJsexK9Uz7ceXCEPVqDGWTJzW60,993
  castor_extractor/visualization/powerbi/client/endpoints.py,sha256=38ZETzSSnNq3vA9O6nLZQ8T1BVE01R9CjMC03-PRXsM,1911
  castor_extractor/visualization/powerbi/client/pagination.py,sha256=OZMjoDQPRGMoWd9QcKKrPh3aErJR20SHlrTqY_siLkk,755
- castor_extractor/visualization/powerbi/extract.py,sha256=Z5KbqMhMnqjWcnzged2G1-Gf6GYWJobTL9_TpAdgb8o,1309
+ castor_extractor/visualization/powerbi/extract.py,sha256=bZOUbciWGPNRRrtcMezSdoeClHB2yiBATBC8UqoXz5M,1904
  castor_extractor/visualization/qlik/__init__.py,sha256=u6lIfm_WOykBwt6SlaB7C0Dtx37XBliUbM5oWv26gC8,177
  castor_extractor/visualization/qlik/assets.py,sha256=Ab_kG61mHcK8GoGZbfQW7RSWyd7D9bVga9DOqnm0iSE,1625
  castor_extractor/visualization/qlik/client/__init__.py,sha256=5O5N9Jrt3d99agFEJ28lKWs2KkDaXK-lZ07IUtLj56M,130
@@ -270,17 +282,17 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
  castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
  castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
- castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
+ castor_extractor/visualization/sigma/assets.py,sha256=iVZqi7XtNgSOVXy0jgeHZonVOeXi7jyikor8ztbECBc,398
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
  castor_extractor/visualization/sigma/client/authentication.py,sha256=gHukrpfboIjZc_O9CcuDtrl6U-StH0J73VY2J74Bm9o,2279
- castor_extractor/visualization/sigma/client/client.py,sha256=De0xWJfUssfrwzyMNh8D2IIouUQzcS0qLUQrUYtjVkY,10827
+ castor_extractor/visualization/sigma/client/client.py,sha256=SxSf5OjdDr8x-WZDezm8YNOw01R6CCoYIgW0od0ZgN8,8907
  castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
- castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
- castor_extractor/visualization/sigma/client/pagination.py,sha256=1yLpCNps5FnDiPcXCcgHu23cxg15Gfc6FvE3AJleb2c,728
- castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=n-5mZWSvzfTwpM5VP_bwlcxcaAwCKEEbpMCG_1KRVP4,3748
+ castor_extractor/visualization/sigma/client/endpoints.py,sha256=by9VIFml2whlzQT66f2m56RYBsqPrWdAmIP4JkTaBV4,1799
+ castor_extractor/visualization/sigma/client/pagination.py,sha256=9kCYQpO7hAH2qvYmnVjnGVUDLkpkEM6BgYlv-JTY8AE,1241
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=2f7REl70wYitopftMtYQU-E8kISVck67i7rGYgf3tkk,4552
  castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
- castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
+ castor_extractor/visualization/sigma/extract.py,sha256=iRmRUzSnq_ObG9fxpOI5Rs07EKKT-VRLcyiti5-8D4c,2986
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
  castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
  castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -434,8 +446,8 @@ castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=MAlnTis43E3Amu1e1Oz
  castor_extractor/warehouse/sqlserver/queries/view_ddl.sql,sha256=9rynvx6MWg3iZzrWPB7haZfVKEPkxulzryE2g19x804,315
  castor_extractor/warehouse/sqlserver/query.py,sha256=c8f7_SEMR17DhbtzuYphWqWDQ0sCRy-nR442RRBZVYw,1773
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.24.55.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.24.55.dist-info/METADATA,sha256=MhFCdByqa4_T7A4-Mb96-ISq07W6BP7M-RHgjSfI8iY,28172
- castor_extractor-0.24.55.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.24.55.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
- castor_extractor-0.24.55.dist-info/RECORD,,
+ castor_extractor-0.25.2.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.25.2.dist-info/METADATA,sha256=Lh6TLvQYvBJ0wL4ST5GXkpGX4DUaZzNsThF9ZiBCOzk,28588
+ castor_extractor-0.25.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.25.2.dist-info/entry_points.txt,sha256=qyTrKNByoq2HYi1xbA79OU7qxg-OWPvle8VwDqt-KnE,1869
+ castor_extractor-0.25.2.dist-info/RECORD,,
{castor_extractor-0.24.55.dist-info → castor_extractor-0.25.2.dist-info}/entry_points.txt RENAMED
@@ -1,6 +1,7 @@
  [console_scripts]
  castor-extract-bigquery=castor_extractor.commands.extract_bigquery:main
  castor-extract-confluence=castor_extractor.commands.extract_confluence:main
+ castor-extract-count=castor_extractor.commands.extract_count:main
  castor-extract-databricks=castor_extractor.commands.extract_databricks:main
  castor-extract-domo=castor_extractor.commands.extract_domo:main
  castor-extract-looker=castor_extractor.commands.extract_looker:main