castor-extractor 0.24.55__py3-none-any.whl → 0.24.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
  # Changelog
 
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8

castor_extractor/visualization/sigma/assets.py CHANGED
@@ -5,11 +5,11 @@ class SigmaAsset(ExternalAsset):
      """Sigma assets"""
 
      DATAMODELS = "datamodels"
+     DATAMODEL_SOURCES = "datamodel_sources"
      DATASETS = "datasets"
      DATASET_SOURCES = "dataset_sources"
      ELEMENTS = "elements"
      FILES = "files"
-     LINEAGES = "lineages"
      MEMBERS = "members"
      QUERIES = "queries"
      WORKBOOKS = "workbooks"

castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,17 +1,13 @@
  import logging
  from collections.abc import Iterator
- from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from http import HTTPStatus
  from typing import Callable, Iterable, Optional
 
- from pydantic import BaseModel
-
  from ....utils import (
      APIClient,
      RequestSafeMode,
      fetch_all_pages,
-     retry,
  )
  from ..assets import SigmaAsset
  from .authentication import SigmaBearerAuth
@@ -55,38 +51,12 @@ SIGMA_SAFE_MODE = RequestSafeMode(
      max_errors=_VOLUME_IGNORED,
      status_codes=_IGNORED_ERROR_CODES,
  )
- SIGMA_SAFE_MODE_LINEAGE = RequestSafeMode(
-     max_errors=_VOLUME_IGNORED,
-     status_codes=(
-         *_IGNORED_ERROR_CODES,
-         HTTPStatus.FORBIDDEN,
-     ),
- )
- _THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
  _RETRY_NUMBER = 1
  _RETRY_BASE_MS = 60_000
 
 
- class LineageContext(BaseModel):
-     """all info needed to build the endpoint for lineage retrieval"""
-
-     workbook_id: str
-     element_id: str
-
-
- class Lineage(BaseModel):
-     """holds response from lineage API and context used to retrieve it"""
-
-     lineage: dict
-     context: LineageContext
-
-
  class SigmaClient(APIClient):
-     def __init__(
-         self,
-         credentials: SigmaCredentials,
-         safe_mode: Optional[RequestSafeMode] = None,
-     ):
+     def __init__(self, credentials: SigmaCredentials):
          auth = SigmaBearerAuth(
              host=credentials.host,
              token_payload=credentials.token_payload,
@@ -96,7 +66,7 @@ class SigmaClient(APIClient):
              auth=auth,
              headers=_SIGMA_HEADERS,
             timeout=_SIGMA_TIMEOUT_S,
-             safe_mode=safe_mode or SIGMA_SAFE_MODE,
+             safe_mode=SIGMA_SAFE_MODE,
          )
 
      def _get_paginated(
@@ -175,68 +145,6 @@ class SigmaClient(APIClient):
              page=page, workbook_id=workbook_id
          )
 
-     @retry(
-         (ConnectionError,),
-         max_retries=_RETRY_NUMBER,
-         base_ms=_RETRY_BASE_MS,
-         log_exc_info=True,
-     )
-     def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
-         """
-         return the lineage from API and other ids needed to characterize
-         lineage in castor
-         """
-         workbook_id = lineage_context.workbook_id
-         element_id = lineage_context.element_id
-         endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
-         return Lineage(lineage=self._get(endpoint), context=lineage_context)
-
-     @staticmethod
-     def _lineage_context(elements: list[dict]) -> list[LineageContext]:
-         """
-         Helper function to prepare context for lineage retrieval.
-         Elements without associated columns are skipped.
-         """
-         contexts: list[LineageContext] = []
-         for element in elements:
-             if element.get("columns") is None:
-                 continue
-
-             context = LineageContext(
-                 workbook_id=element["workbook_id"],
-                 element_id=element["elementId"],
-             )
-             contexts.append(context)
-         return contexts
-
-     def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
-         """
-         The safe mode is temporarily modified to include 403 errors.
-
-         Due to concurrency issues, we force a refresh of the token in hopes that
-         the lineage extraction takes less than the token expiration time of
-         1 hour.
-         """
-         safe_mode = self._safe_mode
-         self._safe_mode = SIGMA_SAFE_MODE_LINEAGE
-
-         lineage_context = self._lineage_context(elements)
-
-         with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
-             results = executor.map(self._get_lineage, lineage_context)
-
-         for lineage in results:
-             if not lineage.lineage:
-                 continue
-
-             yield {
-                 **lineage.lineage,
-                 "workbook_id": lineage.context.workbook_id,
-                 "element_id": lineage.context.element_id,
-             }
-
-         self._safe_mode = safe_mode
-
      @staticmethod
      def _yield_deduplicated_queries(
          queries: Iterable[dict], workbook_id: str
@@ -266,6 +174,13 @@ class SigmaClient(APIClient):
 
          yield from self._yield_deduplicated_queries(queries, workbook_id)
 
+     def _get_all_datamodel_sources(
+         self, datamodels: list[dict]
+     ) -> Iterator[dict]:
+         yield from SigmaSourcesTransformer(
+             self, table_id_key="tableId"
+         ).get_datamodel_sources(datamodels)
+
      def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
 
@@ -277,14 +192,22 @@ class SigmaClient(APIClient):
      def fetch(
          self,
          asset: SigmaAsset,
+         datamodels: Optional[list[dict]] = None,
          datasets: Optional[list[dict]] = None,
-         elements: Optional[list[dict]] = None,
          workbooks: Optional[list[dict]] = None,
      ) -> Iterator[dict]:
          """Returns the needed metadata for the queried asset"""
          if asset == SigmaAsset.DATAMODELS:
              yield from self._get_all_datamodels()
 
+         elif asset == SigmaAsset.DATAMODEL_SOURCES:
+             if datamodels is None:
+                 raise ValueError(
+                     "Missing data models to extract data model sources"
+                 )
+
+             yield from self._get_all_datamodel_sources(datamodels)
+
          elif asset == SigmaAsset.DATASETS:
              yield from self._get_all_datasets()
 
@@ -303,12 +226,6 @@ class SigmaClient(APIClient):
          elif asset == SigmaAsset.FILES:
              yield from self._get_all_files()
 
-         elif asset == SigmaAsset.LINEAGES:
-             if elements is None:
-                 raise ValueError("Missing elements to extract lineage")
-
-             yield from self._get_all_lineages(elements)
-
          elif asset == SigmaAsset.MEMBERS:
              yield from self._get_all_members()
 
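
As orientation for the reworked `fetch` signature above: data models must be fetched and materialised before their sources can be requested, otherwise `fetch` raises `ValueError`. A minimal, hypothetical driver (assuming `SigmaClient` and `SigmaAsset` are imported from the sigma modules shown in this diff):

```python
def dump_datamodel_sources(client: "SigmaClient") -> list[dict]:
    """Hypothetical helper: list data models first, then fetch their sources."""
    # fetch() raises ValueError for DATAMODEL_SOURCES when `datamodels` is missing,
    # so the data models are materialised before the second call.
    datamodels = list(client.fetch(SigmaAsset.DATAMODELS))
    return list(
        client.fetch(SigmaAsset.DATAMODEL_SOURCES, datamodels=datamodels)
    )
```

This mirrors the ordering used in `extract.py` further down in the diff.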

castor_extractor/visualization/sigma/client/endpoints.py CHANGED
@@ -19,6 +19,10 @@ class SigmaEndpointFactory:
      def datamodels(cls) -> str:
          return f"v2/{cls.DATAMODELS}"
 
+     @classmethod
+     def datamodel_sources(cls, datamodel_id: str) -> str:
+         return f"v2/{cls.DATAMODELS}/{datamodel_id}/sources"
+
      @classmethod
      def datasets(cls) -> str:
          return f"v2/{cls.DATASETS}"
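
For illustration, the path produced by the new `datamodel_sources` classmethod, assuming `SigmaEndpointFactory.DATAMODELS` holds the literal `"datamodels"` (as the existing `datamodels()` endpoint suggests); the id is a placeholder:

```python
# Sketch only: shows the URL shape built by the new factory method.
endpoint = SigmaEndpointFactory.datamodel_sources("a1b2c3")
assert endpoint == "v2/datamodels/a1b2c3/sources"
```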

castor_extractor/visualization/sigma/client/pagination.py CHANGED
@@ -10,7 +10,7 @@ SIGMA_QUERIES_PAGINATION_LIMIT = 50
 
 
  class SigmaPagination(PaginationModel):
-     next_page: Optional[str] = "0"
+     next_page: Optional[str] = None
      entries: list = Field(default_factory=list)
 
      model_config = ConfigDict(
@@ -27,3 +27,23 @@ class SigmaPagination(PaginationModel):
 
      def page_results(self) -> list:
          return self.entries
+
+
+ class SigmaTokenPagination(PaginationModel):
+     next_page_token: Optional[str] = ""  # noqa: S105
+     entries: list = Field(default_factory=list)
+
+     model_config = ConfigDict(
+         alias_generator=to_camel,
+         populate_by_name=True,
+         from_attributes=True,
+     )
+
+     def is_last(self) -> bool:
+         return not self.next_page_token
+
+     def next_page_payload(self) -> dict:
+         return {"pageToken": self.next_page_token}
+
+     def page_results(self) -> list:
+         return self.entries
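
The `SigmaPagination` change (a default `next_page` of `None` instead of `"0"`) and the new token-based `SigmaTokenPagination` correspond to the changelog's "fix pagination" entry. The actual paging loop lives in the shared `fetch_all_pages` utility, whose internals are not part of this diff; the sketch below only illustrates the contract the new model encodes, with `call` standing in for a single paginated GET request:

```python
from typing import Callable, Iterator


def iterate_token_pages(call: Callable[[dict], dict]) -> Iterator[dict]:
    """Sketch of token pagination as encoded by SigmaTokenPagination:
    keep requesting with {"pageToken": <nextPageToken>} until the token is empty."""
    payload: dict = {}
    while True:
        # camelCase keys (nextPageToken, entries) are accepted via the to_camel alias
        page = SigmaTokenPagination(**call(payload))
        yield from page.page_results()
        if page.is_last():  # empty or missing nextPageToken ends the loop
            return
        payload = page.next_page_payload()
```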

castor_extractor/visualization/sigma/client/sources_transformer.py CHANGED
@@ -2,8 +2,9 @@ import logging
  from http import HTTPStatus
  from typing import TYPE_CHECKING, Callable, Iterator
 
- from ....utils import retry_request
+ from ....utils import fetch_all_pages, retry_request
  from .endpoints import SigmaEndpointFactory
+ from .pagination import SigmaTokenPagination
 
  if TYPE_CHECKING:
      from .client import SigmaClient
@@ -17,8 +18,11 @@ SIGMA_CONNECTION_PATH_SLEEP_MS = 30_000  # 30 seconds
  class SigmaSourcesTransformer:
      """Retrieves asset sources and enhances them with additional information."""
 
-     def __init__(self, api_client: "SigmaClient"):
+     def __init__(
+         self, api_client: "SigmaClient", table_id_key: str = "inodeId"
+     ):
          self.api_client = api_client
+         self.table_id_key = table_id_key
 
      @retry_request(
          status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
@@ -38,9 +42,9 @@ class SigmaSourcesTransformer:
          logger.info("Mapping table ids to connection and path information")
 
          unique_table_ids = {
-             source["inodeId"]
+             source[self.table_id_key]
              for asset_sources in all_sources
-             for source in asset_sources["sources"]
+             for source in asset_sources.get("sources", [])
              if source["type"] == "table"
          }
 
@@ -49,15 +53,14 @@ class SigmaSourcesTransformer:
              for table_id in unique_table_ids
          }
 
-     @staticmethod
-     def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+     def _enhance_table_source(self, source: dict, table_to_path: dict) -> dict:
          """
         Combines a single table source with its connection and path information.
          """
          if source["type"] != "table":
              return source
 
-         path_info = table_to_path.get(source["inodeId"], {})
+         path_info = table_to_path.get(source[self.table_id_key], {})
          source["connectionId"] = path_info.get("connectionId")
          source["path"] = path_info.get("path")
          return source
@@ -82,19 +85,35 @@ class SigmaSourcesTransformer:
          }
 
      def _get_all_sources(
-         self, endpoint: Callable[[str], str], asset_ids: set[str]
+         self,
+         endpoint: Callable[[str], str],
+         asset_ids: set[str],
+         with_pagination: bool = False,
      ) -> Iterator[dict]:
          """Returns transformed sources for the given assets"""
          all_sources = []
 
          for asset_id in asset_ids:
-             sources = self.api_client._get(endpoint=endpoint(asset_id))
+             endpoint_url = endpoint(asset_id)
+             if with_pagination:
+                 request = self.api_client._get_paginated(endpoint=endpoint_url)
+                 sources = list(fetch_all_pages(request, SigmaTokenPagination))
+             else:
+                 sources = self.api_client._get(endpoint=endpoint_url)
              all_sources.append({"asset_id": asset_id, "sources": sources})
 
          table_to_path = self._map_table_id_to_connection_path(all_sources)
 
          yield from self._transform_sources(all_sources, table_to_path)
 
+     def get_datamodel_sources(self, datamodels: list[dict]) -> Iterator[dict]:
+         asset_ids = {datamodel["dataModelId"] for datamodel in datamodels}
+         yield from self._get_all_sources(
+             endpoint=SigmaEndpointFactory.datamodel_sources,
+             asset_ids=asset_ids,
+             with_pagination=True,
+         )
+
      def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          asset_ids = {dataset["datasetId"] for dataset in datasets}
          yield from self._get_all_sources(

castor_extractor/visualization/sigma/extract.py CHANGED
@@ -23,8 +23,14 @@ def iterate_all_data(
      """Iterate over the extracted data from Sigma"""
 
      logger.info("Extracting DATA MODELS from API")
-     datamodels = client.fetch(SigmaAsset.DATAMODELS)
-     yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+     datamodels = list(client.fetch(SigmaAsset.DATAMODELS))
+     yield SigmaAsset.DATASETS, deep_serialize(datamodels)
+
+     logger.info("Extracting DATAMODEL SOURCES from API")
+     datamodel_sources = client.fetch(
+         SigmaAsset.DATAMODEL_SOURCES, datamodels=datamodels
+     )
+     yield SigmaAsset.DATAMODEL_SOURCES, list(deep_serialize(datamodel_sources))
 
      logger.info("Extracting DATASETS from API")
      datasets = list(client.fetch(SigmaAsset.DATASETS))
@@ -62,10 +68,6 @@
      elements = list(client.fetch(SigmaAsset.ELEMENTS, workbooks=workbooks))
      yield SigmaAsset.ELEMENTS, list(deep_serialize(elements))
 
-     logging.info("Extracting LINEAGES data from API")
-     lineages = client.fetch(SigmaAsset.LINEAGES, elements=elements)
-     yield SigmaAsset.LINEAGES, list(deep_serialize(lineages))
-
 
  def extract_all(**kwargs) -> None:
      """

METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.24.55
+ Version: 0.24.57
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -215,6 +215,17 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
 
  # Changelog
 
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8

RECORD CHANGED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=y8BAidkUDrMoQLEfu3LJLiqxoEUzI5hJZs4CUN_e1H0,20711
+ CHANGELOG.md,sha256=-WezbaTjM4tDXii_RVXSYDz39xuZYqWUsabdyqoh2Kc,20889
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -270,17 +270,17 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
  castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
  castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
- castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
+ castor_extractor/visualization/sigma/assets.py,sha256=iVZqi7XtNgSOVXy0jgeHZonVOeXi7jyikor8ztbECBc,398
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
  castor_extractor/visualization/sigma/client/authentication.py,sha256=gHukrpfboIjZc_O9CcuDtrl6U-StH0J73VY2J74Bm9o,2279
- castor_extractor/visualization/sigma/client/client.py,sha256=De0xWJfUssfrwzyMNh8D2IIouUQzcS0qLUQrUYtjVkY,10827
+ castor_extractor/visualization/sigma/client/client.py,sha256=uUEZoTa1WU5bJEjOrgzWqSiJMKgbru5HPBEPazyu1Hc,8272
  castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
- castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
- castor_extractor/visualization/sigma/client/pagination.py,sha256=1yLpCNps5FnDiPcXCcgHu23cxg15Gfc6FvE3AJleb2c,728
- castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=n-5mZWSvzfTwpM5VP_bwlcxcaAwCKEEbpMCG_1KRVP4,3748
+ castor_extractor/visualization/sigma/client/endpoints.py,sha256=by9VIFml2whlzQT66f2m56RYBsqPrWdAmIP4JkTaBV4,1799
+ castor_extractor/visualization/sigma/client/pagination.py,sha256=9kCYQpO7hAH2qvYmnVjnGVUDLkpkEM6BgYlv-JTY8AE,1241
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=2f7REl70wYitopftMtYQU-E8kISVck67i7rGYgf3tkk,4552
  castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
- castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
+ castor_extractor/visualization/sigma/extract.py,sha256=iRmRUzSnq_ObG9fxpOI5Rs07EKKT-VRLcyiti5-8D4c,2986
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
  castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
  castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -434,8 +434,8 @@ castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=MAlnTis43E3Amu1e1Oz
  castor_extractor/warehouse/sqlserver/queries/view_ddl.sql,sha256=9rynvx6MWg3iZzrWPB7haZfVKEPkxulzryE2g19x804,315
  castor_extractor/warehouse/sqlserver/query.py,sha256=c8f7_SEMR17DhbtzuYphWqWDQ0sCRy-nR442RRBZVYw,1773
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.24.55.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.24.55.dist-info/METADATA,sha256=MhFCdByqa4_T7A4-Mb96-ISq07W6BP7M-RHgjSfI8iY,28172
- castor_extractor-0.24.55.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.24.55.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
- castor_extractor-0.24.55.dist-info/RECORD,,
+ castor_extractor-0.24.57.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.24.57.dist-info/METADATA,sha256=uSN01JxGlu1gIF4bpBnZtHM3tLQKfU9qT0uimCqtrjI,28350
+ castor_extractor-0.24.57.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.24.57.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+ castor_extractor-0.24.57.dist-info/RECORD,,