castor-extractor 0.24.36__py3-none-any.whl → 0.24.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

Files changed (31)
  1. CHANGELOG.md +18 -0
  2. castor_extractor/commands/upload.py +10 -0
  3. castor_extractor/uploader/__init__.py +1 -0
  4. castor_extractor/uploader/constant.py +5 -1
  5. castor_extractor/uploader/enums.py +8 -0
  6. castor_extractor/uploader/settings.py +2 -0
  7. castor_extractor/uploader/upload.py +26 -5
  8. castor_extractor/uploader/upload_test.py +5 -3
  9. castor_extractor/visualization/powerbi/client/client.py +2 -1
  10. castor_extractor/visualization/sigma/assets.py +3 -0
  11. castor_extractor/visualization/sigma/client/client.py +42 -5
  12. castor_extractor/visualization/sigma/client/endpoints.py +17 -0
  13. castor_extractor/visualization/sigma/client/sources_transformer.py +94 -0
  14. castor_extractor/visualization/sigma/client/sources_transformer_test.py +101 -0
  15. castor_extractor/visualization/sigma/extract.py +17 -1
  16. castor_extractor/warehouse/databricks/client.py +4 -5
  17. castor_extractor/warehouse/databricks/client_test.py +2 -1
  18. castor_extractor/warehouse/databricks/queries/column_lineage.sql +25 -0
  19. castor_extractor/warehouse/databricks/queries/table_lineage.sql +23 -0
  20. castor_extractor/warehouse/databricks/sql_client.py +14 -11
  21. castor_extractor/warehouse/sqlserver/extract.py +3 -1
  22. castor_extractor/warehouse/sqlserver/queries/column.sql +3 -3
  23. castor_extractor/warehouse/sqlserver/queries/schema.sql +7 -2
  24. castor_extractor/warehouse/sqlserver/queries/table.sql +1 -1
  25. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA +19 -1
  26. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD +29 -26
  27. castor_extractor/warehouse/databricks/lineage.py +0 -69
  28. castor_extractor/warehouse/databricks/lineage_test.py +0 -89
  29. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/LICENCE +0 -0
  30. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/WHEEL +0 -0
  31. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/entry_points.txt +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
  # Changelog

+ ## 0.24.40 - 2025-08-18
+
+ * SQLServer: fix database allowlist/blocklist filtering
+
+ ## 0.24.39 - 2025-08-18
+
+ * Databricks:
+ * Fix vanishing owner ID column for tables
+ * Deduplicate lineage with SQL to reduce memory use
+
+ ## 0.24.38 - 2025-08-07
+
+ * Uploader: Support US and EU zones
+
+ ## 0.24.37 - 2025-08-06
+
+ * Sigma: extract data models, dataset sources and workbook sources
+
  ## 0.24.36 - 2025-08-04

  * Sigma:
castor_extractor/commands/upload.py CHANGED
@@ -3,6 +3,7 @@ from argparse import ArgumentParser

  from castor_extractor.uploader import ( # type: ignore
  FileType,
+ Zone,
  upload_any,
  )
  from castor_extractor.utils import parse_filled_arguments # type: ignore
@@ -40,6 +41,15 @@ def _args() -> ArgumentParser:
  ),
  choices=supported_file_type,
  )
+ supported_zones = [zone.value for zone in Zone]
+ parser.add_argument(
+ "-z",
+ "--zone",
+ help="geographic zone to upload, currently supported are {}, defaults to EU".format(
+ supported_zones,
+ ),
+ choices=supported_zones,
+ )
  return parser


castor_extractor/uploader/__init__.py CHANGED
@@ -1,2 +1,3 @@
  from .constant import FileType
+ from .enums import Zone
  from .upload import upload, upload_any, upload_manifest
castor_extractor/uploader/constant.py CHANGED
@@ -1,9 +1,13 @@
  from enum import Enum

  from ..utils import RetryStrategy
+ from .enums import Zone

  # url of the gcs proxy
- INGEST_URL = "https://ingest.castordoc.com"
+ INGEST_URLS = {
+ Zone.EU: "https://ingest.castordoc.com",
+ Zone.US: "https://ingest.us.castordoc.com",
+ }

  RETRY_BASE_MS = 10_000
  RETRY_JITTER_MS = 1_000
castor_extractor/uploader/enums.py ADDED
@@ -0,0 +1,8 @@
+ from enum import Enum
+
+
+ class Zone(Enum):
+ """Geographic cluster location"""
+
+ EU = "EU"
+ US = "US"
castor_extractor/uploader/settings.py CHANGED
@@ -4,6 +4,7 @@ from pydantic import UUID4, Field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  from .constant import FileType
+ from .enums import Zone

  UPLOADER_ENV_PREFIX = "CASTOR_UPLOADER_"

@@ -22,3 +23,4 @@ class UploaderSettings(BaseSettings):
  file_type: FileType
  source_id: UUID4
  token: str = Field(repr=False)
+ zone: Optional[Zone] = Zone.EU
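Since `UploaderSettings` is a pydantic `BaseSettings` model with the `CASTOR_UPLOADER_` prefix, the new field should also be settable from the environment. A hedged sketch, assuming the standard pydantic-settings prefix-plus-field-name mapping (the exact model config is not shown in this hunk):

```python
import os

# Hypothetical: with env_prefix "CASTOR_UPLOADER_", the `zone` field would be
# read from CASTOR_UPLOADER_ZONE; left unset, it falls back to the Zone.EU default.
os.environ["CASTOR_UPLOADER_ZONE"] = "US"
```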
castor_extractor/uploader/upload.py CHANGED
@@ -10,13 +10,14 @@ import requests

  from ..utils.retry import retry
  from .constant import (
- INGEST_URL,
+ INGEST_URLS,
  PATH_TEMPLATES,
  RETRY_BASE_MS,
  RETRY_JITTER_MS,
  RETRY_STRATEGY,
  FileType,
  )
+ from .enums import Zone
  from .env import get_blob_env
  from .settings import UploaderSettings
  from .utils import iter_files
@@ -33,6 +34,7 @@ def _path_and_url(
  source_id: UUID,
  file_type: FileType,
  file_path: str,
+ zone: Zone,
  ) -> tuple[str, str]:
  now = datetime.utcnow()
  timestamp = int(now.timestamp())
@@ -44,7 +46,7 @@
  filename=filename,
  )

- url = f"{INGEST_URL}/{path}"
+ url = f"{INGEST_URLS[zone]}/{path}"

  return path, url

@@ -61,13 +63,16 @@ def _upload(
  source_id: UUID,
  file_path: str,
  file_type: FileType,
+ zone: Optional[Zone] = Zone.EU,
  ) -> None:
  """
  Upload the given file to Google Cloud Storage (GCS)
  - Don't call GCS API directly
  - Call the ingestion proxy which handles authorisation and uploading
  """
- path, url = _path_and_url(source_id, file_type, file_path)
+ if not zone:
+ zone = Zone.EU
+ path, url = _path_and_url(source_id, file_type, file_path, zone)
  headers = _headers(token)
  timeout, max_retries = get_blob_env()

@@ -97,6 +102,7 @@ def _upload(
  def upload_manifest(
  token: str,
  source_id: UUID,
+ zone: Optional[Zone],
  file_path: Optional[str] = None,
  ) -> None:
  """
@@ -106,13 +112,20 @@
  """
  if not file_path:
  raise ValueError("file path is needed to upload a manifest")
- _upload(token, source_id, file_path, FileType.DBT)
+ _upload(
+ token=token,
+ source_id=source_id,
+ file_path=file_path,
+ file_type=FileType.DBT,
+ zone=zone,
+ )


  def upload(
  token: str,
  source_id: UUID,
  file_type: FileType,
+ zone: Optional[Zone],
  file_path: Optional[str] = None,
  directory_path: Optional[str] = None,
  ) -> None:
@@ -133,7 +146,13 @@
  raise ValueError(message)

  for file_ in files:
- _upload(token, source_id, file_, file_type)
+ _upload(
+ token=token,
+ source_id=source_id,
+ file_path=file_,
+ file_type=file_type,
+ zone=zone,
+ )


  def upload_any(**kwargs) -> None:
@@ -156,6 +175,7 @@ def upload_any(**kwargs) -> None:
  token=settings.token,
  source_id=settings.source_id,
  file_path=settings.file_path,
+ zone=settings.zone,
  )
  return None

@@ -165,4 +185,5 @@
  file_type=file_type,
  file_path=settings.file_path,
  directory_path=settings.directory_path,
+ zone=settings.zone,
  )
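Taken together, the uploader changes thread the new `Zone` from settings and CLI down to URL construction, with `None` falling back to `Zone.EU`. A minimal sketch of that selection logic, reusing the enum values and URLs from the hunks above (the helper name `_ingest_url_for` is hypothetical):

```python
from enum import Enum
from typing import Optional


class Zone(Enum):
    """Geographic cluster location (copied from uploader/enums.py above)."""

    EU = "EU"
    US = "US"


INGEST_URLS = {
    Zone.EU: "https://ingest.castordoc.com",
    Zone.US: "https://ingest.us.castordoc.com",
}


def _ingest_url_for(zone: Optional[Zone]) -> str:
    # Mirrors the fallback added in _upload(): a missing zone means EU.
    return INGEST_URLS[zone or Zone.EU]


assert _ingest_url_for(None) == "https://ingest.castordoc.com"
assert _ingest_url_for(Zone.US) == "https://ingest.us.castordoc.com"
```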
castor_extractor/uploader/upload_test.py CHANGED
@@ -1,6 +1,7 @@
  from uuid import UUID

- from .constant import INGEST_URL, FileType
+ from .constant import INGEST_URLS, FileType
+ from .enums import Zone
  from .upload import _path_and_url


@@ -8,7 +9,8 @@ def test__path():
  source_id = UUID("399a8b22-3187-11ec-8d3d-0242ac130003")
  file_type = FileType.VIZ
  file_path = "filename"
+ zone = Zone.EU

- path, url = _path_and_url(source_id, file_type, file_path)
+ path, url = _path_and_url(source_id, file_type, file_path, zone)
  assert path == f"visualization-{source_id}/{file_path}"
- assert url == f"{INGEST_URL}/{path}"
+ assert url == f"{INGEST_URLS[Zone.EU]}/{path}"
castor_extractor/visualization/powerbi/client/client.py CHANGED
@@ -28,6 +28,7 @@ POWERBI_DEFAULT_TIMEOUT_S = 30
  METADATA_BATCH_SIZE = 100
  POWERBI_SCAN_STATUS_DONE = "Succeeded"
  POWERBI_SCAN_SLEEP_S = 1
+ POWERBI_SCAN_TIMEOUT_S = 60

  MAX_RETRY_PAGES = 1
  RETRY_PAGES_TIMEOUT_MS = 35 * 1000 # 35 seconds
@@ -142,7 +143,7 @@ class PowerbiClient(APIClient):
  endpoint = self.endpoint_factory.metadata_scan_status(scan_id)
  total_waiting_time_s = 0

- while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
+ while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
  try:
  result = self._get(endpoint)
  except HTTPError as e:
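The dedicated `POWERBI_SCAN_TIMEOUT_S` gives the scan-status polling loop its own 60-second budget instead of reusing the 30-second request timeout. A rough sketch of the polling pattern implied by the hunk above; `get_status` is a stand-in for the real `self._get(endpoint)` call, not the client's actual API:

```python
import time
from typing import Callable

POWERBI_SCAN_SLEEP_S = 1
POWERBI_SCAN_TIMEOUT_S = 60
POWERBI_SCAN_STATUS_DONE = "Succeeded"


def wait_for_scan(get_status: Callable[[], str]) -> bool:
    """Polls until the scan reports success or the time budget runs out."""
    total_waiting_time_s = 0
    while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
        if get_status() == POWERBI_SCAN_STATUS_DONE:
            return True
        time.sleep(POWERBI_SCAN_SLEEP_S)
        total_waiting_time_s += POWERBI_SCAN_SLEEP_S
    return False
```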
castor_extractor/visualization/sigma/assets.py CHANGED
@@ -4,10 +4,13 @@ from ...types import ExternalAsset
  class SigmaAsset(ExternalAsset):
  """Sigma assets"""

+ DATAMODELS = "datamodels"
  DATASETS = "datasets"
+ DATASET_SOURCES = "dataset_sources"
  ELEMENTS = "elements"
  FILES = "files"
  LINEAGES = "lineages"
  MEMBERS = "members"
  QUERIES = "queries"
  WORKBOOKS = "workbooks"
+ WORKBOOK_SOURCES = "workbook_sources"
castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  from collections.abc import Iterator
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
@@ -24,6 +25,9 @@ from .pagination import (
  SIGMA_QUERIES_PAGINATION_LIMIT,
  SigmaPagination,
  )
+ from .sources_transformer import SigmaSourcesTransformer
+
+ logger = logging.getLogger(__name__)

  _CONTENT_TYPE = "application/x-www-form-urlencoded"

@@ -135,6 +139,12 @@ class SigmaClient(APIClient):
  params={"limit": limit},
  )

+ def _get_all_datamodels(self) -> Iterator[dict]:
+ request = self._get_paginated(
+ endpoint=SigmaEndpointFactory.datamodels()
+ )
+ yield from fetch_all_pages(request, SigmaPagination)
+
  def _get_all_datasets(self) -> Iterator[dict]:
  request = self._get_paginated(endpoint=SigmaEndpointFactory.datasets())
  yield from fetch_all_pages(request, SigmaPagination)
@@ -275,18 +285,36 @@ class SigmaClient(APIClient):

  yield from self._yield_deduplicated_queries(queries, workbook_id)

+ def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+ yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
+
+ def _get_all_workbook_sources(
+ self, workbooks: list[dict]
+ ) -> Iterator[dict]:
+ yield from SigmaSourcesTransformer(self).get_workbook_sources(workbooks)
+
  def fetch(
  self,
  asset: SigmaAsset,
- workbooks: Optional[list[dict]] = None,
+ datasets: Optional[list[dict]] = None,
  elements: Optional[list[dict]] = None,
+ workbooks: Optional[list[dict]] = None,
  ) -> Iterator[dict]:
  """Returns the needed metadata for the queried asset"""
- if asset == SigmaAsset.DATASETS:
+ if asset == SigmaAsset.DATAMODELS:
+ yield from self._get_all_datamodels()
+
+ elif asset == SigmaAsset.DATASETS:
  yield from self._get_all_datasets()

+ elif asset == SigmaAsset.DATASET_SOURCES:
+ if datasets is None:
+ raise ValueError("Missing datasets to extract dataset sources")
+
+ yield from self._get_all_dataset_sources(datasets)
+
  elif asset == SigmaAsset.ELEMENTS:
- if not workbooks:
+ if workbooks is None:
  raise ValueError("Missing workbooks to extract elements")

  yield from self._get_all_elements(workbooks)
@@ -295,15 +323,16 @@
  yield from self._get_all_files()

  elif asset == SigmaAsset.LINEAGES:
- if not elements:
+ if elements is None:
  raise ValueError("Missing elements to extract lineage")
+
  yield from self._get_all_lineages(elements)

  elif asset == SigmaAsset.MEMBERS:
  yield from self._get_all_members()

  elif asset == SigmaAsset.QUERIES:
- if not workbooks:
+ if workbooks is None:
  raise ValueError("Missing workbooks to extract queries")

  yield from self._get_all_queries(workbooks)
@@ -311,5 +340,13 @@
  elif asset == SigmaAsset.WORKBOOKS:
  yield from self._get_all_workbooks()

+ elif asset == SigmaAsset.WORKBOOK_SOURCES:
+ if workbooks is None:
+ raise ValueError(
+ "Missing workbooks to extract workbook sources"
+ )
+
+ yield from self._get_all_workbook_sources(workbooks)
+
  else:
  raise ValueError(f"This asset {asset} is unknown")
castor_extractor/visualization/sigma/client/endpoints.py CHANGED
@@ -1,6 +1,7 @@
  class SigmaEndpointFactory:
  """Wrapper class around all endpoints we're using"""

+ DATAMODELS = "dataModels"
  DATASETS = "datasets"
  FILES = "files"
  MEMBERS = "members"
@@ -10,10 +11,22 @@ class SigmaEndpointFactory:
  def authentication(cls) -> str:
  return "v2/auth/token"

+ @classmethod
+ def connection_path(cls, inode_id: str) -> str:
+ return f"v2/connections/paths/{inode_id}"
+
+ @classmethod
+ def datamodels(cls) -> str:
+ return f"v2/{cls.DATAMODELS}"
+
  @classmethod
  def datasets(cls) -> str:
  return f"v2/{cls.DATASETS}"

+ @classmethod
+ def dataset_sources(cls, dataset_id: str) -> str:
+ return f"v2/{cls.DATASETS}/{dataset_id}/sources"
+
  @classmethod
  def elements(cls, workbook_id: str, page_id: str) -> str:
  return f"v2/{cls.WORKBOOKS}/{workbook_id}/pages/{page_id}/elements"
@@ -41,3 +54,7 @@
  @classmethod
  def workbooks(cls) -> str:
  return f"v2/{cls.WORKBOOKS}"
+
+ @classmethod
+ def workbook_sources(cls, workbook_id: str) -> str:
+ return f"v2/{cls.WORKBOOKS}/{workbook_id}/sources"
castor_extractor/visualization/sigma/client/sources_transformer.py ADDED
@@ -0,0 +1,94 @@
+ import logging
+ from typing import TYPE_CHECKING, Callable, Iterator
+
+ from .endpoints import SigmaEndpointFactory
+
+ if TYPE_CHECKING:
+ from .client import SigmaClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class SigmaSourcesTransformer:
+ """Retrieves asset sources and enhances them with additional information."""
+
+ def __init__(self, api_client: "SigmaClient"):
+ self.api_client = api_client
+
+ def _map_table_id_to_connection_path(
+ self, all_sources: list
+ ) -> dict[str, dict]:
+ """Maps a table id to its connection and path information."""
+ logger.info("Mapping table ids to connection and path information")
+
+ unique_table_ids = {
+ source["inodeId"]
+ for asset_sources in all_sources
+ for source in asset_sources["sources"]
+ if source["type"] == "table"
+ }
+
+ return {
+ table_id: self.api_client._get(
+ endpoint=SigmaEndpointFactory.connection_path(table_id)
+ )
+ for table_id in unique_table_ids
+ }
+
+ @staticmethod
+ def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+ """
+ Combines a single table source with its connection and path information.
+ """
+ if source["type"] != "table":
+ return source
+
+ path_info = table_to_path.get(source["inodeId"], {})
+ source["connectionId"] = path_info.get("connectionId")
+ source["path"] = path_info.get("path")
+ return source
+
+ def _transform_sources(
+ self, all_sources: list, table_to_path: dict
+ ) -> Iterator[dict]:
+ """
+ Yields all sources, with table sources being enhanced with additional information.
+ """
+ logger.info("Merging sources with table information")
+
+ for asset_sources in all_sources:
+ enhanced_sources = [
+ self._enhance_table_source(source, table_to_path)
+ for source in asset_sources["sources"]
+ ]
+
+ yield {
+ "asset_id": asset_sources["asset_id"],
+ "sources": enhanced_sources,
+ }
+
+ def _get_all_sources(
+ self, endpoint: Callable[[str], str], asset_ids: set[str]
+ ) -> Iterator[dict]:
+ """Returns transformed sources for the given assets"""
+ all_sources = []
+
+ for asset_id in asset_ids:
+ sources = self.api_client._get(endpoint=endpoint(asset_id))
+ all_sources.append({"asset_id": asset_id, "sources": sources})
+
+ table_to_path = self._map_table_id_to_connection_path(all_sources)
+
+ yield from self._transform_sources(all_sources, table_to_path)
+
+ def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+ asset_ids = {dataset["datasetId"] for dataset in datasets}
+ yield from self._get_all_sources(
+ endpoint=SigmaEndpointFactory.dataset_sources, asset_ids=asset_ids
+ )
+
+ def get_workbook_sources(self, workbooks: list[dict]) -> Iterator[dict]:
+ asset_ids = {workbook["workbookId"] for workbook in workbooks}
+ yield from self._get_all_sources(
+ endpoint=SigmaEndpointFactory.workbook_sources, asset_ids=asset_ids
+ )
castor_extractor/visualization/sigma/client/sources_transformer_test.py ADDED
@@ -0,0 +1,101 @@
+ from unittest.mock import Mock
+
+ from .sources_transformer import SigmaSourcesTransformer
+
+ _ALL_SOURCES = [
+ {
+ "asset_id": "asset1",
+ "sources": [
+ {"type": "dataset", "inodeId": "1234"}, # non-table source
+ {"type": "table", "inodeId": "table1"},
+ {"type": "table", "inodeId": "table2"},
+ ],
+ },
+ {
+ "asset_id": "asset2",
+ "sources": [
+ {"type": "table", "inodeId": "table1"}, # repeated source
+ ],
+ },
+ ]
+
+
+ _TABLE_TO_PATH = {
+ "table1": {
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ },
+ "table2": {
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ },
+ }
+
+
+ def test__map_table_id_to_connection_path():
+ transformer = SigmaSourcesTransformer(api_client=Mock())
+
+ def mock_get(endpoint):
+ if "table1" in endpoint:
+ return _TABLE_TO_PATH["table1"]
+ elif "table2" in endpoint:
+ return _TABLE_TO_PATH["table2"]
+ else:
+ raise ValueError(f"Unexpected endpoint: {endpoint}")
+
+ transformer.api_client._get.side_effect = mock_get
+
+ result = transformer._map_table_id_to_connection_path(_ALL_SOURCES)
+
+ assert len(result) == 2
+ assert result["table1"] == {
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ }
+ assert result["table2"] == {
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ }
+ assert transformer.api_client._get.call_count == 2
+
+
+ def test__transform_sources():
+ transformer = SigmaSourcesTransformer(api_client=Mock())
+
+ result = list(transformer._transform_sources(_ALL_SOURCES, _TABLE_TO_PATH))
+
+ assert len(result) == 2
+
+ asset_1_results = result[0]
+ assert len(asset_1_results["sources"]) == 3
+ actual_sources = sorted(
+ asset_1_results["sources"], key=lambda x: x["inodeId"]
+ )
+ expected_sources = [
+ {"type": "dataset", "inodeId": "1234"},
+ {
+ "type": "table",
+ "inodeId": "table1",
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ },
+ {
+ "type": "table",
+ "inodeId": "table2",
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ },
+ ]
+ expected_sources = sorted(expected_sources, key=lambda x: x["inodeId"])
+ assert actual_sources == expected_sources
+
+ asset_2_results = result[1]
+ assert asset_2_results["asset_id"] == "asset2"
+ assert asset_2_results["sources"] == [
+ {
+ "type": "table",
+ "inodeId": "table1",
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ }
+ ]
castor_extractor/visualization/sigma/extract.py CHANGED
@@ -22,14 +22,30 @@ def iterate_all_data(
  ) -> Iterable[tuple[SigmaAsset, Union[list, Iterator, dict]]]:
  """Iterate over the extracted data from Sigma"""

+ logger.info("Extracting DATA MODELS from API")
+ datamodels = client.fetch(SigmaAsset.DATAMODELS)
+ yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+
  logger.info("Extracting DATASETS from API")
- datasets = client.fetch(SigmaAsset.DATASETS)
+ datasets = list(client.fetch(SigmaAsset.DATASETS))
  yield SigmaAsset.DATASETS, list(deep_serialize(datasets))

+ logger.info("Extracting DATASET SOURCES from API")
+ dataset_sources = client.fetch(
+ SigmaAsset.DATASET_SOURCES, datasets=datasets
+ )
+ yield SigmaAsset.DATASET_SOURCES, list(deep_serialize(dataset_sources))
+
  logger.info("Extracting WORKBOOKS from API")
  workbooks = list(client.fetch(SigmaAsset.WORKBOOKS))
  yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbooks))

+ logger.info("Extracting WORKBOOK SOURCES from API")
+ workbook_sources = client.fetch(
+ SigmaAsset.WORKBOOK_SOURCES, workbooks=workbooks
+ )
+ yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbook_sources))
+
  logger.info("Extracting FILES from API")
  files = client.fetch(SigmaAsset.FILES)
  yield SigmaAsset.FILES, list(deep_serialize(files))
castor_extractor/warehouse/databricks/client.py CHANGED
@@ -46,12 +46,11 @@ class DatabricksClient:

  @staticmethod
  def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
+ """Matches the table's owner email to an ID, or None if not found."""
  table_owner_email = table.get("owner_email")
- if not table_owner_email:
- return table
- owner_external_id = user_mapping.get(table_owner_email)
- if not owner_external_id:
- return table
+ owner_external_id = (
+ user_mapping.get(table_owner_email) if table_owner_email else None
+ )
  return {**table, "owner_external_id": owner_external_id}

  @staticmethod
castor_extractor/warehouse/databricks/client_test.py CHANGED
@@ -36,5 +36,6 @@ def test_DatabricksClient__match_table_with_user():
  assert table_with_owner == {**table, "owner_external_id": 3}

  table_without_owner = {"id": 1, "owner_email": None}
+ expected = {"id": 1, "owner_email": None, "owner_external_id": None}
  actual = client._match_table_with_user(table_without_owner, user_mapping)
- assert actual == table_without_owner
+ assert actual == expected
castor_extractor/warehouse/databricks/queries/column_lineage.sql ADDED
@@ -0,0 +1,25 @@
+ /*
+ Selects all column lineage events for the given day.
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+ */
+ WITH deduplicated_lineage AS (
+ SELECT *,
+ ROW_NUMBER() OVER (
+ PARTITION BY source_table_full_name, source_column_name, target_table_full_name, target_column_name
+ ORDER BY event_time DESC
+ ) AS rank
+ FROM system.access.column_lineage
+ WHERE
+ TRUE
+ AND event_date = DATE('{day}')
+ AND source_table_full_name IS NOT NULL
+ AND source_column_name IS NOT NULL
+ AND target_table_full_name IS NOT NULL
+ AND target_column_name IS NOT NULL
+ AND CONCAT(source_table_full_name, '.', source_column_name) != CONCAT(target_table_full_name, '.', target_column_name)
+ )
+ SELECT *
+ FROM deduplicated_lineage
+ WHERE rank = 1
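The `ROW_NUMBER()`/`rank = 1` pattern keeps, for each (source, target) pair, only the most recent lineage event, and the `WHERE` clause drops self-lineage and incomplete rows; this moves the deduplication previously done in memory by `LineageProcessor` (removed further down) into the warehouse. For illustration only, a small Python equivalent of the table-level deduplication:

```python
def dedup_latest(rows: list[dict]) -> list[dict]:
    """Keep only the most recent event per (source, target) pair."""
    latest: dict[tuple, dict] = {}
    for row in rows:
        key = (row["source_table_full_name"], row["target_table_full_name"])
        if key not in latest or row["event_time"] > latest[key]["event_time"]:
            latest[key] = row
    return list(latest.values())
```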
castor_extractor/warehouse/databricks/queries/table_lineage.sql ADDED
@@ -0,0 +1,23 @@
+ /*
+ Selects all table lineage events for the given day.
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+ */
+ WITH deduplicated_lineage AS (
+ SELECT *,
+ ROW_NUMBER() OVER (
+ PARTITION BY source_table_full_name, target_table_full_name
+ ORDER BY event_time DESC
+ ) AS rank
+ FROM system.access.table_lineage
+ WHERE
+ TRUE
+ AND event_date = DATE('{day}')
+ AND source_table_full_name IS NOT NULL
+ AND target_table_full_name IS NOT NULL
+ AND source_table_full_name != target_table_full_name
+ )
+ SELECT *
+ FROM deduplicated_lineage
+ WHERE rank = 1
castor_extractor/warehouse/databricks/sql_client.py CHANGED
@@ -4,20 +4,25 @@ from datetime import date

  from databricks import sql # type: ignore

+ from ...utils import load_file
  from .credentials import DatabricksCredentials
  from .enums import LineageEntity, TagEntity
  from .format import TagMapping
- from .lineage import valid_lineage
  from .utils import build_path, tag_label

  logger = logging.getLogger(__name__)

  _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"

- _LINEAGE_SQL_TPL = """
- SELECT * FROM system.access.{table_name}
- WHERE event_date = DATE('{day}')
- """
+ _LINEAGE_SQL_PATHS = {
+ LineageEntity.COLUMN: "queries/column_lineage.sql",
+ LineageEntity.TABLE: "queries/table_lineage.sql",
+ }
+
+
+ def _load_lineage_query(lineage_entity: LineageEntity) -> str:
+ filename = _LINEAGE_SQL_PATHS[lineage_entity]
+ return load_file(filename, __file__)


  class DatabricksSQLClient:
@@ -95,13 +100,11 @@ class DatabricksSQLClient:
  Unfortunately, passing parameters is not always supported. We have to
  format the query beforehand and pass it as plain text for execution.
  """
- table_name = f"{lineage_entity.value.lower()}_lineage"
- query = _LINEAGE_SQL_TPL.format(
- table_name=table_name,
- day=day,
- )
+ query_template = _load_lineage_query(lineage_entity)
+ query = query_template.format(day=day)
+
  result = self.execute_sql(query)
  data = []
  for row in result:
  data.append(row.asDict())
- return valid_lineage(data, lineage_entity)
+ return data
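The lineage queries now live in standalone `.sql` files and are formatted with the extraction day before execution. A minimal sketch of that load-and-format step, assuming the templates are read relative to the module (the real code delegates this to the package's `load_file` helper, whose exact behaviour is not shown here):

```python
from datetime import date
from pathlib import Path

_QUERIES_DIR = Path(__file__).parent / "queries"


def build_lineage_query(entity: str, day: date) -> str:
    # entity is "table" or "column"; the {day} placeholder is filled in
    # because parameter binding is not always supported by the connector.
    template = (_QUERIES_DIR / f"{entity}_lineage.sql").read_text()
    return template.format(day=day)
```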
castor_extractor/warehouse/sqlserver/extract.py CHANGED
@@ -52,7 +52,9 @@ def extract_all(**kwargs) -> None:
  client = MSSQLClient(credentials=_credentials(kwargs))

  databases = filter_items(
- client.get_databases(), kwargs.get("allowed"), kwargs.get("blocked")
+ items=client.get_databases(),
+ allowed=kwargs.get("db_allowed"),
+ blocked=kwargs.get("db_blocked"),
  )

  query_builder = MSSQLQueryBuilder(
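The fix passes the database allowlist and blocklist under their `db_`-prefixed keys so they actually reach the filter. A rough stand-in for the allow/block semantics this feeds into (the real `filter_items` lives in the package's utils and may differ in its details):

```python
from typing import Iterable, Optional


def filter_items_sketch(
    items: Iterable[str],
    allowed: Optional[list[str]] = None,
    blocked: Optional[list[str]] = None,
) -> list[str]:
    # Keep only allowed items (if an allowlist is given), then drop blocked ones.
    kept = [item for item in items if not allowed or item in allowed]
    return [item for item in kept if not blocked or item not in blocked]


assert filter_items_sketch(["db1", "db2", "db3"], allowed=["db1", "db2"], blocked=["db2"]) == ["db1"]
```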
castor_extractor/warehouse/sqlserver/queries/column.sql CHANGED
@@ -91,9 +91,9 @@ columns AS (
  LEFT JOIN column_ids AS i
  ON
  (
- c.table_name = i.table_name
- AND c.table_schema = i.schema_name
- AND c.column_name = i.column_name
+ c.table_name COLLATE DATABASE_DEFAULT = i.table_name COLLATE DATABASE_DEFAULT
+ AND c.table_schema COLLATE DATABASE_DEFAULT = i.schema_name COLLATE DATABASE_DEFAULT
+ AND c.column_name COLLATE DATABASE_DEFAULT = i.column_name COLLATE DATABASE_DEFAULT
  )
  )

castor_extractor/warehouse/sqlserver/queries/schema.sql CHANGED
@@ -1,4 +1,9 @@
- -- Fetch database information
+ /*
+ Fetch database information
+
+ Collation is a set of rules that defines how text data is stored and compared, and it can differ between databases.
+ The "COLLATE DATABASE_DEFAULT" is to ensure that text is compared with the same collation.
+ */
  WITH ids AS (
  SELECT DISTINCT
  table_catalog,
@@ -19,4 +24,4 @@ INNER JOIN ids AS i
  LEFT JOIN {database}.sys.sysusers AS u
  ON s.principal_id = u.uid
  LEFT JOIN {database}.sys.databases AS d
- ON i.table_catalog = d.name
+ ON i.table_catalog COLLATE DATABASE_DEFAULT = d.name COLLATE DATABASE_DEFAULT
castor_extractor/warehouse/sqlserver/queries/table.sql CHANGED
@@ -92,7 +92,7 @@ meta AS (
  FROM
  {database}.information_schema.tables AS t
  LEFT JOIN {database}.sys.databases AS db
- ON t.table_catalog = db.name
+ ON t.table_catalog COLLATE DATABASE_DEFAULT = db.name COLLATE DATABASE_DEFAULT
  )

  SELECT
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.24.36
+ Version: 0.24.40
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -215,6 +215,24 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp

  # Changelog

+ ## 0.24.40 - 2025-08-18
+
+ * SQLServer: fix database allowlist/blocklist filtering
+
+ ## 0.24.39 - 2025-08-18
+
+ * Databricks:
+ * Fix vanishing owner ID column for tables
+ * Deduplicate lineage with SQL to reduce memory use
+
+ ## 0.24.38 - 2025-08-07
+
+ * Uploader: Support US and EU zones
+
+ ## 0.24.37 - 2025-08-06
+
+ * Sigma: extract data models, dataset sources and workbook sources
+
  ## 0.24.36 - 2025-08-04

  * Sigma:
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=HAHFgRYnv-pbsKwbHrRCrWoLpsqr8mg7Fp7tDsBsN9E,19030
+ CHANGELOG.md,sha256=tgZkN-SNTMCro37DG0nW91MaD6ZnHM9VWWZG2-7TP68,19406
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -29,7 +29,7 @@ castor_extractor/commands/extract_strategy.py,sha256=Q-pUymatPrBFGXobhyUPzFph0-t
  castor_extractor/commands/extract_tableau.py,sha256=LNtI29LbVk1vp4RNrn89GmdW6R_7QBYunRmkowDhbco,1982
  castor_extractor/commands/extract_thoughtspot.py,sha256=caAYJlH-vK7u5IUB6OKXxcaWfLgc7d_XqnFDWK6YNS4,639
  castor_extractor/commands/file_check.py,sha256=TJx76Ymd0QCECmq35zRJMkPE8DJtSInB28MuSXWk8Ao,2644
- castor_extractor/commands/upload.py,sha256=rLXp7gQ8zb1kLbho4FT87q8eJd8Gvo_TkyIynAaQ-4s,1342
+ castor_extractor/commands/upload.py,sha256=sqpEF_qqCNvT_niIrM6jPhzLaFVjtYwpc2iZw540F20,1633
  castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
  castor_extractor/file_checker/column.py,sha256=6bJhcW1snYwgHKkqlS0Ak7XLHZr4YBwO46JCIlnQNKg,3086
  castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -86,13 +86,14 @@ castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZ
  castor_extractor/transformation/dbt/client_test.py,sha256=RLL7y_pLDv2QBM03qBht8yYEooeT_woRADHcb8vgBQ4,4535
  castor_extractor/transformation/dbt/credentials.py,sha256=pGq7GqFQTw9TwN1DXSHC-0yJ2H6B_wMAbHyQTLqJVh0,543
  castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
- castor_extractor/uploader/__init__.py,sha256=A4bq_SrEtKAsl0r_D_duSTvL5WIQjVfsMy7tDx9IKg0,87
- castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
+ castor_extractor/uploader/__init__.py,sha256=xe3QHmHb35TILEhr7__nI_0t0tDolpQuujUyd84YcjI,111
+ castor_extractor/uploader/constant.py,sha256=ZmQtFx9nnR0GSLZ9k41upzV3ub4FJCUIyojIEVh-qIg,956
+ castor_extractor/uploader/enums.py,sha256=s5KVeBZWRDbDu-qOnrJhTSkSqzh0gxv0W1Z4cUsXfb8,109
  castor_extractor/uploader/env.py,sha256=5KiWHV-WTHfF68T_vzI-ypKAxzy9b9fnz2y4T3lH6QY,871
  castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
- castor_extractor/uploader/settings.py,sha256=3MvOX-UFRqrLZoiT7wYn9jUGro7NX4RCafYzrXrLQtA,590
- castor_extractor/uploader/upload.py,sha256=PSQfkO_7LSE0WBo9Tm_hlS2ONepKeB0cBFdJXySnues,4310
- castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
+ castor_extractor/uploader/settings.py,sha256=sUZpg9eHemM99DMrBW8bnlMuoTmCmLCKq-D0OCuQbGA,649
+ castor_extractor/uploader/upload.py,sha256=b2g9vWWjXWbt8Ms7brTc7OK_I7Z-1VSibNbppGoB2oQ,4764
+ castor_extractor/uploader/upload_test.py,sha256=UgN7TnT9Chn6KVzRcAX0Tuvp7-tps3ugxGitlgb9TSY,462
  castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
  castor_extractor/utils/__init__.py,sha256=z_BdKTUyuug3I5AzCuSGrAVskfLax4_olfORIjhZw_M,1691
  castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283
@@ -237,7 +238,7 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
  castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
  castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
  castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
- castor_extractor/visualization/powerbi/client/client.py,sha256=CWCYmj2spYin74qq9T8v2ZJ5TcxBuEy5EjArhCVZjLM,8141
+ castor_extractor/visualization/powerbi/client/client.py,sha256=9PRckoGdjfhOjhf5yqWTuNdivXcOC2PMgvcx-3uCh3k,8166
  castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
  castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
  castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
@@ -269,14 +270,16 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
  castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
  castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
- castor_extractor/visualization/sigma/assets.py,sha256=JZ1Cpxnml8P3mIJoTUM57hvylB18ErECQXaP5FF63O4,268
+ castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
- castor_extractor/visualization/sigma/client/client.py,sha256=ifCxhZ8-p9u7MnJRE8EYF_YP_G3REr_PELTSrtHiZwk,10099
+ castor_extractor/visualization/sigma/client/client.py,sha256=VU0BHlug3tCpGA1je0PjEy4hU4TKhCH9UUGi8LRmNy8,11422
  castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
- castor_extractor/visualization/sigma/client/endpoints.py,sha256=DBFphbgoH78_MZUGM_bKBAq28Nl7LWSZ6VRsbxrxtDg,1162
+ castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
  castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
- castor_extractor/visualization/sigma/extract.py,sha256=XIT1qsj6g6dgBWP8HPfj_medZexu48EaY9tUwi14gzM,2298
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=mRupzxjtjDqELIouHF0egBkgslDmn5Y4uqO_sbUGCNs,3244
+ castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
+ castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
  castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
  castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -334,18 +337,18 @@ castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXk
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
  castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
  castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
- castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
- castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
+ castor_extractor/warehouse/databricks/client.py,sha256=LzpeVQIOYi_QTfdOHbK6SB4SgxhZ7p9TNxh0Iwfz850,3307
+ castor_extractor/warehouse/databricks/client_test.py,sha256=dqEdEAt-6e8CtQ7M2L5vDYkn4JvOjqyqZSFEpQ55WRc,1432
  castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
  castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
  castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
  castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
  castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
  castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
- castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
- castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
  castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
- castor_extractor/warehouse/databricks/sql_client.py,sha256=BchHMNqHPtZsJWhj2XYq3QVVTj3XfKhzhhPTJng8vXo,3656
+ castor_extractor/warehouse/databricks/queries/column_lineage.sql,sha256=Q8MAZ5N3fNcolTMtRRw2fIrbKgV4ax9StgJgtYMpxNQ,980
+ castor_extractor/warehouse/databricks/queries/table_lineage.sql,sha256=5k5jHj11SdGpfMqJEKJihAhd_ngO4kZOZJ8TCPihWDs,786
+ castor_extractor/warehouse/databricks/sql_client.py,sha256=oypv_2pomoleXUJJhS8CSKO_ucalQhS9_mcsnsb5wsc,3750
  castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
  castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
  castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
@@ -419,17 +422,17 @@ castor_extractor/warehouse/snowflake/queries/view_ddl.sql,sha256=eWsci_50cxiYIv3
  castor_extractor/warehouse/snowflake/query.py,sha256=C2LTdPwBzMQ_zMncg0Kq4_WkoY7K9as5tvxBDrIOlwI,1763
  castor_extractor/warehouse/sqlserver/__init__.py,sha256=PdOuYznmvKAbfWAm8UdN47MfEsd9jqPi_dDi3WEo1KY,116
  castor_extractor/warehouse/sqlserver/client.py,sha256=Bjfpw96IKAQfWPiU5SZYEDfetwfkqZrnKbQYoStcnZc,2007
- castor_extractor/warehouse/sqlserver/extract.py,sha256=-LoHY5wAGJk4vutrO3N0_PaRqts7rkEn7pADRHzoxiI,2249
+ castor_extractor/warehouse/sqlserver/extract.py,sha256=GbOlSq8JR6HaJZunkfiRxaSt0pbgazQjF8GpgqWWIcU,2294
  castor_extractor/warehouse/sqlserver/queries/.sqlfluff,sha256=yy0KQdz8I_67vnXyX8eeWwOWkxTXvHyVKSVwhURktd8,48
- castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=_K5OS63N7fM7kGPudnnjJEnIyaxR1xE2hoZgnJ_A3p8,2763
+ castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=eRILCgdygYRvtfSdxaswIiIYKW-PiJXW2qi3yHtrfns,2913
  castor_extractor/warehouse/sqlserver/queries/database.sql,sha256=4dPeBCn85MEOXr1f-DPXxiI3RvvoE_1n8lsbTs26E0I,150
- castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=UR3eTiYw7Iq5-GukelnNg_uq6haZ_dwg_SedZfOWUoA,619
- castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=4RgeSkHDWTWRyU2iLxaBR0KuSwIBvb3GbQGdkJYXbn0,2787
+ castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=Zp4G86FJ_Be8Zqvdlu7K8DqmsUL62kxbwaUk5asZ0V4,881
+ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=YwFhHc6rGbszqQt7Izh7EngVwrrBoEZ9kniuWXNtGco,2837
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
  castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.24.36.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.24.36.dist-info/METADATA,sha256=m14Hk_AYJo9_bZE7IOb6U_LdhG8JfXnVqisiJHjgMS4,26483
- castor_extractor-0.24.36.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.24.36.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
- castor_extractor-0.24.36.dist-info/RECORD,,
+ castor_extractor-0.24.40.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.24.40.dist-info/METADATA,sha256=ONg1SCc3gcrOJqBE92EtyfQctf-hRxI_u2VUbBpvgVA,26859
+ castor_extractor-0.24.40.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.24.40.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+ castor_extractor-0.24.40.dist-info/RECORD,,
castor_extractor/warehouse/databricks/lineage.py DELETED
@@ -1,69 +0,0 @@
- from typing import Iterable, Optional
-
- from .enums import LineageEntity
-
-
- class LineageProcessor:
- """
- helper class that handles lineage deduplication and filtering
- """
-
- def __init__(self, lineage_entity: LineageEntity):
- self.lineage_entity = lineage_entity
-
- self.lineage: dict[tuple[str, str], dict] = dict()
-
- def _parent_path(self, link) -> Optional[str]:
- if self.lineage_entity == LineageEntity.TABLE:
- return link["source_table_full_name"]
-
- source_table = link["source_table_full_name"]
- source_column = link["source_column_name"]
- if not (source_table and source_column):
- return None
-
- return f"{source_table}.{source_column}"
-
- def _child_path(self, link) -> Optional[str]:
- if self.lineage_entity == LineageEntity.TABLE:
- return link["target_table_full_name"]
-
- target_table = link["target_table_full_name"]
- target_column = link["target_column_name"]
- if not (target_table and target_column):
- return None
-
- return f"{target_table}.{target_column}"
-
- def add(self, link: dict) -> None:
- """
- If the parent and child paths are valid, keeps the most recent lineage
- link in the `self.lineage` map.
- """
- parent = self._parent_path(link)
- child = self._child_path(link)
- timestamp = link["event_time"]
-
- if not (parent and child and parent != child):
- return
-
- key = (parent, child)
- if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
- return
-
- self.lineage[key] = link
-
-
- def valid_lineage(
- lineage: Iterable[dict], lineage_entity: LineageEntity
- ) -> list[dict]:
- """
- Filters out self-lineage or lineage with a missing source or target path,
- then deduplicates by picking the link with the most recent event timestmap.
- """
- deduplicated_lineage = LineageProcessor(lineage_entity)
-
- for link in lineage:
- deduplicated_lineage.add(link)
-
- return list(deduplicated_lineage.lineage.values())
castor_extractor/warehouse/databricks/lineage_test.py DELETED
@@ -1,89 +0,0 @@
- from .enums import LineageEntity
- from .lineage import LineageProcessor, valid_lineage
-
- _OLDER_DATE = "2025-01-01 00:00:01.0"
- _CLOSER_DATE = "2025-01-01 02:02:02.0"
-
- _TABLE_LINEAGES = [
- {
- "source_table_full_name": "a.b.source",
- "target_table_full_name": "a.b.target",
- "event_time": _CLOSER_DATE,
- "other": "more recent stuff",
- },
- {
- "source_table_full_name": "a.b.source",
- "target_table_full_name": "a.b.target",
- "event_time": _OLDER_DATE,
- "other": "stuff that's too old",
- },
- {
- "source_table_full_name": "no target",
- "target_table_full_name": None,
- "event_time": _CLOSER_DATE,
- },
- {
- "source_table_full_name": None,
- "target_table_full_name": "no source",
- "event_time": _CLOSER_DATE,
- },
- ]
-
-
- _COLUMN_LINEAGES = [
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "src_col",
- "target_table_full_name": "a.b.target",
- "target_column_name": "trgt_col",
- "event_time": _OLDER_DATE,
- "other": "old stuff",
- },
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "src_col",
- "target_table_full_name": "a.b.target",
- "target_column_name": "trgt_col",
- "event_time": _CLOSER_DATE,
- "other": "newer stuff",
- },
- {
- "source_table_full_name": "a.b.toto",
- "source_column_name": "toto_col",
- "target_table_full_name": "a.b.tata",
- "target_column_name": "tata_col",
- "event_time": _OLDER_DATE,
- },
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "a.b.source",
- "target_table_full_name": None,
- "target_column_name": None,
- "event_time": _CLOSER_DATE,
- },
- ]
-
-
- def test_valid_lineage():
- table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
-
- assert len(table_links) == 1
- assert table_links[0]["source_table_full_name"] == "a.b.source"
- assert table_links[0]["target_table_full_name"] == "a.b.target"
- assert table_links[0]["event_time"] == _CLOSER_DATE
- assert table_links[0]["other"] == "more recent stuff"
-
-
- def test_LineageLinks_add():
- deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
- for link in _COLUMN_LINEAGES:
- deduplicated_lineage.add(link)
-
- lineage = deduplicated_lineage.lineage
- assert len(lineage) == 2
- assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
- assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
- assert (
- lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
- == "newer stuff"
- )