castor-extractor 0.22.1__py3-none-any.whl → 0.22.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

CHANGELOG.md CHANGED
@@ -1,6 +1,22 @@
 
  # Changelog
 
+ ## 0.22.5 - 2025-01-09
+
+ * Databricks: validate and deduplicate lineage links
+
+ ## 0.22.4 - 2025-01-08
+
+ * ThoughtSpot: extract answers
+
+ ## 0.22.3 - 2024-12-10
+
+ * Databricks: extract lineage from system tables
+
+ ## 0.22.2 - 2024-12-06
+
+ * Sigma: multithreading to retrieve lineage
+
  ## 0.22.1 - 2024-12-05
 
  * Salesforce: deduplicate tables
@@ -1,9 +1,11 @@
  from collections.abc import Iterator
+ from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from http import HTTPStatus
  from typing import Callable, Optional
 
  import requests
+ from pydantic import BaseModel
 
  from ....utils import (
  APIClient,
@@ -12,6 +14,7 @@ from ....utils import (
  build_url,
  fetch_all_pages,
  handle_response,
+ retry,
  )
  from ..assets import SigmaAsset
  from .credentials import SigmaCredentials
@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
  )
 
  _AUTH_TIMEOUT_S = 60
- _SIGMA_TIMEOUT = 120
+ _SIGMA_TIMEOUT_S = 300
 
  _SIGMA_HEADERS = {
  "Content-Type": _CONTENT_TYPE,
@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
  max_errors=_VOLUME_IGNORED,
  status_codes=_IGNORED_ERROR_CODES,
  )
+ _THREADS_LINEAGE = 10 # empirically found; hit the rate limit with 20 workers
+ _RETRY_NUMBER = 1
+ _RETRY_BASE_MS = 60_000
+
+
+ class LineageContext(BaseModel):
+ """all info needed to build the endpoint for lineage retrieval"""
+
+ workbook_id: str
+ element_id: str
+
+
+ class Lineage(BaseModel):
+ """holds response from lineage API and context used to retrieve it"""
+
+ lineage: dict
+ context: LineageContext
 
 
  class SigmaBearerAuth(BearerAuth):
@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
  host=credentials.host,
  auth=auth,
  headers=_SIGMA_HEADERS,
- timeout=_SIGMA_TIMEOUT,
+ timeout=_SIGMA_TIMEOUT_S,
  safe_mode=safe_mode or SIGMA_SAFE_MODE,
  )
 
@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
  page=page, workbook_id=workbook_id
  )
 
- def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+ @retry(
+ (ConnectionError,),
+ max_retries=_RETRY_NUMBER,
+ base_ms=_RETRY_BASE_MS,
+ log_exc_info=True,
+ )
+ def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+ """
+ return the lineage from API and other ids needed to characterize
+ lineage in castor
+ """
+ workbook_id = lineage_context.workbook_id
+ element_id = lineage_context.element_id
+ endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+ return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+ @staticmethod
+ def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+ """
+ Helper function to prepare context for lineage retrieval.
+ Elements without associated columns are skipped.
+ """
+ contexts: list[LineageContext] = []
  for element in elements:
- workbook_id = element["workbook_id"]
- element_id = element["elementId"]
- lineage = self._get(
- endpoint=SigmaEndpointFactory.lineage(workbook_id, element_id)
+ if element.get("columns") is None:
+ continue
+
+ context = LineageContext(
+ workbook_id=element["workbook_id"],
+ element_id=element["elementId"],
  )
+ contexts.append(context)
+ return contexts
+
+ def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+ lineage_context = self._lineage_context(elements)
+
+ with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+ results = executor.map(self._get_lineage, lineage_context)
+
+ for lineage in results:
  yield {
- **lineage,
- "workbook_id": workbook_id,
- "element_id": element_id,
+ **lineage.lineage,
+ "workbook_id": lineage.context.workbook_id,
+ "element_id": lineage.context.element_id,
  }
 
  def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:
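The Sigma hunks above replace the sequential per-element lineage calls with a thread pool capped at 10 workers plus a retry on ConnectionError. A minimal standalone sketch of the same fan-out pattern, using a hypothetical fetch function and a placeholder URL rather than the real Sigma endpoint factory:

    from concurrent.futures import ThreadPoolExecutor

    import requests

    _THREADS_LINEAGE = 10  # mirrors the cap chosen in the diff to stay under the rate limit


    def fetch_lineage(element: dict) -> dict:
        # hypothetical endpoint layout, for illustration only
        url = (
            f"https://api.example.com/v2/workbooks/{element['workbook_id']}"
            f"/lineage/elements/{element['element_id']}"
        )
        response = requests.get(url, timeout=300)
        response.raise_for_status()
        # keep the ids next to the payload, as the new _get_all_lineages does
        return {
            **response.json(),
            "workbook_id": element["workbook_id"],
            "element_id": element["element_id"],
        }


    def fetch_all_lineages(elements: list[dict]) -> list[dict]:
        # executor.map bounds concurrency and preserves the input order
        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
            return list(executor.map(fetch_lineage, elements))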
@@ -4,6 +4,8 @@ from ...types import ExternalAsset
  class ThoughtspotAsset(ExternalAsset):
  """Thoughtspot assets"""
 
+ ANSWERS = "answers"
+ ANSWER_USAGES = "answer_usages"
  LIVEBOARDS = "liveboards"
+ LIVEBOARD_USAGES = "liveboard_usages"
  LOGICAL_TABLES = "logical_tables"
- USAGES = "usages"
@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
  "Content-Type": "application/json",
  }
  _METADATA_BATCH_SIZE = 100
- _USAGE_LIVEBOARD_ID = "bea79810-145f-4ad0-a02c-4177a6e7d861"
+ # https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+ _OBJECT_USAGE_LIVEBOARD = "Object Usage"
+ _ANSWER_USAGE_VIZ = "Answer Usage, by User"
+ # https://docs.thoughtspot.com/cloud/latest/user-adoption
+ _USER_ADOPTION_LIVEBOARD = "User Adoption"
+ _LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
  # By default, no errors are ignored for the moment
  THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 
@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
  def _metadata_search(
  self,
  metadata_type: str,
+ identifier: Optional[str] = None,
  ) -> Iterator[dict]:
+ """
+ Yields assets of the given asset type, and optionally filters on a
+ specific identifier.
+ """
  offset = 0
+
  while True:
+ search_filters = {
+ "metadata": [{"type": metadata_type}],
+ "include_details": True,
+ "record_size": _METADATA_BATCH_SIZE,
+ "record_offset": offset,
+ }
+ if identifier:
+ search_filters["metadata"] = {
+ "identifier": identifier,
+ "type": metadata_type,
+ }
+
  metadata = self._post(
  ThoughtspotEndpointFactory.metadata_search(),
- data={
- "metadata": [{"type": metadata_type}],
- "include_details": True,
- "record_size": _METADATA_BATCH_SIZE,
- "record_offset": offset,
- },
+ data=search_filters,
  )
  yield from metadata
  if len(metadata) < _METADATA_BATCH_SIZE:
  break
  offset = offset + _METADATA_BATCH_SIZE
 
+ def _get_all_answers(self) -> Iterator[dict]:
+ yield from self._metadata_search(metadata_type="ANSWER")
+
  def _get_all_liveboards(self) -> Iterator[dict]:
  yield from self._metadata_search(metadata_type="LIVEBOARD")
 
@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
  def _get_all_tables(self) -> Iterator[dict]:
  yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
- def _get_liveboards_usages(self) -> Iterator[dict]:
+ def _get_usages(
+ self,
+ liveboard_name: str,
+ visualization_name: str,
+ ) -> Iterator[dict]:
+ """
+ Yields the data of a given visualization in the given liveboard.
+ ThoughtSpot maintains two system liveboards with stats about data usage,
+ which are useful to compute view counts and popularity.
+ """
+ usage_liveboard = next(
+ self._metadata_search(
+ metadata_type="LIVEBOARD", identifier=liveboard_name
+ )
+ )
+ liveboard_id = usage_liveboard["metadata_id"]
+
  data = self._post(
  endpoint=ThoughtspotEndpointFactory.liveboard(),
  headers={"Accept": "application/octet-stream"},
  data={
- "metadata_identifier": _USAGE_LIVEBOARD_ID,
+ "metadata_identifier": liveboard_id,
  "file_format": "CSV",
- "visualization_identifiers": [
- "Popular Liveboards Last 30 Days"
- ],
+ "visualization_identifiers": [visualization_name],
  },
  handler=lambda x: x.text,
  )
  yield from usage_liveboard_reader(data)
 
- def fetch(self, asset: ThoughtspotAsset):
+ def _get_answer_usages(self) -> Iterator[dict]:
+ return self._get_usages(
+ liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+ visualization_name=_ANSWER_USAGE_VIZ,
+ )
+
+ def _get_liveboards_usages(self) -> Iterator[dict]:
+ return self._get_usages(
+ liveboard_name=_USER_ADOPTION_LIVEBOARD,
+ visualization_name=_LIVEBOARD_USAGE_VIZ,
+ )
+
+ def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+ if asset == ThoughtspotAsset.ANSWERS:
+ yield from self._get_all_answers()
+
+ if asset == ThoughtspotAsset.ANSWER_USAGES:
+ yield from self._get_answer_usages()
+
  if asset == ThoughtspotAsset.LIVEBOARDS:
  yield from self._get_all_liveboards()
 
- if asset == ThoughtspotAsset.USAGES:
+ if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
  yield from self._get_liveboards_usages()
 
  if asset == ThoughtspotAsset.LOGICAL_TABLES:
@@ -1,13 +1,17 @@
  import csv
+ import re
  from collections.abc import Iterator
  from io import StringIO
 
+ _END_OF_GENERATED_TEXT = r'^""$'
+
 
  def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
  """
  Converts a CSV string into an iterator of dictionaries after
- ignoring the first 6 lines, using the 7th line as the header.
- First 6 lines looks like the following:
+ ignoring the generated text that precedes the actual CSV header row.
+ The generated block ends with a row containing only two double quotes.
+ Here is an example:
 
  "Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
@@ -15,11 +19,13 @@ def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
  "Pinboard NOT IN [mlm - availability pinboard,null]"
  "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
  "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+ ""
 
  """
  csv_file = StringIO(usage_liveboard_csv)
 
- for _ in range(7):
- next(csv_file)
+ line = next(csv_file)
+ while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
+ line = next(csv_file)
 
  yield from csv.DictReader(csv_file)
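The reader above no longer skips a fixed seven lines; it consumes the generated preamble until it reaches the row that contains only two double quotes, then hands the remaining lines to csv.DictReader. A self-contained sketch of that parsing approach (the sample string below is made up):

    import csv
    import re
    from io import StringIO

    _END_OF_GENERATED_TEXT = r'^""$'


    def read_usage_csv(raw: str) -> list[dict]:
        buffer = StringIO(raw)
        # consume the generated block; it ends with a line holding only ""
        line = next(buffer)
        while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
            line = next(buffer)
        # whatever is left is a regular CSV with a header row
        return list(csv.DictReader(buffer))


    sample = (
        '"Data extract produced by Castor on 01/07/2025 16:07"\n'
        '""\n'
        '"Answer name","Count of object interactions"\n'
        '"toto","666"\n'
    )
    assert read_usage_csv(sample) == [
        {"Answer name": "toto", "Count of object interactions": "666"}
    ]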
@@ -2,7 +2,7 @@ from .utils import (
  usage_liveboard_reader,
  )
 
- VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+ VALID_CSV_1 = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
  "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
  "Pinboard NOT IN [mlm - availability pinboard,null]"
@@ -16,6 +16,13 @@ VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "September test","25","2"'''
 
 
+ VALID_CSV_2 = '''"Data extract produced by Castor on 01/07/2025 16:07"
+ "Filters applied on data :"
+ "Timestamp >= 20241208 00:00:00 < 20250107 00:00:00"
+ ""
+ "Answer name","User name","Number of unique users","Count of object interactions"
+ "toto","tata","1","666"'''
+
  # Invalid CSV input (missing data rows)
  INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
@@ -27,7 +34,7 @@ INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 
 
  def test_usage_liveboard_reader():
- expected_output = [
+ expected_output_1 = [
  {
  "Pinboard": "Market Report",
  "Pinboard Views": "559",
@@ -49,9 +56,20 @@ def test_usage_liveboard_reader():
  "Unique Number of User": "2",
  },
  ]
+ expected_output_2 = [
+ {
+ "Answer name": "toto",
+ "User name": "tata",
+ "Number of unique users": "1",
+ "Count of object interactions": "666",
+ }
+ ]
+
+ result = list(usage_liveboard_reader(VALID_CSV_1))
+ assert result == expected_output_1
 
- result = list(usage_liveboard_reader(VALID_CSV))
- assert result == expected_output
+ result = list(usage_liveboard_reader(VALID_CSV_2))
+ assert result == expected_output_2
 
  result = list(usage_liveboard_reader(INVALID_CSV))
  assert result == [] # Expect an empty result since there is no data
@@ -1,8 +1,6 @@
  import logging
- from collections.abc import Iterator
  from functools import partial
- from http import HTTPStatus
- from typing import Optional
+ from typing import Iterator, Optional
 
  import requests
 
@@ -14,16 +12,14 @@ from ...utils import (
  fetch_all_pages,
  handle_response,
  retry,
- retry_request,
  safe_mode,
  )
  from ..abstract import TimeFilter
  from .credentials import DatabricksCredentials
  from .endpoints import DatabricksEndpointFactory
  from .format import DatabricksFormatter, TagMapping
- from .lineage import single_column_lineage_links, single_table_lineage_links
  from .pagination import DATABRICKS_PAGE_SIZE, DatabricksPagination
- from .types import TablesColumns, TimestampedLink
+ from .types import TablesColumns
  from .utils import hourly_time_filters
 
  logger = logging.getLogger(__name__)
@@ -132,60 +128,6 @@ class DatabricksAPIClient(APIClient):
  column_tags=column_tags,
  )
 
- @safe_mode(safe_lineage_params, lambda: [])
- @retry(
- exceptions=_RETRY_EXCEPTIONS,
- max_retries=_RETRY_ATTEMPTS,
- base_ms=_RETRY_BASE_MS,
- )
- @retry_request(
- status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
- max_retries=_RETRY_ATTEMPTS,
- )
- def get_single_column_lineage(
- self,
- names: tuple[str, str],
- ) -> list[TimestampedLink]:
- """
- Helper function used in get_lineage_links.
- Call data lineage API and return the content of the result
-
- eg table_path: broward_prd.bronze.account_adjustments
- FYI: Maximum rate of 10 requests per SECOND
- """
- table_path, column_name = names
- payload = {
- "table_name": table_path,
- "column_name": column_name,
- "include_entity_lineage": True,
- }
- content = self._get(
- DatabricksEndpointFactory.column_lineage(), params=payload
- )
- column_path = f"{table_path}.{column_name}"
- return single_column_lineage_links(column_path, content)
-
- @safe_mode(safe_lineage_params, lambda: [])
- @retry(
- exceptions=_RETRY_EXCEPTIONS,
- max_retries=_RETRY_ATTEMPTS,
- base_ms=_RETRY_BASE_MS,
- )
- def get_single_table_lineage(
- self, table_path: str
- ) -> list[TimestampedLink]:
- """
- Helper function used in get_lineage_links.
- Call data lineage API and return the content of the result
- eg table_path: broward_prd.bronze.account_adjustments
- FYI: Maximum rate of 50 requests per SECOND
- """
- payload = {"table_name": table_path, "include_entity_lineage": True}
- content = self._get(
- DatabricksEndpointFactory.table_lineage(), params=payload
- )
- return single_table_lineage_links(table_path, content)
-
  @safe_mode(safe_query_params, lambda: [])
  @retry(
  exceptions=_RETRY_EXCEPTIONS,
@@ -1,17 +1,14 @@
  import logging
- from concurrent.futures import ThreadPoolExecutor
  from typing import Optional
 
- from ...utils import (
- mapping_from_rows,
- )
+ from ...utils import mapping_from_rows
  from ..abstract import TimeFilter
  from .api_client import DatabricksAPIClient
  from .credentials import DatabricksCredentials
+ from .enums import TagEntity
  from .format import DatabricksFormatter
- from .lineage import deduplicate_lineage, paths_for_column_lineage
- from .sql_client import DatabricksSQLClient, TagEntity
- from .types import TablesColumns, TimestampedLink
+ from .sql_client import DatabricksSQLClient
+ from .types import TablesColumns
 
  logger = logging.getLogger(__name__)
 
@@ -95,46 +92,6 @@ class DatabricksClient:
  columns.extend(c_to_add)
  return tables, columns
 
- def table_lineage(self, tables: list[dict]) -> list[dict]:
- """
- Wrapper function that retrieves all table lineage
- """
- # retrieve table lineage
- with ThreadPoolExecutor(max_workers=_THREADS_TABLE_LINEAGE) as executor:
- table_paths = [
- ".".join([table["schema_id"], table["table_name"]])
- for table in tables
- ]
- results = executor.map(
- self.api_client.get_single_table_lineage, table_paths
- )
- lineages = [link for links in results for link in links]
- deduplicated = deduplicate_lineage(lineages)
- return self.formatter.format_lineage(deduplicated)
-
- def column_lineage(
- self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
- ) -> list[dict]:
- """
- Wrapper function that retrieves all column lineage
- we only try to retrieve column lineage if we found table lineage
- """
- candidate_paths = paths_for_column_lineage(
- tables, columns, table_lineage
- )
- # retrieve column lineage
- with ThreadPoolExecutor(
- max_workers=_THREADS_COLUMN_LINEAGE
- ) as executor:
- results = executor.map(
- self.api_client.get_single_column_lineage, candidate_paths
- )
- lineages: list[TimestampedLink] = [
- link for links in results for link in links
- ]
- deduplicated = deduplicate_lineage(lineages)
- return self.formatter.format_lineage(deduplicated)
-
  def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
  return self.api_client.queries(time_filter)
 
@@ -1,14 +1,4 @@
- from unittest.mock import Mock, patch
-
- from .client import (
- DatabricksClient,
- )
- from .test_constants import (
- CLOSER_DATE,
- MOCK_TABLES_FOR_TABLE_LINEAGE,
- OLDER_DATE,
- TABLE_LINEAGE_SIDE_EFFECT,
- )
+ from .client import DatabricksClient
 
 
  class MockDatabricksClient(DatabricksClient):
@@ -48,27 +38,3 @@ def test_DatabricksClient__match_table_with_user():
  table_without_owner = {"id": 1, "owner_email": None}
  actual = client._match_table_with_user(table_without_owner, user_mapping)
  assert actual == table_without_owner
-
-
- @patch(
- "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksAPIClient._get",
- side_effect=TABLE_LINEAGE_SIDE_EFFECT,
- )
- def test_DatabricksClient_table_lineage(mock_get):
- client = DatabricksClient(Mock())
-
- lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
- assert len(lineage) == 2
-
- expected_link_1 = {
- "parent_path": "dev.silver.pre_analytics",
- "child_path": "dev.silver.analytics",
- "timestamp": OLDER_DATE,
- }
- expected_link_2 = {
- "parent_path": "dev.bronze.analytics",
- "child_path": "dev.silver.analytics",
- "timestamp": CLOSER_DATE,
- }
- assert expected_link_1 in lineage
- assert expected_link_2 in lineage
@@ -1,24 +1,22 @@
  from dataclasses import field
- from typing import Optional
 
- from pydantic.dataclasses import dataclass
- from pydantic_settings import SettingsConfigDict
+ from pydantic_settings import BaseSettings, SettingsConfigDict
 
  DATABRICKS_ENV_PREFIX = "CASTOR_DATABRICKS_"
 
 
- @dataclass
- class DatabricksCredentials:
+ class DatabricksCredentials(BaseSettings):
  """
  Credentials needed by Databricks client
  Requires:
  - host
+ - http_path
  - token
  """
 
  host: str
+ http_path: str
  token: str = field(metadata={"sensitive": True})
- http_path: Optional[str] = field(default=None)
 
  model_config = SettingsConfigDict(
  env_prefix=DATABRICKS_ENV_PREFIX,
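DatabricksCredentials now subclasses BaseSettings and makes http_path required alongside host and token. With pydantic-settings, the env_prefix above means the values can be picked up straight from CASTOR_DATABRICKS_* environment variables; a rough sketch under that assumption, using a stand-in class name and dummy values:

    import os

    from pydantic_settings import BaseSettings, SettingsConfigDict


    class ExampleCredentials(BaseSettings):
        # stand-in for the class above, without the sensitive-field metadata
        host: str
        http_path: str
        token: str

        model_config = SettingsConfigDict(env_prefix="CASTOR_DATABRICKS_")


    # dummy values, for illustration only
    os.environ["CASTOR_DATABRICKS_HOST"] = "adb-123.azuredatabricks.net"
    os.environ["CASTOR_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/abc"
    os.environ["CASTOR_DATABRICKS_TOKEN"] = "dapi-dummy"

    credentials = ExampleCredentials()
    assert credentials.http_path == "/sql/1.0/warehouses/abc"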
@@ -0,0 +1,15 @@
+ from enum import Enum
+
+
+ class LineageEntity(Enum):
+ """Entities that can be linked in Databricks lineage"""
+
+ COLUMN = "COLUMN"
+ TABLE = "TABLE"
+
+
+ class TagEntity(Enum):
+ """Entities that can be tagged in Databricks"""
+
+ COLUMN = "COLUMN"
+ TABLE = "TABLE"
@@ -1,4 +1,5 @@
  import logging
+ from datetime import date
  from typing import Optional
 
  from ...utils import AbstractStorage, LocalStorage, write_summary
@@ -16,6 +17,7 @@ from ..abstract import (
  )
  from .client import DatabricksClient
  from .credentials import DatabricksCredentials
+ from .enums import LineageEntity
 
  DATABRICKS_ASSETS: SupportedAssets = {
  WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
@@ -32,6 +34,12 @@ OTimeFilter = Optional[TimeFilter]
  Paths = dict[str, str]
 
 
+ def _day(time_filter: OTimeFilter) -> date:
+ if not time_filter:
+ return TimeFilter.default().day
+ return time_filter.day
+
+
  class DatabricksExtractionProcessor:
  """Databricks' API-based extraction management"""
 
@@ -96,22 +104,18 @@ class DatabricksExtractionProcessor:
  logger.info(f"Extracted {len(columns)} columns to {location}")
  return catalog_locations
 
- def extract_lineage(self) -> Paths:
+ def extract_lineage(self, time_filter: OTimeFilter = None) -> Paths:
  if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
  return self._existing_group_paths(
  WarehouseAssetGroup.ADDITIONAL_LINEAGE
  )
  lineage_locations: dict[str, str] = dict()
 
- # extract catalog
- databases = self._client.databases()
- schemas = self._client.schemas(databases)
- users = self._client.users()
- tables, columns = self._client.tables_and_columns(schemas, users)
- logger.info("Extracted pre-requisite catalog. Next comes lineage")
+ day = _day(time_filter)
+ client = self._client.sql_client
 
  # extract table lineage
- table_lineage = self._client.table_lineage(tables)
+ table_lineage = client.get_lineage(LineageEntity.TABLE, day)
  table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
  location = self._storage.put(table_lineage_key, table_lineage)
  lineage_locations[table_lineage_key] = location
@@ -119,9 +123,7 @@ class DatabricksExtractionProcessor:
  logger.info(msg)
 
  # extract column lineage
- column_lineage = self._client.column_lineage(
- tables, columns, table_lineage
- )
+ column_lineage = client.get_lineage(LineageEntity.COLUMN, day)
  column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
  location = self._storage.put(column_lineage_key, column_lineage)
  lineage_locations[column_lineage_key] = location
@@ -1,141 +1,69 @@
- from typing import cast
+ from typing import Iterable, Optional
 
- from .types import Link, Ostr, OTimestampedLink, TimestampedLink
+ from .enums import LineageEntity
 
 
- class LineageLinks:
+ class LineageProcessor:
  """
  helper class that handles lineage deduplication and filtering
  """
 
- def __init__(self):
- self.lineage: dict[Link, Ostr] = dict()
+ def __init__(self, lineage_entity: LineageEntity):
+ self.lineage_entity = lineage_entity
 
- def add(self, timestamped_link: TimestampedLink) -> None:
- """
- keep the most recent lineage link, adding to `self.lineage`
- """
- parent, child, timestamp = timestamped_link
- link = (parent, child)
- if not self.lineage.get(link):
- self.lineage[link] = timestamp
- return
-
- if not timestamp:
- return
- # keep most recent link; cast for mypy
- recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
- self.lineage[link] = recent
+ self.lineage: dict[tuple[str, str], dict] = dict()
 
+ def _parent_path(self, link) -> Optional[str]:
+ if self.lineage_entity == LineageEntity.TABLE:
+ return link["source_table_full_name"]
 
- def _to_table_path(table: dict) -> Ostr:
- if table.get("name"):
- return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
- return None
+ source_table = link["source_table_full_name"]
+ source_column = link["source_column_name"]
+ if not (source_table and source_column):
+ return None
 
+ return f"{source_table}.{source_column}"
 
- def _to_column_path(column: dict) -> Ostr:
- if column.get("name"):
- return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
- return None
+ def _child_path(self, link) -> Optional[str]:
+ if self.lineage_entity == LineageEntity.TABLE:
+ return link["target_table_full_name"]
 
+ target_table = link["target_table_full_name"]
+ target_column = link["target_column_name"]
+ if not (target_table and target_column):
+ return None
 
- def _link(path_from: Ostr, path_to: Ostr, timestamp: Ostr) -> OTimestampedLink:
- """exclude missing path and self-lineage"""
- if (not path_from) or (not path_to):
- return None
- is_self_lineage = path_from.lower() == path_to.lower()
- if is_self_lineage:
- return None
- return path_from, path_to, timestamp
+ return f"{target_table}.{target_column}"
 
+ def add(self, link: dict) -> None:
+ """
+ If the parent and child paths are valid, keeps the most recent lineage
+ link in the `self.lineage` map.
+ """
+ parent = self._parent_path(link)
+ child = self._child_path(link)
+ timestamp = link["event_time"]
 
- def single_table_lineage_links(
- table_path: str, single_table_lineage: dict
- ) -> list[TimestampedLink]:
- """
- process databricks lineage API response for a given table
- returns a list of (parent, child, timestamp)
-
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
- we could also have `notebookInfos` or `fileInfo`
- """
- links: list[OTimestampedLink] = []
- # add parent:
- for link in single_table_lineage.get("upstreams", []):
- parent = link.get("tableInfo", {})
- parent_path = _to_table_path(parent)
- timestamp: Ostr = parent.get("lineage_timestamp")
- links.append(_link(parent_path, table_path, timestamp))
-
- # add children:
- for link in single_table_lineage.get("downstreams", []):
- child = link.get("tableInfo", {})
- child_path = _to_table_path(child)
- timestamp = child.get("lineage_timestamp")
- links.append(_link(table_path, child_path, timestamp))
-
- return list(filter(None, links))
-
-
- def single_column_lineage_links(
- column_path: str, single_column_lineage: dict
- ) -> list[TimestampedLink]:
- """
- process databricks lineage API response for a given table
- returns a list of (parent, child, timestamp)
-
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
- we could also have `notebookInfos` or `fileInfo`
- """
- links: list[OTimestampedLink] = []
- # add parent:
- for link in single_column_lineage.get("upstream_cols", []):
- parent_path = _to_column_path(link)
- timestamp: Ostr = link.get("lineage_timestamp")
- links.append(_link(parent_path, column_path, timestamp))
+ if not (parent and child and parent != child):
+ return
 
- # add children:
- for link in single_column_lineage.get("downstream_cols", []):
- child_path = _to_column_path(link)
- timestamp = link.get("lineage_timestamp")
- links.append(_link(column_path, child_path, timestamp))
+ key = (parent, child)
+ if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
+ return
 
- return list(filter(None, links))
+ self.lineage[key] = link
 
 
- def paths_for_column_lineage(
- tables: list[dict], columns: list[dict], table_lineage: list[dict]
- ) -> list[tuple[str, str]]:
+ def valid_lineage(
+ lineage: Iterable[dict], lineage_entity: LineageEntity
+ ) -> list[dict]:
  """
- helper providing a list of candidate columns to look lineage for:
- we only look for column lineage where there is table lineage
+ Filters out self-lineage or lineage with a missing source or target path,
+ then deduplicates by picking the link with the most recent event timestamp.
  """
- # mapping between table id and its path db.schema.table
- # table["schema_id"] follows the pattern `db.schema`
- mapping = {
- table["id"]: ".".join([table["schema_id"], table["table_name"]])
- for table in tables
- }
-
- tables_with_lineage: set[str] = set()
- for t in table_lineage:
- tables_with_lineage.add(t["parent_path"])
- tables_with_lineage.add(t["child_path"])
-
- paths_to_return: list[tuple[str, str]] = []
- for column in columns:
- table_path = mapping[column["table_id"]]
- if table_path not in tables_with_lineage:
- continue
- column_ = (table_path, column["column_name"])
- paths_to_return.append(column_)
-
- return paths_to_return
-
-
- def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
- deduplicated_lineage = LineageLinks()
- for timestamped_link in lineages:
- deduplicated_lineage.add(timestamped_link)
- return deduplicated_lineage.lineage
+ deduplicated_lineage = LineageProcessor(lineage_entity)
+
+ for link in lineage:
+ deduplicated_lineage.add(link)
+
+ return list(deduplicated_lineage.lineage.values())
@@ -1,34 +1,89 @@
- from .lineage import LineageLinks
- from .test_constants import (
- CLOSER_DATE,
- OLDER_DATE,
- )
+ from .enums import LineageEntity
+ from .lineage import LineageProcessor, valid_lineage
+
+ _OLDER_DATE = "2025-01-01 00:00:01.0"
+ _CLOSER_DATE = "2025-01-01 02:02:02.0"
+
+ _TABLE_LINEAGES = [
+ {
+ "source_table_full_name": "a.b.source",
+ "target_table_full_name": "a.b.target",
+ "event_time": _CLOSER_DATE,
+ "other": "more recent stuff",
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "target_table_full_name": "a.b.target",
+ "event_time": _OLDER_DATE,
+ "other": "stuff that's too old",
+ },
+ {
+ "source_table_full_name": "no target",
+ "target_table_full_name": None,
+ "event_time": _CLOSER_DATE,
+ },
+ {
+ "source_table_full_name": None,
+ "target_table_full_name": "no source",
+ "event_time": _CLOSER_DATE,
+ },
+ ]
+
+
+ _COLUMN_LINEAGES = [
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "src_col",
+ "target_table_full_name": "a.b.target",
+ "target_column_name": "trgt_col",
+ "event_time": _OLDER_DATE,
+ "other": "old stuff",
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "src_col",
+ "target_table_full_name": "a.b.target",
+ "target_column_name": "trgt_col",
+ "event_time": _CLOSER_DATE,
+ "other": "newer stuff",
+ },
+ {
+ "source_table_full_name": "a.b.toto",
+ "source_column_name": "toto_col",
+ "target_table_full_name": "a.b.tata",
+ "target_column_name": "tata_col",
+ "event_time": _OLDER_DATE,
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "a.b.source",
+ "target_table_full_name": None,
+ "target_column_name": None,
+ "event_time": _CLOSER_DATE,
+ },
+ ]
+
+
+ def test_valid_lineage():
+ table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
+
+ assert len(table_links) == 1
+ assert table_links[0]["source_table_full_name"] == "a.b.source"
+ assert table_links[0]["target_table_full_name"] == "a.b.target"
+ assert table_links[0]["event_time"] == _CLOSER_DATE
+ assert table_links[0]["other"] == "more recent stuff"
 
 
  def test_LineageLinks_add():
- links = LineageLinks()
- timestamped_link = ("parent", "child", None)
- expected_key = ("parent", "child")
-
- links.add(timestamped_link)
-
- assert expected_key in links.lineage
- assert links.lineage[expected_key] is None
-
- # we replace None by an actual timestamp
- timestamped_link = ("parent", "child", OLDER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == OLDER_DATE
-
- # we update with the more recent timestamp
- timestamped_link = ("parent", "child", CLOSER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == CLOSER_DATE
-
- # we keep the more recent timestamp
- timestamped_link = ("parent", "child", OLDER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == CLOSER_DATE
+ deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
+ for link in _COLUMN_LINEAGES:
+ deduplicated_lineage.add(link)
+
+ lineage = deduplicated_lineage.lineage
+ assert len(lineage) == 2
+ assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
+ assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
+ assert (
+ lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
+ == "newer stuff"
+ )
@@ -1,24 +1,24 @@
  import logging
  from collections import defaultdict
- from enum import Enum
+ from datetime import date
  from typing import Optional
 
  from databricks import sql # type: ignore
 
  from .credentials import DatabricksCredentials
+ from .enums import LineageEntity, TagEntity
  from .format import TagMapping
+ from .lineage import valid_lineage
  from .utils import build_path, tag_label
 
  logger = logging.getLogger(__name__)
 
  _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
 
-
- class TagEntity(Enum):
- """Entities that can be tagged in Databricks"""
-
- COLUMN = "COLUMN"
- TABLE = "TABLE"
+ _LINEAGE_SQL_TPL = """
+ SELECT * FROM system.access.{table_name}
+ WHERE event_date = :day
+ """
 
 
  class DatabricksSQLClient:
@@ -71,7 +71,6 @@ class DatabricksSQLClient:
  https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
  """
  if not self._needs_extraction(entity):
- # extracting tags require additional credentials (http_path)
  return dict()
 
  table = f"{entity.value.lower()}_tags"
@@ -88,3 +87,19 @@ class DatabricksSQLClient:
  mapping[path].append(label)
 
  return mapping
+
+ def get_lineage(
+ self, lineage_entity: LineageEntity, day: date
+ ) -> list[dict]:
+ """
+ Fetch {TABLE|COLUMN} lineage of the given day, via system tables
+ https://docs.databricks.com/en/admin/system-tables/lineage.html
+ """
+ table_name = f"{lineage_entity.value.lower()}_lineage"
+ query = _LINEAGE_SQL_TPL.format(table_name=table_name)
+ params = {"day": day}
+ result = self.execute_sql(query, params)
+ data = []
+ for row in result:
+ data.append(row.asDict())
+ return valid_lineage(data, lineage_entity)
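get_lineage above formats the system table name from the entity (table_lineage or column_lineage under system.access), filters on a single event_date with a named :day parameter, and runs the rows through valid_lineage. A rough sketch of the same query issued directly with the databricks-sql-connector, assuming its native :name parameter binding; the hostname, http_path, token and date are placeholders:

    from datetime import date

    from databricks import sql  # databricks-sql-connector

    _LINEAGE_SQL = """
    SELECT * FROM system.access.table_lineage
    WHERE event_date = :day
    """

    # placeholder connection settings
    connection = sql.connect(
        server_hostname="adb-123.azuredatabricks.net",
        http_path="/sql/1.0/warehouses/abc",
        access_token="dapi-dummy",
    )
    cursor = connection.cursor()
    cursor.execute(_LINEAGE_SQL, {"day": date(2025, 1, 8)})
    rows = [row.asDict() for row in cursor.fetchall()]
    cursor.close()
    connection.close()
    # rows would then go through valid_lineage(rows, LineageEntity.TABLE)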
@@ -1,8 +1 @@
- from typing import Optional
-
- Link = tuple[str, str]
  TablesColumns = tuple[list[dict], list[dict]]
- Ostr = Optional[str]
- TimestampedLink = tuple[str, str, Ostr]
-
- OTimestampedLink = Optional[TimestampedLink]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.22.1
+ Version: 0.22.5
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -207,6 +207,22 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
 
  # Changelog
 
+ ## 0.22.5 - 2025-01-09
+
+ * Databricks: validate and deduplicate lineage links
+
+ ## 0.22.4 - 2025-01-08
+
+ * ThoughtSpot: extract answers
+
+ ## 0.22.3 - 2024-12-10
+
+ * Databricks: extract lineage from system tables
+
+ ## 0.22.2 - 2024-12-06
+
+ * Sigma: multithreading to retrieve lineage
+
  ## 0.22.1 - 2024-12-05
 
  * Salesforce: deduplicate tables
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=p1jUz1AWTVMfmt6dwNvWxUSloLrkhHoWRxpT2RU1Hcc,15058
+ CHANGELOG.md,sha256=JzTJEZxIMP9F_aePVfIvqLt0OuG0jYcDygsLyfTAV84,15335
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -252,7 +252,7 @@ castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLG
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
  castor_extractor/visualization/sigma/assets.py,sha256=JZ1Cpxnml8P3mIJoTUM57hvylB18ErECQXaP5FF63O4,268
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
- castor_extractor/visualization/sigma/client/client.py,sha256=nT61lN2yRpKd6jeqwR0NVOAUVpA5KAQyHkEGTl7n00A,6283
+ castor_extractor/visualization/sigma/client/client.py,sha256=d9CpE7vRZAPGzck0jFn37LY_6E_Njz9D1sCnFVGJSWk,8006
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
  castor_extractor/visualization/sigma/client/endpoints.py,sha256=DBFphbgoH78_MZUGM_bKBAq28Nl7LWSZ6VRsbxrxtDg,1162
  castor_extractor/visualization/sigma/client/pagination.py,sha256=kNEhNq08tTGbypyMjxs0w4uvDtQc_iaWpOZweaa_FsU,690
@@ -306,13 +306,13 @@ castor_extractor/visualization/tableau_revamp/client/rest_fields.py,sha256=3kvaq
  castor_extractor/visualization/tableau_revamp/constants.py,sha256=lHGB50FgVNO2nXeIhkvQKivD8ZFBIjDrflgD5cTXKJw,104
  castor_extractor/visualization/tableau_revamp/extract.py,sha256=HqnBypuNGx_xKk-68WEOy_ucD15LuRF4t2xXf0XKPE0,1370
  castor_extractor/visualization/thoughtspot/__init__.py,sha256=NhTGUk5Kdt54oCjHYoAt0cLBmVLys5lFYiRANL6wCmI,150
- castor_extractor/visualization/thoughtspot/assets.py,sha256=lPRvXk0PKybgLv1AcDVxg-ssf4XLTs0biRqLrqC2TzU,196
+ castor_extractor/visualization/thoughtspot/assets.py,sha256=SAQWPKaD2NTSDg7-GSkcRSSEkKSws0MJfOVcHkdeTSg,276
  castor_extractor/visualization/thoughtspot/client/__init__.py,sha256=svrE2rMxR-OXctjPeAHMEPePlfcra-9KDevTMcHunAA,86
- castor_extractor/visualization/thoughtspot/client/client.py,sha256=RHOaJjvlWcSdASXzvlgMbmsSU9oTIixPhH8g0NgyIbc,3719
+ castor_extractor/visualization/thoughtspot/client/client.py,sha256=mtwMCPI1-1tyZb1gSYYr-O2QZMTFQwNgillU6ycsOU4,5552
  castor_extractor/visualization/thoughtspot/client/credentials.py,sha256=fp4YHiZy-dstWiLr5c4kFU9SyPK5rd2nCeh8k5sVRpM,462
  castor_extractor/visualization/thoughtspot/client/endpoints.py,sha256=u3FRkmG6j5OIMEeXWZcgRObP8JeC4EutIJEeitNV44c,330
- castor_extractor/visualization/thoughtspot/client/utils.py,sha256=ua7-10HKpFHYRDBVGLJ5hIEfuUA7ryIH9tl0sBjl0MU,883
- castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256=-5ZaEYpQSrIp1-Sx-ViQOLPlv2LoOajEs2mE5YNi_tU,1887
+ castor_extractor/visualization/thoughtspot/client/utils.py,sha256=3LgbIWoG1e39VW8rYaV4ot_0EFipziwf3rFAZKxrlEY,1072
+ castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256=2XysRU7a58KA2JgNwU2j4GPrN0rkN7Gvk8kQCJlYXVk,2469
  castor_extractor/visualization/thoughtspot/extract.py,sha256=mcXS0jGFpa50td98AVbbTqxchyI5wDCpB-v1o5iRc3g,1354
  castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
@@ -340,21 +340,21 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
  castor_extractor/warehouse/bigquery/query.py,sha256=FEekxlkrfAXzsT8Kj1AIqYd5mURB5MlZIkbFVXVqEhU,4762
  castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXkv51bNTp4AO0QSdw,57
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
- castor_extractor/warehouse/databricks/api_client.py,sha256=1E3t8uCi3b8xVXLCodwlH5y8FIGmu9otORvA7ZqcGKE,8283
+ castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
  castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
- castor_extractor/warehouse/databricks/client.py,sha256=K3RafGL_UerFAGmRKK2Cp2IXzalQYqkneQFvgsYdOZY,4993
- castor_extractor/warehouse/databricks/client_test.py,sha256=UKr_D3M8mhqV1oL2_3y_6pEzAFLVE3FHDNZh4omFLK4,2286
- castor_extractor/warehouse/databricks/credentials.py,sha256=iphbVynVTQXMEbJy4QaT5fer-GpOi7QtbAlg8R7-Lj4,598
+ castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
+ castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
+ castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
  castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
- castor_extractor/warehouse/databricks/extract.py,sha256=G_-78-vrvEyn8rcKXXDXlxjad4Ot-Ko4vnhvEcOzjJQ,7389
+ castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
+ castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
  castor_extractor/warehouse/databricks/format.py,sha256=FUBMrFFWSa_lX5PtixJCDR3eRYycqeMw0oKHt7AkA4o,6732
  castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
- castor_extractor/warehouse/databricks/lineage.py,sha256=RUCcKz19R0dJVab6JUSUbGx4L5Vyb4sVoTAwLbfgjxo,4700
- castor_extractor/warehouse/databricks/lineage_test.py,sha256=EejO4qKH_kJlJSrIap6GvkUi9E55RFvfiySKazAh0_A,1048
+ castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
+ castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
  castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
- castor_extractor/warehouse/databricks/sql_client.py,sha256=KBP0rmMQBWw3jshDfv_NpFW8HqPxGfcBkS4d9T9aXvE,2977
- castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
- castor_extractor/warehouse/databricks/types.py,sha256=-qO5y-uI95B666iDhyNM0TL8WlwYC-3Q4xZuolh3PwE,205
+ castor_extractor/warehouse/databricks/sql_client.py,sha256=5isGsRL0MW1lu_E_xTyCvSj_rwaJ2nh-kPlhvTvDy_w,3566
+ castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
  castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
  castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
  castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
@@ -436,8 +436,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
  castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.22.1.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.22.1.dist-info/METADATA,sha256=52H1eJe_L62yUSWkBJYLbRanXS6OdauukGW0RfeNiS4,22075
- castor_extractor-0.22.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.22.1.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
- castor_extractor-0.22.1.dist-info/RECORD,,
+ castor_extractor-0.22.5.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.22.5.dist-info/METADATA,sha256=11A9xI9Bd6Uu1Na_AJngfTbkt-ECXjsabWNTppaZsOk,22352
+ castor_extractor-0.22.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.22.5.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
+ castor_extractor-0.22.5.dist-info/RECORD,,
@@ -1,79 +0,0 @@
- OLDER_DATE = "2024-04-18 20:20:20.0"
- CLOSER_DATE = "2024-04-19 20:20:20.0"
-
- MOCK_TABLES_FOR_TABLE_LINEAGE = [
- {
- "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
- "schema_id": "dev.silver",
- "table_name": "analytics",
- },
- {
- "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
- "schema_id": "dev.silver",
- "table_name": "pre_analytics",
- },
- {
- "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
- "schema_id": "dev.bronze",
- "table_name": "analytics",
- },
- ]
-
- _RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
- "upstreams": [
- { # there could be other keys: jobInfos, notebookInfos, queryInfos
- "tableInfo": {
- "name": "pre_analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "bronze",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": CLOSER_DATE,
- }
- },
- ],
- "downstreams": [],
- }
- _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
- "upstreams": [],
- "downstreams": [
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- ],
- }
- _RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
- "upstreams": [],
- "downstreams": [
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- ],
- }
-
- # should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
- TABLE_LINEAGE_SIDE_EFFECT: tuple = (
- _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
- _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
- _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
- )