castor-extractor 0.24.36__py3-none-any.whl → 0.24.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

Files changed (31)
  1. CHANGELOG.md +18 -0
  2. castor_extractor/commands/upload.py +10 -0
  3. castor_extractor/uploader/__init__.py +1 -0
  4. castor_extractor/uploader/constant.py +5 -1
  5. castor_extractor/uploader/enums.py +8 -0
  6. castor_extractor/uploader/settings.py +2 -0
  7. castor_extractor/uploader/upload.py +26 -5
  8. castor_extractor/uploader/upload_test.py +5 -3
  9. castor_extractor/visualization/powerbi/client/client.py +2 -1
  10. castor_extractor/visualization/sigma/assets.py +3 -0
  11. castor_extractor/visualization/sigma/client/client.py +42 -5
  12. castor_extractor/visualization/sigma/client/endpoints.py +17 -0
  13. castor_extractor/visualization/sigma/client/sources_transformer.py +94 -0
  14. castor_extractor/visualization/sigma/client/sources_transformer_test.py +101 -0
  15. castor_extractor/visualization/sigma/extract.py +17 -1
  16. castor_extractor/warehouse/databricks/client.py +4 -5
  17. castor_extractor/warehouse/databricks/client_test.py +2 -1
  18. castor_extractor/warehouse/databricks/queries/column_lineage.sql +25 -0
  19. castor_extractor/warehouse/databricks/queries/table_lineage.sql +23 -0
  20. castor_extractor/warehouse/databricks/sql_client.py +14 -11
  21. castor_extractor/warehouse/sqlserver/extract.py +3 -1
  22. castor_extractor/warehouse/sqlserver/queries/column.sql +3 -3
  23. castor_extractor/warehouse/sqlserver/queries/schema.sql +7 -2
  24. castor_extractor/warehouse/sqlserver/queries/table.sql +1 -1
  25. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA +19 -1
  26. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD +29 -26
  27. castor_extractor/warehouse/databricks/lineage.py +0 -69
  28. castor_extractor/warehouse/databricks/lineage_test.py +0 -89
  29. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/LICENCE +0 -0
  30. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/WHEEL +0 -0
  31. {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/entry_points.txt +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
  # Changelog

+ ## 0.24.40 - 2025-08-18
+
+ * SQLServer: fix database allowlist/blocklist filtering
+
+ ## 0.24.39 - 2025-08-18
+
+ * Databricks:
+ * Fix vanishing owner ID column for tables
+ * Deduplicate lineage with SQL to reduce memory use
+
+ ## 0.24.38 - 2025-08-07
+
+ * Uploader: Support US and EU zones
+
+ ## 0.24.37 - 2025-08-06
+
+ * Sigma: extract data models, dataset sources and workbook sources
+
  ## 0.24.36 - 2025-08-04

  * Sigma:
castor_extractor/commands/upload.py CHANGED
@@ -3,6 +3,7 @@ from argparse import ArgumentParser

  from castor_extractor.uploader import ( # type: ignore
  FileType,
+ Zone,
  upload_any,
  )
  from castor_extractor.utils import parse_filled_arguments # type: ignore
@@ -40,6 +41,15 @@ def _args() -> ArgumentParser:
  ),
  choices=supported_file_type,
  )
+ supported_zones = [zone.value for zone in Zone]
+ parser.add_argument(
+ "-z",
+ "--zone",
+ help="geographic zone to upload, currently supported are {}, defaults to EU".format(
+ supported_zones,
+ ),
+ choices=supported_zones,
+ )
  return parser


castor_extractor/uploader/__init__.py CHANGED
@@ -1,2 +1,3 @@
  from .constant import FileType
+ from .enums import Zone
  from .upload import upload, upload_any, upload_manifest
castor_extractor/uploader/constant.py CHANGED
@@ -1,9 +1,13 @@
  from enum import Enum

  from ..utils import RetryStrategy
+ from .enums import Zone

  # url of the gcs proxy
- INGEST_URL = "https://ingest.castordoc.com"
+ INGEST_URLS = {
+ Zone.EU: "https://ingest.castordoc.com",
+ Zone.US: "https://ingest.us.castordoc.com",
+ }

  RETRY_BASE_MS = 10_000
  RETRY_JITTER_MS = 1_000
castor_extractor/uploader/enums.py ADDED
@@ -0,0 +1,8 @@
+ from enum import Enum
+
+
+ class Zone(Enum):
+ """Geographic cluster location"""
+
+ EU = "EU"
+ US = "US"
castor_extractor/uploader/settings.py CHANGED
@@ -4,6 +4,7 @@ from pydantic import UUID4, Field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  from .constant import FileType
+ from .enums import Zone

  UPLOADER_ENV_PREFIX = "CASTOR_UPLOADER_"

@@ -22,3 +23,4 @@ class UploaderSettings(BaseSettings):
  file_type: FileType
  source_id: UUID4
  token: str = Field(repr=False)
+ zone: Optional[Zone] = Zone.EU
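Since `UploaderSettings` is a pydantic `BaseSettings` model with the `CASTOR_UPLOADER_` prefix, the new field should also be settable from the environment. A hedged sketch, assuming the standard pydantic-settings prefix-plus-field-name mapping (the exact model config is not shown in this hunk):

```python
import os

# Hypothetical: with env_prefix "CASTOR_UPLOADER_", the `zone` field would be
# read from CASTOR_UPLOADER_ZONE; left unset, it falls back to the Zone.EU default.
os.environ["CASTOR_UPLOADER_ZONE"] = "US"
```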
castor_extractor/uploader/upload.py CHANGED
@@ -10,13 +10,14 @@ import requests

  from ..utils.retry import retry
  from .constant import (
- INGEST_URL,
+ INGEST_URLS,
  PATH_TEMPLATES,
  RETRY_BASE_MS,
  RETRY_JITTER_MS,
  RETRY_STRATEGY,
  FileType,
  )
+ from .enums import Zone
  from .env import get_blob_env
  from .settings import UploaderSettings
  from .utils import iter_files
@@ -33,6 +34,7 @@ def _path_and_url(
  source_id: UUID,
  file_type: FileType,
  file_path: str,
+ zone: Zone,
  ) -> tuple[str, str]:
  now = datetime.utcnow()
  timestamp = int(now.timestamp())
@@ -44,7 +46,7 @@
  filename=filename,
  )

- url = f"{INGEST_URL}/{path}"
+ url = f"{INGEST_URLS[zone]}/{path}"

  return path, url

@@ -61,13 +63,16 @@ def _upload(
  source_id: UUID,
  file_path: str,
  file_type: FileType,
+ zone: Optional[Zone] = Zone.EU,
  ) -> None:
  """
  Upload the given file to Google Cloud Storage (GCS)
  - Don't call GCS API directly
  - Call the ingestion proxy which handles authorisation and uploading
  """
- path, url = _path_and_url(source_id, file_type, file_path)
+ if not zone:
+ zone = Zone.EU
+ path, url = _path_and_url(source_id, file_type, file_path, zone)
  headers = _headers(token)
  timeout, max_retries = get_blob_env()

@@ -97,6 +102,7 @@ def _upload(
  def upload_manifest(
  token: str,
  source_id: UUID,
+ zone: Optional[Zone],
  file_path: Optional[str] = None,
  ) -> None:
  """
@@ -106,13 +112,20 @@
  """
  if not file_path:
  raise ValueError("file path is needed to upload a manifest")
- _upload(token, source_id, file_path, FileType.DBT)
+ _upload(
+ token=token,
+ source_id=source_id,
+ file_path=file_path,
+ file_type=FileType.DBT,
+ zone=zone,
+ )


  def upload(
  token: str,
  source_id: UUID,
  file_type: FileType,
+ zone: Optional[Zone],
  file_path: Optional[str] = None,
  directory_path: Optional[str] = None,
  ) -> None:
@@ -133,7 +146,13 @@
  raise ValueError(message)

  for file_ in files:
- _upload(token, source_id, file_, file_type)
+ _upload(
+ token=token,
+ source_id=source_id,
+ file_path=file_,
+ file_type=file_type,
+ zone=zone,
+ )


  def upload_any(**kwargs) -> None:
@@ -156,6 +175,7 @@ def upload_any(**kwargs) -> None:
  token=settings.token,
  source_id=settings.source_id,
  file_path=settings.file_path,
+ zone=settings.zone,
  )
  return None

@@ -165,4 +185,5 @@
  file_type=file_type,
  file_path=settings.file_path,
  directory_path=settings.directory_path,
+ zone=settings.zone,
  )
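Taken together, the uploader changes thread the new `Zone` from settings and CLI down to URL construction, with `None` falling back to `Zone.EU`. A minimal sketch of that selection logic, reusing the enum values and URLs from the hunks above (the helper name `_ingest_url_for` is hypothetical):

```python
from enum import Enum
from typing import Optional


class Zone(Enum):
    """Geographic cluster location (copied from uploader/enums.py above)."""

    EU = "EU"
    US = "US"


INGEST_URLS = {
    Zone.EU: "https://ingest.castordoc.com",
    Zone.US: "https://ingest.us.castordoc.com",
}


def _ingest_url_for(zone: Optional[Zone]) -> str:
    # Mirrors the fallback added in _upload(): a missing zone means EU.
    return INGEST_URLS[zone or Zone.EU]


assert _ingest_url_for(None) == "https://ingest.castordoc.com"
assert _ingest_url_for(Zone.US) == "https://ingest.us.castordoc.com"
```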
castor_extractor/uploader/upload_test.py CHANGED
@@ -1,6 +1,7 @@
  from uuid import UUID

- from .constant import INGEST_URL, FileType
+ from .constant import INGEST_URLS, FileType
+ from .enums import Zone
  from .upload import _path_and_url


@@ -8,7 +9,8 @@ def test__path():
  source_id = UUID("399a8b22-3187-11ec-8d3d-0242ac130003")
  file_type = FileType.VIZ
  file_path = "filename"
+ zone = Zone.EU

- path, url = _path_and_url(source_id, file_type, file_path)
+ path, url = _path_and_url(source_id, file_type, file_path, zone)
  assert path == f"visualization-{source_id}/{file_path}"
- assert url == f"{INGEST_URL}/{path}"
+ assert url == f"{INGEST_URLS[Zone.EU]}/{path}"
castor_extractor/visualization/powerbi/client/client.py CHANGED
@@ -28,6 +28,7 @@ POWERBI_DEFAULT_TIMEOUT_S = 30
  METADATA_BATCH_SIZE = 100
  POWERBI_SCAN_STATUS_DONE = "Succeeded"
  POWERBI_SCAN_SLEEP_S = 1
+ POWERBI_SCAN_TIMEOUT_S = 60

  MAX_RETRY_PAGES = 1
  RETRY_PAGES_TIMEOUT_MS = 35 * 1000 # 35 seconds
@@ -142,7 +143,7 @@ class PowerbiClient(APIClient):
  endpoint = self.endpoint_factory.metadata_scan_status(scan_id)
  total_waiting_time_s = 0

- while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
+ while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
  try:
  result = self._get(endpoint)
  except HTTPError as e:
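The dedicated `POWERBI_SCAN_TIMEOUT_S` gives the scan-status polling loop its own 60-second budget instead of reusing the 30-second request timeout. A rough sketch of the polling pattern implied by the hunk above; `get_status` is a stand-in for the real `self._get(endpoint)` call, not the client's actual API:

```python
import time
from typing import Callable

POWERBI_SCAN_SLEEP_S = 1
POWERBI_SCAN_TIMEOUT_S = 60
POWERBI_SCAN_STATUS_DONE = "Succeeded"


def wait_for_scan(get_status: Callable[[], str]) -> bool:
    """Polls until the scan reports success or the time budget runs out."""
    total_waiting_time_s = 0
    while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
        if get_status() == POWERBI_SCAN_STATUS_DONE:
            return True
        time.sleep(POWERBI_SCAN_SLEEP_S)
        total_waiting_time_s += POWERBI_SCAN_SLEEP_S
    return False
```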
castor_extractor/visualization/sigma/assets.py CHANGED
@@ -4,10 +4,13 @@ from ...types import ExternalAsset
  class SigmaAsset(ExternalAsset):
  """Sigma assets"""

+ DATAMODELS = "datamodels"
  DATASETS = "datasets"
+ DATASET_SOURCES = "dataset_sources"
  ELEMENTS = "elements"
  FILES = "files"
  LINEAGES = "lineages"
  MEMBERS = "members"
  QUERIES = "queries"
  WORKBOOKS = "workbooks"
+ WORKBOOK_SOURCES = "workbook_sources"
castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  from collections.abc import Iterator
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
@@ -24,6 +25,9 @@ from .pagination import (
  SIGMA_QUERIES_PAGINATION_LIMIT,
  SigmaPagination,
  )
+ from .sources_transformer import SigmaSourcesTransformer
+
+ logger = logging.getLogger(__name__)

  _CONTENT_TYPE = "application/x-www-form-urlencoded"

@@ -135,6 +139,12 @@ class SigmaClient(APIClient):
  params={"limit": limit},
  )

+ def _get_all_datamodels(self) -> Iterator[dict]:
+ request = self._get_paginated(
+ endpoint=SigmaEndpointFactory.datamodels()
+ )
+ yield from fetch_all_pages(request, SigmaPagination)
+
  def _get_all_datasets(self) -> Iterator[dict]:
  request = self._get_paginated(endpoint=SigmaEndpointFactory.datasets())
  yield from fetch_all_pages(request, SigmaPagination)
@@ -275,18 +285,36 @@ class SigmaClient(APIClient):

  yield from self._yield_deduplicated_queries(queries, workbook_id)

+ def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+ yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
+
+ def _get_all_workbook_sources(
+ self, workbooks: list[dict]
+ ) -> Iterator[dict]:
+ yield from SigmaSourcesTransformer(self).get_workbook_sources(workbooks)
+
  def fetch(
  self,
  asset: SigmaAsset,
- workbooks: Optional[list[dict]] = None,
+ datasets: Optional[list[dict]] = None,
  elements: Optional[list[dict]] = None,
+ workbooks: Optional[list[dict]] = None,
  ) -> Iterator[dict]:
  """Returns the needed metadata for the queried asset"""
- if asset == SigmaAsset.DATASETS:
+ if asset == SigmaAsset.DATAMODELS:
+ yield from self._get_all_datamodels()
+
+ elif asset == SigmaAsset.DATASETS:
  yield from self._get_all_datasets()

+ elif asset == SigmaAsset.DATASET_SOURCES:
+ if datasets is None:
+ raise ValueError("Missing datasets to extract dataset sources")
+
+ yield from self._get_all_dataset_sources(datasets)
+
  elif asset == SigmaAsset.ELEMENTS:
- if not workbooks:
+ if workbooks is None:
  raise ValueError("Missing workbooks to extract elements")

  yield from self._get_all_elements(workbooks)
@@ -295,15 +323,16 @@
  yield from self._get_all_files()

  elif asset == SigmaAsset.LINEAGES:
- if not elements:
+ if elements is None:
  raise ValueError("Missing elements to extract lineage")
+
  yield from self._get_all_lineages(elements)

  elif asset == SigmaAsset.MEMBERS:
  yield from self._get_all_members()

  elif asset == SigmaAsset.QUERIES:
- if not workbooks:
+ if workbooks is None:
  raise ValueError("Missing workbooks to extract queries")

  yield from self._get_all_queries(workbooks)
@@ -311,5 +340,13 @@
  elif asset == SigmaAsset.WORKBOOKS:
  yield from self._get_all_workbooks()

+ elif asset == SigmaAsset.WORKBOOK_SOURCES:
+ if workbooks is None:
+ raise ValueError(
+ "Missing workbooks to extract workbook sources"
+ )
+
+ yield from self._get_all_workbook_sources(workbooks)
+
  else:
  raise ValueError(f"This asset {asset} is unknown")
castor_extractor/visualization/sigma/client/endpoints.py CHANGED
@@ -1,6 +1,7 @@
  class SigmaEndpointFactory:
  """Wrapper class around all endpoints we're using"""

+ DATAMODELS = "dataModels"
  DATASETS = "datasets"
  FILES = "files"
  MEMBERS = "members"
@@ -10,10 +11,22 @@ class SigmaEndpointFactory:
  def authentication(cls) -> str:
  return "v2/auth/token"

+ @classmethod
+ def connection_path(cls, inode_id: str) -> str:
+ return f"v2/connections/paths/{inode_id}"
+
+ @classmethod
+ def datamodels(cls) -> str:
+ return f"v2/{cls.DATAMODELS}"
+
  @classmethod
  def datasets(cls) -> str:
  return f"v2/{cls.DATASETS}"

+ @classmethod
+ def dataset_sources(cls, dataset_id: str) -> str:
+ return f"v2/{cls.DATASETS}/{dataset_id}/sources"
+
  @classmethod
  def elements(cls, workbook_id: str, page_id: str) -> str:
  return f"v2/{cls.WORKBOOKS}/{workbook_id}/pages/{page_id}/elements"
@@ -41,3 +54,7 @@
  @classmethod
  def workbooks(cls) -> str:
  return f"v2/{cls.WORKBOOKS}"
+
+ @classmethod
+ def workbook_sources(cls, workbook_id: str) -> str:
+ return f"v2/{cls.WORKBOOKS}/{workbook_id}/sources"
castor_extractor/visualization/sigma/client/sources_transformer.py ADDED
@@ -0,0 +1,94 @@
+ import logging
+ from typing import TYPE_CHECKING, Callable, Iterator
+
+ from .endpoints import SigmaEndpointFactory
+
+ if TYPE_CHECKING:
+ from .client import SigmaClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class SigmaSourcesTransformer:
+ """Retrieves asset sources and enhances them with additional information."""
+
+ def __init__(self, api_client: "SigmaClient"):
+ self.api_client = api_client
+
+ def _map_table_id_to_connection_path(
+ self, all_sources: list
+ ) -> dict[str, dict]:
+ """Maps a table id to its connection and path information."""
+ logger.info("Mapping table ids to connection and path information")
+
+ unique_table_ids = {
+ source["inodeId"]
+ for asset_sources in all_sources
+ for source in asset_sources["sources"]
+ if source["type"] == "table"
+ }
+
+ return {
+ table_id: self.api_client._get(
+ endpoint=SigmaEndpointFactory.connection_path(table_id)
+ )
+ for table_id in unique_table_ids
+ }
+
+ @staticmethod
+ def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+ """
+ Combines a single table source with its connection and path information.
+ """
+ if source["type"] != "table":
+ return source
+
+ path_info = table_to_path.get(source["inodeId"], {})
+ source["connectionId"] = path_info.get("connectionId")
+ source["path"] = path_info.get("path")
+ return source
+
+ def _transform_sources(
+ self, all_sources: list, table_to_path: dict
+ ) -> Iterator[dict]:
+ """
+ Yields all sources, with table sources being enhanced with additional information.
+ """
+ logger.info("Merging sources with table information")
+
+ for asset_sources in all_sources:
+ enhanced_sources = [
+ self._enhance_table_source(source, table_to_path)
+ for source in asset_sources["sources"]
+ ]
+
+ yield {
+ "asset_id": asset_sources["asset_id"],
+ "sources": enhanced_sources,
+ }
+
+ def _get_all_sources(
+ self, endpoint: Callable[[str], str], asset_ids: set[str]
+ ) -> Iterator[dict]:
+ """Returns transformed sources for the given assets"""
+ all_sources = []
+
+ for asset_id in asset_ids:
+ sources = self.api_client._get(endpoint=endpoint(asset_id))
+ all_sources.append({"asset_id": asset_id, "sources": sources})
+
+ table_to_path = self._map_table_id_to_connection_path(all_sources)
+
+ yield from self._transform_sources(all_sources, table_to_path)
+
+ def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+ asset_ids = {dataset["datasetId"] for dataset in datasets}
+ yield from self._get_all_sources(
+ endpoint=SigmaEndpointFactory.dataset_sources, asset_ids=asset_ids
+ )
+
+ def get_workbook_sources(self, workbooks: list[dict]) -> Iterator[dict]:
+ asset_ids = {workbook["workbookId"] for workbook in workbooks}
+ yield from self._get_all_sources(
+ endpoint=SigmaEndpointFactory.workbook_sources, asset_ids=asset_ids
+ )
castor_extractor/visualization/sigma/client/sources_transformer_test.py ADDED
@@ -0,0 +1,101 @@
+ from unittest.mock import Mock
+
+ from .sources_transformer import SigmaSourcesTransformer
+
+ _ALL_SOURCES = [
+ {
+ "asset_id": "asset1",
+ "sources": [
+ {"type": "dataset", "inodeId": "1234"}, # non-table source
+ {"type": "table", "inodeId": "table1"},
+ {"type": "table", "inodeId": "table2"},
+ ],
+ },
+ {
+ "asset_id": "asset2",
+ "sources": [
+ {"type": "table", "inodeId": "table1"}, # repeated source
+ ],
+ },
+ ]
+
+
+ _TABLE_TO_PATH = {
+ "table1": {
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ },
+ "table2": {
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ },
+ }
+
+
+ def test__map_table_id_to_connection_path():
+ transformer = SigmaSourcesTransformer(api_client=Mock())
+
+ def mock_get(endpoint):
+ if "table1" in endpoint:
+ return _TABLE_TO_PATH["table1"]
+ elif "table2" in endpoint:
+ return _TABLE_TO_PATH["table2"]
+ else:
+ raise ValueError(f"Unexpected endpoint: {endpoint}")
+
+ transformer.api_client._get.side_effect = mock_get
+
+ result = transformer._map_table_id_to_connection_path(_ALL_SOURCES)
+
+ assert len(result) == 2
+ assert result["table1"] == {
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ }
+ assert result["table2"] == {
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ }
+ assert transformer.api_client._get.call_count == 2
+
+
+ def test__transform_sources():
+ transformer = SigmaSourcesTransformer(api_client=Mock())
+
+ result = list(transformer._transform_sources(_ALL_SOURCES, _TABLE_TO_PATH))
+
+ assert len(result) == 2
+
+ asset_1_results = result[0]
+ assert len(asset_1_results["sources"]) == 3
+ actual_sources = sorted(
+ asset_1_results["sources"], key=lambda x: x["inodeId"]
+ )
+ expected_sources = [
+ {"type": "dataset", "inodeId": "1234"},
+ {
+ "type": "table",
+ "inodeId": "table1",
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ },
+ {
+ "type": "table",
+ "inodeId": "table2",
+ "connectionId": "conn2",
+ "path": ["db", "schema", "table2"],
+ },
+ ]
+ expected_sources = sorted(expected_sources, key=lambda x: x["inodeId"])
+ assert actual_sources == expected_sources
+
+ asset_2_results = result[1]
+ assert asset_2_results["asset_id"] == "asset2"
+ assert asset_2_results["sources"] == [
+ {
+ "type": "table",
+ "inodeId": "table1",
+ "connectionId": "conn1",
+ "path": ["db", "schema", "table1"],
+ }
+ ]
castor_extractor/visualization/sigma/extract.py CHANGED
@@ -22,14 +22,30 @@ def iterate_all_data(
  ) -> Iterable[tuple[SigmaAsset, Union[list, Iterator, dict]]]:
  """Iterate over the extracted data from Sigma"""

+ logger.info("Extracting DATA MODELS from API")
+ datamodels = client.fetch(SigmaAsset.DATAMODELS)
+ yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+
  logger.info("Extracting DATASETS from API")
- datasets = client.fetch(SigmaAsset.DATASETS)
+ datasets = list(client.fetch(SigmaAsset.DATASETS))
  yield SigmaAsset.DATASETS, list(deep_serialize(datasets))

+ logger.info("Extracting DATASET SOURCES from API")
+ dataset_sources = client.fetch(
+ SigmaAsset.DATASET_SOURCES, datasets=datasets
+ )
+ yield SigmaAsset.DATASET_SOURCES, list(deep_serialize(dataset_sources))
+
  logger.info("Extracting WORKBOOKS from API")
  workbooks = list(client.fetch(SigmaAsset.WORKBOOKS))
  yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbooks))

+ logger.info("Extracting WORKBOOK SOURCES from API")
+ workbook_sources = client.fetch(
+ SigmaAsset.WORKBOOK_SOURCES, workbooks=workbooks
+ )
+ yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbook_sources))
+
  logger.info("Extracting FILES from API")
  files = client.fetch(SigmaAsset.FILES)
  yield SigmaAsset.FILES, list(deep_serialize(files))
castor_extractor/warehouse/databricks/client.py CHANGED
@@ -46,12 +46,11 @@ class DatabricksClient:

  @staticmethod
  def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
+ """Matches the table's owner email to an ID, or None if not found."""
  table_owner_email = table.get("owner_email")
- if not table_owner_email:
- return table
- owner_external_id = user_mapping.get(table_owner_email)
- if not owner_external_id:
- return table
+ owner_external_id = (
+ user_mapping.get(table_owner_email) if table_owner_email else None
+ )
  return {**table, "owner_external_id": owner_external_id}

  @staticmethod
castor_extractor/warehouse/databricks/client_test.py CHANGED
@@ -36,5 +36,6 @@ def test_DatabricksClient__match_table_with_user():
  assert table_with_owner == {**table, "owner_external_id": 3}

  table_without_owner = {"id": 1, "owner_email": None}
+ expected = {"id": 1, "owner_email": None, "owner_external_id": None}
  actual = client._match_table_with_user(table_without_owner, user_mapping)
- assert actual == table_without_owner
+ assert actual == expected
castor_extractor/warehouse/databricks/queries/column_lineage.sql ADDED
@@ -0,0 +1,25 @@
+ /*
+ Selects all column lineage events for the given day.
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+ */
+ WITH deduplicated_lineage AS (
+ SELECT *,
+ ROW_NUMBER() OVER (
+ PARTITION BY source_table_full_name, source_column_name, target_table_full_name, target_column_name
+ ORDER BY event_time DESC
+ ) AS rank
+ FROM system.access.column_lineage
+ WHERE
+ TRUE
+ AND event_date = DATE('{day}')
+ AND source_table_full_name IS NOT NULL
+ AND source_column_name IS NOT NULL
+ AND target_table_full_name IS NOT NULL
+ AND target_column_name IS NOT NULL
+ AND CONCAT(source_table_full_name, '.', source_column_name) != CONCAT(target_table_full_name, '.', target_column_name)
+ )
+ SELECT *
+ FROM deduplicated_lineage
+ WHERE rank = 1
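The `ROW_NUMBER()`/`rank = 1` pattern keeps, for each (source, target) pair, only the most recent lineage event, and the `WHERE` clause drops self-lineage and incomplete rows; this moves the deduplication previously done in memory by `LineageProcessor` (removed further down) into the warehouse. For illustration only, a small Python equivalent of the table-level deduplication:

```python
def dedup_latest(rows: list[dict]) -> list[dict]:
    """Keep only the most recent event per (source, target) pair."""
    latest: dict[tuple, dict] = {}
    for row in rows:
        key = (row["source_table_full_name"], row["target_table_full_name"])
        if key not in latest or row["event_time"] > latest[key]["event_time"]:
            latest[key] = row
    return list(latest.values())
```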
castor_extractor/warehouse/databricks/queries/table_lineage.sql ADDED
@@ -0,0 +1,23 @@
+ /*
+ Selects all table lineage events for the given day.
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+ */
+ WITH deduplicated_lineage AS (
+ SELECT *,
+ ROW_NUMBER() OVER (
+ PARTITION BY source_table_full_name, target_table_full_name
+ ORDER BY event_time DESC
+ ) AS rank
+ FROM system.access.table_lineage
+ WHERE
+ TRUE
+ AND event_date = DATE('{day}')
+ AND source_table_full_name IS NOT NULL
+ AND target_table_full_name IS NOT NULL
+ AND source_table_full_name != target_table_full_name
+ )
+ SELECT *
+ FROM deduplicated_lineage
+ WHERE rank = 1
castor_extractor/warehouse/databricks/sql_client.py CHANGED
@@ -4,20 +4,25 @@ from datetime import date

  from databricks import sql # type: ignore

+ from ...utils import load_file
  from .credentials import DatabricksCredentials
  from .enums import LineageEntity, TagEntity
  from .format import TagMapping
- from .lineage import valid_lineage
  from .utils import build_path, tag_label

  logger = logging.getLogger(__name__)

  _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"

- _LINEAGE_SQL_TPL = """
- SELECT * FROM system.access.{table_name}
- WHERE event_date = DATE('{day}')
- """
+ _LINEAGE_SQL_PATHS = {
+ LineageEntity.COLUMN: "queries/column_lineage.sql",
+ LineageEntity.TABLE: "queries/table_lineage.sql",
+ }
+
+
+ def _load_lineage_query(lineage_entity: LineageEntity) -> str:
+ filename = _LINEAGE_SQL_PATHS[lineage_entity]
+ return load_file(filename, __file__)


  class DatabricksSQLClient:
@@ -95,13 +100,11 @@ class DatabricksSQLClient:
  Unfortunately, passing parameters is not always supported. We have to
  format the query beforehand and pass it as plain text for execution.
  """
- table_name = f"{lineage_entity.value.lower()}_lineage"
- query = _LINEAGE_SQL_TPL.format(
- table_name=table_name,
- day=day,
- )
+ query_template = _load_lineage_query(lineage_entity)
+ query = query_template.format(day=day)
+
  result = self.execute_sql(query)
  data = []
  for row in result:
  data.append(row.asDict())
- return valid_lineage(data, lineage_entity)
+ return data
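The lineage queries now live in standalone `.sql` files and are formatted with the extraction day before execution. A minimal sketch of that load-and-format step, assuming the templates are read relative to the module (the real code delegates this to the package's `load_file` helper, whose exact behaviour is not shown here):

```python
from datetime import date
from pathlib import Path

_QUERIES_DIR = Path(__file__).parent / "queries"


def build_lineage_query(entity: str, day: date) -> str:
    # entity is "table" or "column"; the {day} placeholder is filled in
    # because parameter binding is not always supported by the connector.
    template = (_QUERIES_DIR / f"{entity}_lineage.sql").read_text()
    return template.format(day=day)
```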
castor_extractor/warehouse/sqlserver/extract.py CHANGED
@@ -52,7 +52,9 @@ def extract_all(**kwargs) -> None:
  client = MSSQLClient(credentials=_credentials(kwargs))

  databases = filter_items(
- client.get_databases(), kwargs.get("allowed"), kwargs.get("blocked")
+ items=client.get_databases(),
+ allowed=kwargs.get("db_allowed"),
+ blocked=kwargs.get("db_blocked"),
  )

  query_builder = MSSQLQueryBuilder(
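The fix passes the database allowlist and blocklist under their `db_`-prefixed keys so they actually reach the filter. A rough stand-in for the allow/block semantics this feeds into (the real `filter_items` lives in the package's utils and may differ in its details):

```python
from typing import Iterable, Optional


def filter_items_sketch(
    items: Iterable[str],
    allowed: Optional[list[str]] = None,
    blocked: Optional[list[str]] = None,
) -> list[str]:
    # Keep only allowed items (if an allowlist is given), then drop blocked ones.
    kept = [item for item in items if not allowed or item in allowed]
    return [item for item in kept if not blocked or item not in blocked]


assert filter_items_sketch(["db1", "db2", "db3"], allowed=["db1", "db2"], blocked=["db2"]) == ["db1"]
```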
castor_extractor/warehouse/sqlserver/queries/column.sql CHANGED
@@ -91,9 +91,9 @@ columns AS (
  LEFT JOIN column_ids AS i
  ON
  (
- c.table_name = i.table_name
- AND c.table_schema = i.schema_name
- AND c.column_name = i.column_name
+ c.table_name COLLATE DATABASE_DEFAULT = i.table_name COLLATE DATABASE_DEFAULT
+ AND c.table_schema COLLATE DATABASE_DEFAULT = i.schema_name COLLATE DATABASE_DEFAULT
+ AND c.column_name COLLATE DATABASE_DEFAULT = i.column_name COLLATE DATABASE_DEFAULT
  )
  )

castor_extractor/warehouse/sqlserver/queries/schema.sql CHANGED
@@ -1,4 +1,9 @@
- -- Fetch database information
+ /*
+ Fetch database information
+
+ Collation is a set of rules that defines how text data is stored and compared, and it can differ between databases.
+ The "COLLATE DATABASE_DEFAULT" is to ensure that text is compared with the same collation.
+ */
  WITH ids AS (
  SELECT DISTINCT
  table_catalog,
@@ -19,4 +24,4 @@ INNER JOIN ids AS i
  LEFT JOIN {database}.sys.sysusers AS u
  ON s.principal_id = u.uid
  LEFT JOIN {database}.sys.databases AS d
- ON i.table_catalog = d.name
+ ON i.table_catalog COLLATE DATABASE_DEFAULT = d.name COLLATE DATABASE_DEFAULT
castor_extractor/warehouse/sqlserver/queries/table.sql CHANGED
@@ -92,7 +92,7 @@ meta AS (
  FROM
  {database}.information_schema.tables AS t
  LEFT JOIN {database}.sys.databases AS db
- ON t.table_catalog = db.name
+ ON t.table_catalog COLLATE DATABASE_DEFAULT = db.name COLLATE DATABASE_DEFAULT
  )

  SELECT
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.24.36
+ Version: 0.24.40
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -215,6 +215,24 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp

  # Changelog

+ ## 0.24.40 - 2025-08-18
+
+ * SQLServer: fix database allowlist/blocklist filtering
+
+ ## 0.24.39 - 2025-08-18
+
+ * Databricks:
+ * Fix vanishing owner ID column for tables
+ * Deduplicate lineage with SQL to reduce memory use
+
+ ## 0.24.38 - 2025-08-07
+
+ * Uploader: Support US and EU zones
+
+ ## 0.24.37 - 2025-08-06
+
+ * Sigma: extract data models, dataset sources and workbook sources
+
  ## 0.24.36 - 2025-08-04

  * Sigma:
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=HAHFgRYnv-pbsKwbHrRCrWoLpsqr8mg7Fp7tDsBsN9E,19030
+ CHANGELOG.md,sha256=tgZkN-SNTMCro37DG0nW91MaD6ZnHM9VWWZG2-7TP68,19406
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -29,7 +29,7 @@ castor_extractor/commands/extract_strategy.py,sha256=Q-pUymatPrBFGXobhyUPzFph0-t
  castor_extractor/commands/extract_tableau.py,sha256=LNtI29LbVk1vp4RNrn89GmdW6R_7QBYunRmkowDhbco,1982
  castor_extractor/commands/extract_thoughtspot.py,sha256=caAYJlH-vK7u5IUB6OKXxcaWfLgc7d_XqnFDWK6YNS4,639
  castor_extractor/commands/file_check.py,sha256=TJx76Ymd0QCECmq35zRJMkPE8DJtSInB28MuSXWk8Ao,2644
- castor_extractor/commands/upload.py,sha256=rLXp7gQ8zb1kLbho4FT87q8eJd8Gvo_TkyIynAaQ-4s,1342
+ castor_extractor/commands/upload.py,sha256=sqpEF_qqCNvT_niIrM6jPhzLaFVjtYwpc2iZw540F20,1633
  castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
  castor_extractor/file_checker/column.py,sha256=6bJhcW1snYwgHKkqlS0Ak7XLHZr4YBwO46JCIlnQNKg,3086
  castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -86,13 +86,14 @@ castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZ
  castor_extractor/transformation/dbt/client_test.py,sha256=RLL7y_pLDv2QBM03qBht8yYEooeT_woRADHcb8vgBQ4,4535
  castor_extractor/transformation/dbt/credentials.py,sha256=pGq7GqFQTw9TwN1DXSHC-0yJ2H6B_wMAbHyQTLqJVh0,543
  castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
- castor_extractor/uploader/__init__.py,sha256=A4bq_SrEtKAsl0r_D_duSTvL5WIQjVfsMy7tDx9IKg0,87
- castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
+ castor_extractor/uploader/__init__.py,sha256=xe3QHmHb35TILEhr7__nI_0t0tDolpQuujUyd84YcjI,111
+ castor_extractor/uploader/constant.py,sha256=ZmQtFx9nnR0GSLZ9k41upzV3ub4FJCUIyojIEVh-qIg,956
+ castor_extractor/uploader/enums.py,sha256=s5KVeBZWRDbDu-qOnrJhTSkSqzh0gxv0W1Z4cUsXfb8,109
  castor_extractor/uploader/env.py,sha256=5KiWHV-WTHfF68T_vzI-ypKAxzy9b9fnz2y4T3lH6QY,871
  castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
- castor_extractor/uploader/settings.py,sha256=3MvOX-UFRqrLZoiT7wYn9jUGro7NX4RCafYzrXrLQtA,590
- castor_extractor/uploader/upload.py,sha256=PSQfkO_7LSE0WBo9Tm_hlS2ONepKeB0cBFdJXySnues,4310
- castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
+ castor_extractor/uploader/settings.py,sha256=sUZpg9eHemM99DMrBW8bnlMuoTmCmLCKq-D0OCuQbGA,649
+ castor_extractor/uploader/upload.py,sha256=b2g9vWWjXWbt8Ms7brTc7OK_I7Z-1VSibNbppGoB2oQ,4764
+ castor_extractor/uploader/upload_test.py,sha256=UgN7TnT9Chn6KVzRcAX0Tuvp7-tps3ugxGitlgb9TSY,462
  castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
  castor_extractor/utils/__init__.py,sha256=z_BdKTUyuug3I5AzCuSGrAVskfLax4_olfORIjhZw_M,1691
  castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283
@@ -237,7 +238,7 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
  castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
  castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
  castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
- castor_extractor/visualization/powerbi/client/client.py,sha256=CWCYmj2spYin74qq9T8v2ZJ5TcxBuEy5EjArhCVZjLM,8141
+ castor_extractor/visualization/powerbi/client/client.py,sha256=9PRckoGdjfhOjhf5yqWTuNdivXcOC2PMgvcx-3uCh3k,8166
  castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
  castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
  castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
@@ -269,14 +270,16 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
  castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
  castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
- castor_extractor/visualization/sigma/assets.py,sha256=JZ1Cpxnml8P3mIJoTUM57hvylB18ErECQXaP5FF63O4,268
+ castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
- castor_extractor/visualization/sigma/client/client.py,sha256=ifCxhZ8-p9u7MnJRE8EYF_YP_G3REr_PELTSrtHiZwk,10099
+ castor_extractor/visualization/sigma/client/client.py,sha256=VU0BHlug3tCpGA1je0PjEy4hU4TKhCH9UUGi8LRmNy8,11422
  castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
- castor_extractor/visualization/sigma/client/endpoints.py,sha256=DBFphbgoH78_MZUGM_bKBAq28Nl7LWSZ6VRsbxrxtDg,1162
+ castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
  castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
- castor_extractor/visualization/sigma/extract.py,sha256=XIT1qsj6g6dgBWP8HPfj_medZexu48EaY9tUwi14gzM,2298
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=mRupzxjtjDqELIouHF0egBkgslDmn5Y4uqO_sbUGCNs,3244
+ castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
+ castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
  castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
  castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -334,18 +337,18 @@ castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXk
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
  castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
  castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
- castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
- castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
+ castor_extractor/warehouse/databricks/client.py,sha256=LzpeVQIOYi_QTfdOHbK6SB4SgxhZ7p9TNxh0Iwfz850,3307
+ castor_extractor/warehouse/databricks/client_test.py,sha256=dqEdEAt-6e8CtQ7M2L5vDYkn4JvOjqyqZSFEpQ55WRc,1432
  castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
  castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
  castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
  castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
  castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
  castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
- castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
- castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
  castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
- castor_extractor/warehouse/databricks/sql_client.py,sha256=BchHMNqHPtZsJWhj2XYq3QVVTj3XfKhzhhPTJng8vXo,3656
+ castor_extractor/warehouse/databricks/queries/column_lineage.sql,sha256=Q8MAZ5N3fNcolTMtRRw2fIrbKgV4ax9StgJgtYMpxNQ,980
+ castor_extractor/warehouse/databricks/queries/table_lineage.sql,sha256=5k5jHj11SdGpfMqJEKJihAhd_ngO4kZOZJ8TCPihWDs,786
+ castor_extractor/warehouse/databricks/sql_client.py,sha256=oypv_2pomoleXUJJhS8CSKO_ucalQhS9_mcsnsb5wsc,3750
  castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
  castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
  castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
@@ -419,17 +422,17 @@ castor_extractor/warehouse/snowflake/queries/view_ddl.sql,sha256=eWsci_50cxiYIv3
  castor_extractor/warehouse/snowflake/query.py,sha256=C2LTdPwBzMQ_zMncg0Kq4_WkoY7K9as5tvxBDrIOlwI,1763
  castor_extractor/warehouse/sqlserver/__init__.py,sha256=PdOuYznmvKAbfWAm8UdN47MfEsd9jqPi_dDi3WEo1KY,116
  castor_extractor/warehouse/sqlserver/client.py,sha256=Bjfpw96IKAQfWPiU5SZYEDfetwfkqZrnKbQYoStcnZc,2007
- castor_extractor/warehouse/sqlserver/extract.py,sha256=-LoHY5wAGJk4vutrO3N0_PaRqts7rkEn7pADRHzoxiI,2249
+ castor_extractor/warehouse/sqlserver/extract.py,sha256=GbOlSq8JR6HaJZunkfiRxaSt0pbgazQjF8GpgqWWIcU,2294
  castor_extractor/warehouse/sqlserver/queries/.sqlfluff,sha256=yy0KQdz8I_67vnXyX8eeWwOWkxTXvHyVKSVwhURktd8,48
- castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=_K5OS63N7fM7kGPudnnjJEnIyaxR1xE2hoZgnJ_A3p8,2763
+ castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=eRILCgdygYRvtfSdxaswIiIYKW-PiJXW2qi3yHtrfns,2913
  castor_extractor/warehouse/sqlserver/queries/database.sql,sha256=4dPeBCn85MEOXr1f-DPXxiI3RvvoE_1n8lsbTs26E0I,150
- castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=UR3eTiYw7Iq5-GukelnNg_uq6haZ_dwg_SedZfOWUoA,619
- castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=4RgeSkHDWTWRyU2iLxaBR0KuSwIBvb3GbQGdkJYXbn0,2787
+ castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=Zp4G86FJ_Be8Zqvdlu7K8DqmsUL62kxbwaUk5asZ0V4,881
+ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=YwFhHc6rGbszqQt7Izh7EngVwrrBoEZ9kniuWXNtGco,2837
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
  castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.24.36.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.24.36.dist-info/METADATA,sha256=m14Hk_AYJo9_bZE7IOb6U_LdhG8JfXnVqisiJHjgMS4,26483
- castor_extractor-0.24.36.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.24.36.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
- castor_extractor-0.24.36.dist-info/RECORD,,
+ castor_extractor-0.24.40.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.24.40.dist-info/METADATA,sha256=ONg1SCc3gcrOJqBE92EtyfQctf-hRxI_u2VUbBpvgVA,26859
+ castor_extractor-0.24.40.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.24.40.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+ castor_extractor-0.24.40.dist-info/RECORD,,
castor_extractor/warehouse/databricks/lineage.py DELETED
@@ -1,69 +0,0 @@
- from typing import Iterable, Optional
-
- from .enums import LineageEntity
-
-
- class LineageProcessor:
- """
- helper class that handles lineage deduplication and filtering
- """
-
- def __init__(self, lineage_entity: LineageEntity):
- self.lineage_entity = lineage_entity
-
- self.lineage: dict[tuple[str, str], dict] = dict()
-
- def _parent_path(self, link) -> Optional[str]:
- if self.lineage_entity == LineageEntity.TABLE:
- return link["source_table_full_name"]
-
- source_table = link["source_table_full_name"]
- source_column = link["source_column_name"]
- if not (source_table and source_column):
- return None
-
- return f"{source_table}.{source_column}"
-
- def _child_path(self, link) -> Optional[str]:
- if self.lineage_entity == LineageEntity.TABLE:
- return link["target_table_full_name"]
-
- target_table = link["target_table_full_name"]
- target_column = link["target_column_name"]
- if not (target_table and target_column):
- return None
-
- return f"{target_table}.{target_column}"
-
- def add(self, link: dict) -> None:
- """
- If the parent and child paths are valid, keeps the most recent lineage
- link in the `self.lineage` map.
- """
- parent = self._parent_path(link)
- child = self._child_path(link)
- timestamp = link["event_time"]
-
- if not (parent and child and parent != child):
- return
-
- key = (parent, child)
- if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
- return
-
- self.lineage[key] = link
-
-
- def valid_lineage(
- lineage: Iterable[dict], lineage_entity: LineageEntity
- ) -> list[dict]:
- """
- Filters out self-lineage or lineage with a missing source or target path,
- then deduplicates by picking the link with the most recent event timestmap.
- """
- deduplicated_lineage = LineageProcessor(lineage_entity)
-
- for link in lineage:
- deduplicated_lineage.add(link)
-
- return list(deduplicated_lineage.lineage.values())
castor_extractor/warehouse/databricks/lineage_test.py DELETED
@@ -1,89 +0,0 @@
- from .enums import LineageEntity
- from .lineage import LineageProcessor, valid_lineage
-
- _OLDER_DATE = "2025-01-01 00:00:01.0"
- _CLOSER_DATE = "2025-01-01 02:02:02.0"
-
- _TABLE_LINEAGES = [
- {
- "source_table_full_name": "a.b.source",
- "target_table_full_name": "a.b.target",
- "event_time": _CLOSER_DATE,
- "other": "more recent stuff",
- },
- {
- "source_table_full_name": "a.b.source",
- "target_table_full_name": "a.b.target",
- "event_time": _OLDER_DATE,
- "other": "stuff that's too old",
- },
- {
- "source_table_full_name": "no target",
- "target_table_full_name": None,
- "event_time": _CLOSER_DATE,
- },
- {
- "source_table_full_name": None,
- "target_table_full_name": "no source",
- "event_time": _CLOSER_DATE,
- },
- ]
-
-
- _COLUMN_LINEAGES = [
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "src_col",
- "target_table_full_name": "a.b.target",
- "target_column_name": "trgt_col",
- "event_time": _OLDER_DATE,
- "other": "old stuff",
- },
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "src_col",
- "target_table_full_name": "a.b.target",
- "target_column_name": "trgt_col",
- "event_time": _CLOSER_DATE,
- "other": "newer stuff",
- },
- {
- "source_table_full_name": "a.b.toto",
- "source_column_name": "toto_col",
- "target_table_full_name": "a.b.tata",
- "target_column_name": "tata_col",
- "event_time": _OLDER_DATE,
- },
- {
- "source_table_full_name": "a.b.source",
- "source_column_name": "a.b.source",
- "target_table_full_name": None,
- "target_column_name": None,
- "event_time": _CLOSER_DATE,
- },
- ]
-
-
- def test_valid_lineage():
- table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
-
- assert len(table_links) == 1
- assert table_links[0]["source_table_full_name"] == "a.b.source"
- assert table_links[0]["target_table_full_name"] == "a.b.target"
- assert table_links[0]["event_time"] == _CLOSER_DATE
- assert table_links[0]["other"] == "more recent stuff"
-
-
- def test_LineageLinks_add():
- deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
- for link in _COLUMN_LINEAGES:
- deduplicated_lineage.add(link)
-
- lineage = deduplicated_lineage.lineage
- assert len(lineage) == 2
- assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
- assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
- assert (
- lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
- == "newer stuff"
- )