castor-extractor 0.24.36__py3-none-any.whl → 0.24.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic.
- CHANGELOG.md +18 -0
- castor_extractor/commands/upload.py +10 -0
- castor_extractor/uploader/__init__.py +1 -0
- castor_extractor/uploader/constant.py +5 -1
- castor_extractor/uploader/enums.py +8 -0
- castor_extractor/uploader/settings.py +2 -0
- castor_extractor/uploader/upload.py +26 -5
- castor_extractor/uploader/upload_test.py +5 -3
- castor_extractor/visualization/powerbi/client/client.py +2 -1
- castor_extractor/visualization/sigma/assets.py +3 -0
- castor_extractor/visualization/sigma/client/client.py +42 -5
- castor_extractor/visualization/sigma/client/endpoints.py +17 -0
- castor_extractor/visualization/sigma/client/sources_transformer.py +94 -0
- castor_extractor/visualization/sigma/client/sources_transformer_test.py +101 -0
- castor_extractor/visualization/sigma/extract.py +17 -1
- castor_extractor/warehouse/databricks/client.py +4 -5
- castor_extractor/warehouse/databricks/client_test.py +2 -1
- castor_extractor/warehouse/databricks/queries/column_lineage.sql +25 -0
- castor_extractor/warehouse/databricks/queries/table_lineage.sql +23 -0
- castor_extractor/warehouse/databricks/sql_client.py +14 -11
- castor_extractor/warehouse/sqlserver/extract.py +3 -1
- castor_extractor/warehouse/sqlserver/queries/column.sql +3 -3
- castor_extractor/warehouse/sqlserver/queries/schema.sql +7 -2
- castor_extractor/warehouse/sqlserver/queries/table.sql +1 -1
- {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA +19 -1
- {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD +29 -26
- castor_extractor/warehouse/databricks/lineage.py +0 -69
- castor_extractor/warehouse/databricks/lineage_test.py +0 -89
- {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/LICENCE +0 -0
- {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/WHEEL +0 -0
- {castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
@@ -1,5 +1,23 @@
 # Changelog
 
+## 0.24.40 - 2025-08-18
+
+* SQLServer: fix database allowlist/blocklist filtering
+
+## 0.24.39 - 2025-08-18
+
+* Databricks:
+  * Fix vanishing owner ID column for tables
+  * Deduplicate lineage with SQL to reduce memory use
+
+## 0.24.38 - 2025-08-07
+
+* Uploader: Support US and EU zones
+
+## 0.24.37 - 2025-08-06
+
+* Sigma: extract data models, dataset sources and workbook sources
+
 ## 0.24.36 - 2025-08-04
 
 * Sigma:
castor_extractor/commands/upload.py
CHANGED
@@ -3,6 +3,7 @@ from argparse import ArgumentParser
 
 from castor_extractor.uploader import (  # type: ignore
     FileType,
+    Zone,
     upload_any,
 )
 from castor_extractor.utils import parse_filled_arguments  # type: ignore
@@ -40,6 +41,15 @@ def _args() -> ArgumentParser:
         ),
         choices=supported_file_type,
     )
+    supported_zones = [zone.value for zone in Zone]
+    parser.add_argument(
+        "-z",
+        "--zone",
+        help="geographic zone to upload, currently supported are {}, defaults to EU".format(
+            supported_zones,
+        ),
+        choices=supported_zones,
+    )
     return parser
 
 
castor_extractor/uploader/constant.py
CHANGED
@@ -1,9 +1,13 @@
 from enum import Enum
 
 from ..utils import RetryStrategy
+from .enums import Zone
 
 # url of the gcs proxy
-
+INGEST_URLS = {
+    Zone.EU: "https://ingest.castordoc.com",
+    Zone.US: "https://ingest.us.castordoc.com",
+}
 
 RETRY_BASE_MS = 10_000
 RETRY_JITTER_MS = 1_000
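Note: the new castor_extractor/uploader/enums.py (+8 lines) appears in the file list above but its hunk is not rendered in this view. Judging from how Zone is used (Zone.EU, Zone.US, and zone.value as CLI choices), it is presumably a small enum along these lines — a sketch only; the exact member values are an assumption:

from enum import Enum


class Zone(Enum):
    """Geographic ingestion zone (sketch; member values are assumed, not taken from the diff)."""

    EU = "eu"
    US = "us"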
castor_extractor/uploader/settings.py
CHANGED
@@ -4,6 +4,7 @@ from pydantic import UUID4, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from .constant import FileType
+from .enums import Zone
 
 UPLOADER_ENV_PREFIX = "CASTOR_UPLOADER_"
 
@@ -22,3 +23,4 @@ class UploaderSettings(BaseSettings):
     file_type: FileType
     source_id: UUID4
     token: str = Field(repr=False)
+    zone: Optional[Zone] = Zone.EU
castor_extractor/uploader/upload.py
CHANGED
@@ -10,13 +10,14 @@ import requests
 
 from ..utils.retry import retry
 from .constant import (
-
+    INGEST_URLS,
     PATH_TEMPLATES,
     RETRY_BASE_MS,
     RETRY_JITTER_MS,
     RETRY_STRATEGY,
     FileType,
 )
+from .enums import Zone
 from .env import get_blob_env
 from .settings import UploaderSettings
 from .utils import iter_files
@@ -33,6 +34,7 @@ def _path_and_url(
     source_id: UUID,
     file_type: FileType,
     file_path: str,
+    zone: Zone,
 ) -> tuple[str, str]:
     now = datetime.utcnow()
     timestamp = int(now.timestamp())
@@ -44,7 +46,7 @@ def _path_and_url(
         filename=filename,
     )
 
-    url = f"{
+    url = f"{INGEST_URLS[zone]}/{path}"
 
     return path, url
 
@@ -61,13 +63,16 @@ def _upload(
     source_id: UUID,
     file_path: str,
     file_type: FileType,
+    zone: Optional[Zone] = Zone.EU,
 ) -> None:
     """
     Upload the given file to Google Cloud Storage (GCS)
     - Don't call GCS API directly
     - Call the ingestion proxy which handles authorisation and uploading
     """
-
+    if not zone:
+        zone = Zone.EU
+    path, url = _path_and_url(source_id, file_type, file_path, zone)
     headers = _headers(token)
     timeout, max_retries = get_blob_env()
 
@@ -97,6 +102,7 @@ def _upload(
 def upload_manifest(
     token: str,
     source_id: UUID,
+    zone: Optional[Zone],
     file_path: Optional[str] = None,
 ) -> None:
     """
@@ -106,13 +112,20 @@ def upload_manifest(
     """
     if not file_path:
         raise ValueError("file path is needed to upload a manifest")
-    _upload(
+    _upload(
+        token=token,
+        source_id=source_id,
+        file_path=file_path,
+        file_type=FileType.DBT,
+        zone=zone,
+    )
 
 
 def upload(
     token: str,
     source_id: UUID,
     file_type: FileType,
+    zone: Optional[Zone],
     file_path: Optional[str] = None,
     directory_path: Optional[str] = None,
 ) -> None:
@@ -133,7 +146,13 @@ def upload(
         raise ValueError(message)
 
     for file_ in files:
-        _upload(
+        _upload(
+            token=token,
+            source_id=source_id,
+            file_path=file_,
+            file_type=file_type,
+            zone=zone,
+        )
 
 
 def upload_any(**kwargs) -> None:
@@ -156,6 +175,7 @@ def upload_any(**kwargs) -> None:
             token=settings.token,
             source_id=settings.source_id,
            file_path=settings.file_path,
+            zone=settings.zone,
        )
        return None
 
@@ -165,4 +185,5 @@ def upload_any(**kwargs) -> None:
        file_type=file_type,
        file_path=settings.file_path,
        directory_path=settings.directory_path,
+        zone=settings.zone,
    )
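Taken together, the uploader changes thread an optional zone from the CLI and settings down to the ingestion URL. A minimal usage sketch based on the diff above (keyword names follow the new signatures; the values are placeholders, and how UploaderSettings coerces them is not shown in this view):

from castor_extractor.uploader import FileType, Zone, upload_any

# Placeholder values for illustration only.
upload_any(
    token="<api-token>",
    source_id="399a8b22-3187-11ec-8d3d-0242ac130003",
    file_type=FileType.VIZ,
    file_path="./export/metadata.json",
    zone=Zone.US,  # omitted or None falls back to Zone.EU
)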
castor_extractor/uploader/upload_test.py
CHANGED
@@ -1,6 +1,7 @@
 from uuid import UUID
 
-from .constant import
+from .constant import INGEST_URLS, FileType
+from .enums import Zone
 from .upload import _path_and_url
 
 
@@ -8,7 +9,8 @@ def test__path():
     source_id = UUID("399a8b22-3187-11ec-8d3d-0242ac130003")
     file_type = FileType.VIZ
     file_path = "filename"
+    zone = Zone.EU
 
-    path, url = _path_and_url(source_id, file_type, file_path)
+    path, url = _path_and_url(source_id, file_type, file_path, zone)
     assert path == f"visualization-{source_id}/{file_path}"
-    assert url == f"{
+    assert url == f"{INGEST_URLS[Zone.EU]}/{path}"
castor_extractor/visualization/powerbi/client/client.py
CHANGED
@@ -28,6 +28,7 @@ POWERBI_DEFAULT_TIMEOUT_S = 30
 METADATA_BATCH_SIZE = 100
 POWERBI_SCAN_STATUS_DONE = "Succeeded"
 POWERBI_SCAN_SLEEP_S = 1
+POWERBI_SCAN_TIMEOUT_S = 60
 
 MAX_RETRY_PAGES = 1
 RETRY_PAGES_TIMEOUT_MS = 35 * 1000  # 35 seconds
@@ -142,7 +143,7 @@ class PowerbiClient(APIClient):
         endpoint = self.endpoint_factory.metadata_scan_status(scan_id)
         total_waiting_time_s = 0
 
-        while total_waiting_time_s <
+        while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
             try:
                 result = self._get(endpoint)
             except HTTPError as e:
|
|
|
4
4
|
class SigmaAsset(ExternalAsset):
|
|
5
5
|
"""Sigma assets"""
|
|
6
6
|
|
|
7
|
+
DATAMODELS = "datamodels"
|
|
7
8
|
DATASETS = "datasets"
|
|
9
|
+
DATASET_SOURCES = "dataset_sources"
|
|
8
10
|
ELEMENTS = "elements"
|
|
9
11
|
FILES = "files"
|
|
10
12
|
LINEAGES = "lineages"
|
|
11
13
|
MEMBERS = "members"
|
|
12
14
|
QUERIES = "queries"
|
|
13
15
|
WORKBOOKS = "workbooks"
|
|
16
|
+
WORKBOOK_SOURCES = "workbook_sources"
|
|
castor_extractor/visualization/sigma/client/client.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 from collections.abc import Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
@@ -24,6 +25,9 @@ from .pagination import (
     SIGMA_QUERIES_PAGINATION_LIMIT,
     SigmaPagination,
 )
+from .sources_transformer import SigmaSourcesTransformer
+
+logger = logging.getLogger(__name__)
 
 _CONTENT_TYPE = "application/x-www-form-urlencoded"
 
@@ -135,6 +139,12 @@ class SigmaClient(APIClient):
             params={"limit": limit},
         )
 
+    def _get_all_datamodels(self) -> Iterator[dict]:
+        request = self._get_paginated(
+            endpoint=SigmaEndpointFactory.datamodels()
+        )
+        yield from fetch_all_pages(request, SigmaPagination)
+
     def _get_all_datasets(self) -> Iterator[dict]:
         request = self._get_paginated(endpoint=SigmaEndpointFactory.datasets())
         yield from fetch_all_pages(request, SigmaPagination)
@@ -275,18 +285,36 @@ class SigmaClient(APIClient):
 
         yield from self._yield_deduplicated_queries(queries, workbook_id)
 
+    def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+        yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
+
+    def _get_all_workbook_sources(
+        self, workbooks: list[dict]
+    ) -> Iterator[dict]:
+        yield from SigmaSourcesTransformer(self).get_workbook_sources(workbooks)
+
     def fetch(
         self,
         asset: SigmaAsset,
-
+        datasets: Optional[list[dict]] = None,
         elements: Optional[list[dict]] = None,
+        workbooks: Optional[list[dict]] = None,
     ) -> Iterator[dict]:
         """Returns the needed metadata for the queried asset"""
-        if asset == SigmaAsset.
+        if asset == SigmaAsset.DATAMODELS:
+            yield from self._get_all_datamodels()
+
+        elif asset == SigmaAsset.DATASETS:
             yield from self._get_all_datasets()
 
+        elif asset == SigmaAsset.DATASET_SOURCES:
+            if datasets is None:
+                raise ValueError("Missing datasets to extract dataset sources")
+
+            yield from self._get_all_dataset_sources(datasets)
+
         elif asset == SigmaAsset.ELEMENTS:
-            if
+            if workbooks is None:
                 raise ValueError("Missing workbooks to extract elements")
 
             yield from self._get_all_elements(workbooks)
@@ -295,15 +323,16 @@ class SigmaClient(APIClient):
             yield from self._get_all_files()
 
         elif asset == SigmaAsset.LINEAGES:
-            if
+            if elements is None:
                 raise ValueError("Missing elements to extract lineage")
+
             yield from self._get_all_lineages(elements)
 
         elif asset == SigmaAsset.MEMBERS:
             yield from self._get_all_members()
 
         elif asset == SigmaAsset.QUERIES:
-            if
+            if workbooks is None:
                 raise ValueError("Missing workbooks to extract queries")
 
             yield from self._get_all_queries(workbooks)
@@ -311,5 +340,13 @@ class SigmaClient(APIClient):
         elif asset == SigmaAsset.WORKBOOKS:
             yield from self._get_all_workbooks()
 
+        elif asset == SigmaAsset.WORKBOOK_SOURCES:
+            if workbooks is None:
+                raise ValueError(
+                    "Missing workbooks to extract workbook sources"
+                )
+
+            yield from self._get_all_workbook_sources(workbooks)
+
         else:
             raise ValueError(f"This asset {asset} is unknown")
castor_extractor/visualization/sigma/client/endpoints.py
CHANGED
@@ -1,6 +1,7 @@
 class SigmaEndpointFactory:
     """Wrapper class around all endpoints we're using"""
 
+    DATAMODELS = "dataModels"
     DATASETS = "datasets"
     FILES = "files"
     MEMBERS = "members"
@@ -10,10 +11,22 @@ class SigmaEndpointFactory:
     def authentication(cls) -> str:
         return "v2/auth/token"
 
+    @classmethod
+    def connection_path(cls, inode_id: str) -> str:
+        return f"v2/connections/paths/{inode_id}"
+
+    @classmethod
+    def datamodels(cls) -> str:
+        return f"v2/{cls.DATAMODELS}"
+
     @classmethod
     def datasets(cls) -> str:
         return f"v2/{cls.DATASETS}"
 
+    @classmethod
+    def dataset_sources(cls, dataset_id: str) -> str:
+        return f"v2/{cls.DATASETS}/{dataset_id}/sources"
+
     @classmethod
     def elements(cls, workbook_id: str, page_id: str) -> str:
         return f"v2/{cls.WORKBOOKS}/{workbook_id}/pages/{page_id}/elements"
@@ -41,3 +54,7 @@ class SigmaEndpointFactory:
     @classmethod
     def workbooks(cls) -> str:
         return f"v2/{cls.WORKBOOKS}"
+
+    @classmethod
+    def workbook_sources(cls, workbook_id: str) -> str:
+        return f"v2/{cls.WORKBOOKS}/{workbook_id}/sources"
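For reference, the new factory methods only build relative paths; no request is made. Illustrative calls (the IDs are made up, and the workbook paths assume cls.WORKBOOKS == "workbooks", which this hunk does not show):

SigmaEndpointFactory.datamodels()                  # "v2/dataModels"
SigmaEndpointFactory.dataset_sources("ds-123")     # "v2/datasets/ds-123/sources"
SigmaEndpointFactory.workbook_sources("wb-456")    # "v2/workbooks/wb-456/sources" (assumed)
SigmaEndpointFactory.connection_path("inode-789")  # "v2/connections/paths/inode-789"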
castor_extractor/visualization/sigma/client/sources_transformer.py
ADDED
@@ -0,0 +1,94 @@
+import logging
+from typing import TYPE_CHECKING, Callable, Iterator
+
+from .endpoints import SigmaEndpointFactory
+
+if TYPE_CHECKING:
+    from .client import SigmaClient
+
+logger = logging.getLogger(__name__)
+
+
+class SigmaSourcesTransformer:
+    """Retrieves asset sources and enhances them with additional information."""
+
+    def __init__(self, api_client: "SigmaClient"):
+        self.api_client = api_client
+
+    def _map_table_id_to_connection_path(
+        self, all_sources: list
+    ) -> dict[str, dict]:
+        """Maps a table id to its connection and path information."""
+        logger.info("Mapping table ids to connection and path information")
+
+        unique_table_ids = {
+            source["inodeId"]
+            for asset_sources in all_sources
+            for source in asset_sources["sources"]
+            if source["type"] == "table"
+        }
+
+        return {
+            table_id: self.api_client._get(
+                endpoint=SigmaEndpointFactory.connection_path(table_id)
+            )
+            for table_id in unique_table_ids
+        }
+
+    @staticmethod
+    def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+        """
+        Combines a single table source with its connection and path information.
+        """
+        if source["type"] != "table":
+            return source
+
+        path_info = table_to_path.get(source["inodeId"], {})
+        source["connectionId"] = path_info.get("connectionId")
+        source["path"] = path_info.get("path")
+        return source
+
+    def _transform_sources(
+        self, all_sources: list, table_to_path: dict
+    ) -> Iterator[dict]:
+        """
+        Yields all sources, with table sources being enhanced with additional information.
+        """
+        logger.info("Merging sources with table information")
+
+        for asset_sources in all_sources:
+            enhanced_sources = [
+                self._enhance_table_source(source, table_to_path)
+                for source in asset_sources["sources"]
+            ]
+
+            yield {
+                "asset_id": asset_sources["asset_id"],
+                "sources": enhanced_sources,
+            }
+
+    def _get_all_sources(
+        self, endpoint: Callable[[str], str], asset_ids: set[str]
+    ) -> Iterator[dict]:
+        """Returns transformed sources for the given assets"""
+        all_sources = []
+
+        for asset_id in asset_ids:
+            sources = self.api_client._get(endpoint=endpoint(asset_id))
+            all_sources.append({"asset_id": asset_id, "sources": sources})
+
+        table_to_path = self._map_table_id_to_connection_path(all_sources)
+
+        yield from self._transform_sources(all_sources, table_to_path)
+
+    def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+        asset_ids = {dataset["datasetId"] for dataset in datasets}
+        yield from self._get_all_sources(
+            endpoint=SigmaEndpointFactory.dataset_sources, asset_ids=asset_ids
+        )
+
+    def get_workbook_sources(self, workbooks: list[dict]) -> Iterator[dict]:
+        asset_ids = {workbook["workbookId"] for workbook in workbooks}
+        yield from self._get_all_sources(
+            endpoint=SigmaEndpointFactory.workbook_sources, asset_ids=asset_ids
+        )
castor_extractor/visualization/sigma/client/sources_transformer_test.py
ADDED
@@ -0,0 +1,101 @@
+from unittest.mock import Mock
+
+from .sources_transformer import SigmaSourcesTransformer
+
+_ALL_SOURCES = [
+    {
+        "asset_id": "asset1",
+        "sources": [
+            {"type": "dataset", "inodeId": "1234"},  # non-table source
+            {"type": "table", "inodeId": "table1"},
+            {"type": "table", "inodeId": "table2"},
+        ],
+    },
+    {
+        "asset_id": "asset2",
+        "sources": [
+            {"type": "table", "inodeId": "table1"},  # repeated source
+        ],
+    },
+]
+
+
+_TABLE_TO_PATH = {
+    "table1": {
+        "connectionId": "conn1",
+        "path": ["db", "schema", "table1"],
+    },
+    "table2": {
+        "connectionId": "conn2",
+        "path": ["db", "schema", "table2"],
+    },
+}
+
+
+def test__map_table_id_to_connection_path():
+    transformer = SigmaSourcesTransformer(api_client=Mock())
+
+    def mock_get(endpoint):
+        if "table1" in endpoint:
+            return _TABLE_TO_PATH["table1"]
+        elif "table2" in endpoint:
+            return _TABLE_TO_PATH["table2"]
+        else:
+            raise ValueError(f"Unexpected endpoint: {endpoint}")
+
+    transformer.api_client._get.side_effect = mock_get
+
+    result = transformer._map_table_id_to_connection_path(_ALL_SOURCES)
+
+    assert len(result) == 2
+    assert result["table1"] == {
+        "connectionId": "conn1",
+        "path": ["db", "schema", "table1"],
+    }
+    assert result["table2"] == {
+        "connectionId": "conn2",
+        "path": ["db", "schema", "table2"],
+    }
+    assert transformer.api_client._get.call_count == 2
+
+
+def test__transform_sources():
+    transformer = SigmaSourcesTransformer(api_client=Mock())
+
+    result = list(transformer._transform_sources(_ALL_SOURCES, _TABLE_TO_PATH))
+
+    assert len(result) == 2
+
+    asset_1_results = result[0]
+    assert len(asset_1_results["sources"]) == 3
+    actual_sources = sorted(
+        asset_1_results["sources"], key=lambda x: x["inodeId"]
+    )
+    expected_sources = [
+        {"type": "dataset", "inodeId": "1234"},
+        {
+            "type": "table",
+            "inodeId": "table1",
+            "connectionId": "conn1",
+            "path": ["db", "schema", "table1"],
+        },
+        {
+            "type": "table",
+            "inodeId": "table2",
+            "connectionId": "conn2",
+            "path": ["db", "schema", "table2"],
+        },
+    ]
+    expected_sources = sorted(expected_sources, key=lambda x: x["inodeId"])
+    assert actual_sources == expected_sources
+
+    asset_2_results = result[1]
+    assert asset_2_results["asset_id"] == "asset2"
+    assert asset_2_results["sources"] == [
+        {
+            "type": "table",
+            "inodeId": "table1",
+            "connectionId": "conn1",
+            "path": ["db", "schema", "table1"],
+        }
+    ]
castor_extractor/visualization/sigma/extract.py
CHANGED
@@ -22,14 +22,30 @@ def iterate_all_data(
 ) -> Iterable[tuple[SigmaAsset, Union[list, Iterator, dict]]]:
     """Iterate over the extracted data from Sigma"""
 
+    logger.info("Extracting DATA MODELS from API")
+    datamodels = client.fetch(SigmaAsset.DATAMODELS)
+    yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+
     logger.info("Extracting DATASETS from API")
-    datasets = client.fetch(SigmaAsset.DATASETS)
+    datasets = list(client.fetch(SigmaAsset.DATASETS))
     yield SigmaAsset.DATASETS, list(deep_serialize(datasets))
 
+    logger.info("Extracting DATASET SOURCES from API")
+    dataset_sources = client.fetch(
+        SigmaAsset.DATASET_SOURCES, datasets=datasets
+    )
+    yield SigmaAsset.DATASET_SOURCES, list(deep_serialize(dataset_sources))
+
     logger.info("Extracting WORKBOOKS from API")
     workbooks = list(client.fetch(SigmaAsset.WORKBOOKS))
     yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbooks))
 
+    logger.info("Extracting WORKBOOK SOURCES from API")
+    workbook_sources = client.fetch(
+        SigmaAsset.WORKBOOK_SOURCES, workbooks=workbooks
+    )
+    yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbook_sources))
+
     logger.info("Extracting FILES from API")
     files = client.fetch(SigmaAsset.FILES)
     yield SigmaAsset.FILES, list(deep_serialize(files))
castor_extractor/warehouse/databricks/client.py
CHANGED
@@ -46,12 +46,11 @@ class DatabricksClient:
 
     @staticmethod
     def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
+        """Matches the table's owner email to an ID, or None if not found."""
         table_owner_email = table.get("owner_email")
-
-
-
-        if not owner_external_id:
-            return table
+        owner_external_id = (
+            user_mapping.get(table_owner_email) if table_owner_email else None
+        )
         return {**table, "owner_external_id": owner_external_id}
 
     @staticmethod
castor_extractor/warehouse/databricks/client_test.py
CHANGED
@@ -36,5 +36,6 @@ def test_DatabricksClient__match_table_with_user():
     assert table_with_owner == {**table, "owner_external_id": 3}
 
     table_without_owner = {"id": 1, "owner_email": None}
+    expected = {"id": 1, "owner_email": None, "owner_external_id": None}
     actual = client._match_table_with_user(table_without_owner, user_mapping)
-    assert actual ==
+    assert actual == expected
castor_extractor/warehouse/databricks/queries/column_lineage.sql
ADDED
@@ -0,0 +1,25 @@
+/*
+Selects all column lineage events for the given day.
+This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+*/
+WITH deduplicated_lineage AS (
+    SELECT *,
+        ROW_NUMBER() OVER (
+            PARTITION BY source_table_full_name, source_column_name, target_table_full_name, target_column_name
+            ORDER BY event_time DESC
+        ) AS rank
+    FROM system.access.column_lineage
+    WHERE
+        TRUE
+        AND event_date = DATE('{day}')
+        AND source_table_full_name IS NOT NULL
+        AND source_column_name IS NOT NULL
+        AND target_table_full_name IS NOT NULL
+        AND target_column_name IS NOT NULL
+        AND CONCAT(source_table_full_name, '.', source_column_name) != CONCAT(target_table_full_name, '.', target_column_name)
+)
+SELECT *
+FROM deduplicated_lineage
+WHERE rank = 1
castor_extractor/warehouse/databricks/queries/table_lineage.sql
ADDED
@@ -0,0 +1,23 @@
+/*
+Selects all table lineage events for the given day.
+This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
+
+Passing parameters is not always supported, so the query must be Python-formatted to set the date.
+*/
+WITH deduplicated_lineage AS (
+    SELECT *,
+        ROW_NUMBER() OVER (
+            PARTITION BY source_table_full_name, target_table_full_name
+            ORDER BY event_time DESC
+        ) AS rank
+    FROM system.access.table_lineage
+    WHERE
+        TRUE
+        AND event_date = DATE('{day}')
+        AND source_table_full_name IS NOT NULL
+        AND target_table_full_name IS NOT NULL
+        AND source_table_full_name != target_table_full_name
+)
+SELECT *
+FROM deduplicated_lineage
+WHERE rank = 1
castor_extractor/warehouse/databricks/sql_client.py
CHANGED
@@ -4,20 +4,25 @@ from datetime import date
 
 from databricks import sql  # type: ignore
 
+from ...utils import load_file
 from .credentials import DatabricksCredentials
 from .enums import LineageEntity, TagEntity
 from .format import TagMapping
-from .lineage import valid_lineage
 from .utils import build_path, tag_label
 
 logger = logging.getLogger(__name__)
 
 _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
 
-
-
-
-
+_LINEAGE_SQL_PATHS = {
+    LineageEntity.COLUMN: "queries/column_lineage.sql",
+    LineageEntity.TABLE: "queries/table_lineage.sql",
+}
+
+
+def _load_lineage_query(lineage_entity: LineageEntity) -> str:
+    filename = _LINEAGE_SQL_PATHS[lineage_entity]
+    return load_file(filename, __file__)
 
 
 class DatabricksSQLClient:
@@ -95,13 +100,11 @@ class DatabricksSQLClient:
         Unfortunately, passing parameters is not always supported. We have to
         format the query beforehand and pass it as plain text for execution.
         """
-
-        query =
-
-            day=day,
-        )
+        query_template = _load_lineage_query(lineage_entity)
+        query = query_template.format(day=day)
+
         result = self.execute_sql(query)
         data = []
         for row in result:
             data.append(row.asDict())
-        return
+        return data
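Lineage deduplication now happens in the SQL files above instead of the removed lineage.py shown further down; the client only formats the date into the template. A standalone illustration of that formatting step (the template string here is shortened, and load_file is the package's own helper whose exact behaviour is assumed, not shown in this diff):

from datetime import date

query_template = "SELECT * FROM system.access.table_lineage WHERE event_date = DATE('{day}')"
query = query_template.format(day=date(2025, 8, 18))
# query == "SELECT * FROM system.access.table_lineage WHERE event_date = DATE('2025-08-18')"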
castor_extractor/warehouse/sqlserver/extract.py
CHANGED
@@ -52,7 +52,9 @@ def extract_all(**kwargs) -> None:
     client = MSSQLClient(credentials=_credentials(kwargs))
 
     databases = filter_items(
-        client.get_databases(),
+        items=client.get_databases(),
+        allowed=kwargs.get("db_allowed"),
+        blocked=kwargs.get("db_blocked"),
     )
 
     query_builder = MSSQLQueryBuilder(
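This is the 0.24.40 fix: previously only the database list was passed to filter_items, so the allow/block lists were apparently never applied. A simplified illustration of the intended allowlist/blocklist semantics (a sketch, not the package's actual filter_items implementation):

from typing import Iterable, Optional


def filter_items_sketch(
    items: Iterable[str],
    allowed: Optional[set[str]] = None,
    blocked: Optional[set[str]] = None,
) -> list[str]:
    """Keep only allowlisted items (when an allowlist is given) and drop blocklisted ones."""
    kept = []
    for item in items:
        if allowed is not None and item not in allowed:
            continue
        if blocked is not None and item in blocked:
            continue
        kept.append(item)
    return kept


assert filter_items_sketch(["master", "sales"], blocked={"master"}) == ["sales"]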
castor_extractor/warehouse/sqlserver/queries/column.sql
CHANGED
@@ -91,9 +91,9 @@ columns AS (
     LEFT JOIN column_ids AS i
         ON
         (
-            c.table_name = i.table_name
-            AND c.table_schema = i.schema_name
-            AND c.column_name = i.column_name
+            c.table_name COLLATE DATABASE_DEFAULT = i.table_name COLLATE DATABASE_DEFAULT
+            AND c.table_schema COLLATE DATABASE_DEFAULT = i.schema_name COLLATE DATABASE_DEFAULT
+            AND c.column_name COLLATE DATABASE_DEFAULT = i.column_name COLLATE DATABASE_DEFAULT
         )
 )
 
castor_extractor/warehouse/sqlserver/queries/schema.sql
CHANGED
@@ -1,4 +1,9 @@
-
+/*
+Fetch database information
+
+Collation is a set of rules that defines how text data is stored and compared, and it can differ between databases.
+The "COLLATE DATABASE_DEFAULT" is to ensure that text is compared with the same collation.
+*/
 WITH ids AS (
     SELECT DISTINCT
         table_catalog,
@@ -19,4 +24,4 @@ INNER JOIN ids AS i
 LEFT JOIN {database}.sys.sysusers AS u
     ON s.principal_id = u.uid
 LEFT JOIN {database}.sys.databases AS d
-    ON i.table_catalog = d.name
+    ON i.table_catalog COLLATE DATABASE_DEFAULT = d.name COLLATE DATABASE_DEFAULT
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.24.
+Version: 0.24.40
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
@@ -215,6 +215,24 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
 
 # Changelog
 
+## 0.24.40 - 2025-08-18
+
+* SQLServer: fix database allowlist/blocklist filtering
+
+## 0.24.39 - 2025-08-18
+
+* Databricks:
+  * Fix vanishing owner ID column for tables
+  * Deduplicate lineage with SQL to reduce memory use
+
+## 0.24.38 - 2025-08-07
+
+* Uploader: Support US and EU zones
+
+## 0.24.37 - 2025-08-06
+
+* Sigma: extract data models, dataset sources and workbook sources
+
 ## 0.24.36 - 2025-08-04
 
 * Sigma:
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=tgZkN-SNTMCro37DG0nW91MaD6ZnHM9VWWZG2-7TP68,19406
 Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
 DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -29,7 +29,7 @@ castor_extractor/commands/extract_strategy.py,sha256=Q-pUymatPrBFGXobhyUPzFph0-t
 castor_extractor/commands/extract_tableau.py,sha256=LNtI29LbVk1vp4RNrn89GmdW6R_7QBYunRmkowDhbco,1982
 castor_extractor/commands/extract_thoughtspot.py,sha256=caAYJlH-vK7u5IUB6OKXxcaWfLgc7d_XqnFDWK6YNS4,639
 castor_extractor/commands/file_check.py,sha256=TJx76Ymd0QCECmq35zRJMkPE8DJtSInB28MuSXWk8Ao,2644
-castor_extractor/commands/upload.py,sha256=
+castor_extractor/commands/upload.py,sha256=sqpEF_qqCNvT_niIrM6jPhzLaFVjtYwpc2iZw540F20,1633
 castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
 castor_extractor/file_checker/column.py,sha256=6bJhcW1snYwgHKkqlS0Ak7XLHZr4YBwO46JCIlnQNKg,3086
 castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -86,13 +86,14 @@ castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZ
 castor_extractor/transformation/dbt/client_test.py,sha256=RLL7y_pLDv2QBM03qBht8yYEooeT_woRADHcb8vgBQ4,4535
 castor_extractor/transformation/dbt/credentials.py,sha256=pGq7GqFQTw9TwN1DXSHC-0yJ2H6B_wMAbHyQTLqJVh0,543
 castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
-castor_extractor/uploader/__init__.py,sha256=
-castor_extractor/uploader/constant.py,sha256=
+castor_extractor/uploader/__init__.py,sha256=xe3QHmHb35TILEhr7__nI_0t0tDolpQuujUyd84YcjI,111
+castor_extractor/uploader/constant.py,sha256=ZmQtFx9nnR0GSLZ9k41upzV3ub4FJCUIyojIEVh-qIg,956
+castor_extractor/uploader/enums.py,sha256=s5KVeBZWRDbDu-qOnrJhTSkSqzh0gxv0W1Z4cUsXfb8,109
 castor_extractor/uploader/env.py,sha256=5KiWHV-WTHfF68T_vzI-ypKAxzy9b9fnz2y4T3lH6QY,871
 castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
-castor_extractor/uploader/settings.py,sha256=
-castor_extractor/uploader/upload.py,sha256=
-castor_extractor/uploader/upload_test.py,sha256=
+castor_extractor/uploader/settings.py,sha256=sUZpg9eHemM99DMrBW8bnlMuoTmCmLCKq-D0OCuQbGA,649
+castor_extractor/uploader/upload.py,sha256=b2g9vWWjXWbt8Ms7brTc7OK_I7Z-1VSibNbppGoB2oQ,4764
+castor_extractor/uploader/upload_test.py,sha256=UgN7TnT9Chn6KVzRcAX0Tuvp7-tps3ugxGitlgb9TSY,462
 castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
 castor_extractor/utils/__init__.py,sha256=z_BdKTUyuug3I5AzCuSGrAVskfLax4_olfORIjhZw_M,1691
 castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283
@@ -237,7 +238,7 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
 castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
 castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
 castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
-castor_extractor/visualization/powerbi/client/client.py,sha256=
+castor_extractor/visualization/powerbi/client/client.py,sha256=9PRckoGdjfhOjhf5yqWTuNdivXcOC2PMgvcx-3uCh3k,8166
 castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
 castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
 castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
@@ -269,14 +270,16 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
 castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
 castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
 castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
-castor_extractor/visualization/sigma/assets.py,sha256=
+castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
 castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
-castor_extractor/visualization/sigma/client/client.py,sha256=
+castor_extractor/visualization/sigma/client/client.py,sha256=VU0BHlug3tCpGA1je0PjEy4hU4TKhCH9UUGi8LRmNy8,11422
 castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
 castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
-castor_extractor/visualization/sigma/client/endpoints.py,sha256=
+castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
 castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
-castor_extractor/visualization/sigma/
+castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=mRupzxjtjDqELIouHF0egBkgslDmn5Y4uqO_sbUGCNs,3244
+castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
+castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
 castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
 castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
 castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -334,18 +337,18 @@ castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXk
 castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
 castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
 castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
-castor_extractor/warehouse/databricks/client.py,sha256=
-castor_extractor/warehouse/databricks/client_test.py,sha256=
+castor_extractor/warehouse/databricks/client.py,sha256=LzpeVQIOYi_QTfdOHbK6SB4SgxhZ7p9TNxh0Iwfz850,3307
+castor_extractor/warehouse/databricks/client_test.py,sha256=dqEdEAt-6e8CtQ7M2L5vDYkn4JvOjqyqZSFEpQ55WRc,1432
 castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
 castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
 castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
 castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
 castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
 castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
-castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
-castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
 castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
-castor_extractor/warehouse/databricks/
+castor_extractor/warehouse/databricks/queries/column_lineage.sql,sha256=Q8MAZ5N3fNcolTMtRRw2fIrbKgV4ax9StgJgtYMpxNQ,980
+castor_extractor/warehouse/databricks/queries/table_lineage.sql,sha256=5k5jHj11SdGpfMqJEKJihAhd_ngO4kZOZJ8TCPihWDs,786
+castor_extractor/warehouse/databricks/sql_client.py,sha256=oypv_2pomoleXUJJhS8CSKO_ucalQhS9_mcsnsb5wsc,3750
 castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
 castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
 castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
@@ -419,17 +422,17 @@ castor_extractor/warehouse/snowflake/queries/view_ddl.sql,sha256=eWsci_50cxiYIv3
 castor_extractor/warehouse/snowflake/query.py,sha256=C2LTdPwBzMQ_zMncg0Kq4_WkoY7K9as5tvxBDrIOlwI,1763
 castor_extractor/warehouse/sqlserver/__init__.py,sha256=PdOuYznmvKAbfWAm8UdN47MfEsd9jqPi_dDi3WEo1KY,116
 castor_extractor/warehouse/sqlserver/client.py,sha256=Bjfpw96IKAQfWPiU5SZYEDfetwfkqZrnKbQYoStcnZc,2007
-castor_extractor/warehouse/sqlserver/extract.py,sha256
+castor_extractor/warehouse/sqlserver/extract.py,sha256=GbOlSq8JR6HaJZunkfiRxaSt0pbgazQjF8GpgqWWIcU,2294
 castor_extractor/warehouse/sqlserver/queries/.sqlfluff,sha256=yy0KQdz8I_67vnXyX8eeWwOWkxTXvHyVKSVwhURktd8,48
-castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=
+castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=eRILCgdygYRvtfSdxaswIiIYKW-PiJXW2qi3yHtrfns,2913
 castor_extractor/warehouse/sqlserver/queries/database.sql,sha256=4dPeBCn85MEOXr1f-DPXxiI3RvvoE_1n8lsbTs26E0I,150
-castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=
-castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=
+castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=Zp4G86FJ_Be8Zqvdlu7K8DqmsUL62kxbwaUk5asZ0V4,881
+castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=YwFhHc6rGbszqQt7Izh7EngVwrrBoEZ9kniuWXNtGco,2837
 castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
 castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
 castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
+castor_extractor-0.24.40.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.24.40.dist-info/METADATA,sha256=ONg1SCc3gcrOJqBE92EtyfQctf-hRxI_u2VUbBpvgVA,26859
+castor_extractor-0.24.40.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+castor_extractor-0.24.40.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+castor_extractor-0.24.40.dist-info/RECORD,,
castor_extractor/warehouse/databricks/lineage.py
REMOVED
@@ -1,69 +0,0 @@
-from typing import Iterable, Optional
-
-from .enums import LineageEntity
-
-
-class LineageProcessor:
-    """
-    helper class that handles lineage deduplication and filtering
-    """
-
-    def __init__(self, lineage_entity: LineageEntity):
-        self.lineage_entity = lineage_entity
-
-        self.lineage: dict[tuple[str, str], dict] = dict()
-
-    def _parent_path(self, link) -> Optional[str]:
-        if self.lineage_entity == LineageEntity.TABLE:
-            return link["source_table_full_name"]
-
-        source_table = link["source_table_full_name"]
-        source_column = link["source_column_name"]
-        if not (source_table and source_column):
-            return None
-
-        return f"{source_table}.{source_column}"
-
-    def _child_path(self, link) -> Optional[str]:
-        if self.lineage_entity == LineageEntity.TABLE:
-            return link["target_table_full_name"]
-
-        target_table = link["target_table_full_name"]
-        target_column = link["target_column_name"]
-        if not (target_table and target_column):
-            return None
-
-        return f"{target_table}.{target_column}"
-
-    def add(self, link: dict) -> None:
-        """
-        If the parent and child paths are valid, keeps the most recent lineage
-        link in the `self.lineage` map.
-        """
-        parent = self._parent_path(link)
-        child = self._child_path(link)
-        timestamp = link["event_time"]
-
-        if not (parent and child and parent != child):
-            return
-
-        key = (parent, child)
-        if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
-            return
-
-        self.lineage[key] = link
-
-
-def valid_lineage(
-    lineage: Iterable[dict], lineage_entity: LineageEntity
-) -> list[dict]:
-    """
-    Filters out self-lineage or lineage with a missing source or target path,
-    then deduplicates by picking the link with the most recent event timestmap.
-    """
-    deduplicated_lineage = LineageProcessor(lineage_entity)
-
-    for link in lineage:
-        deduplicated_lineage.add(link)
-
-    return list(deduplicated_lineage.lineage.values())
castor_extractor/warehouse/databricks/lineage_test.py
REMOVED
@@ -1,89 +0,0 @@
-from .enums import LineageEntity
-from .lineage import LineageProcessor, valid_lineage
-
-_OLDER_DATE = "2025-01-01 00:00:01.0"
-_CLOSER_DATE = "2025-01-01 02:02:02.0"
-
-_TABLE_LINEAGES = [
-    {
-        "source_table_full_name": "a.b.source",
-        "target_table_full_name": "a.b.target",
-        "event_time": _CLOSER_DATE,
-        "other": "more recent stuff",
-    },
-    {
-        "source_table_full_name": "a.b.source",
-        "target_table_full_name": "a.b.target",
-        "event_time": _OLDER_DATE,
-        "other": "stuff that's too old",
-    },
-    {
-        "source_table_full_name": "no target",
-        "target_table_full_name": None,
-        "event_time": _CLOSER_DATE,
-    },
-    {
-        "source_table_full_name": None,
-        "target_table_full_name": "no source",
-        "event_time": _CLOSER_DATE,
-    },
-]
-
-
-_COLUMN_LINEAGES = [
-    {
-        "source_table_full_name": "a.b.source",
-        "source_column_name": "src_col",
-        "target_table_full_name": "a.b.target",
-        "target_column_name": "trgt_col",
-        "event_time": _OLDER_DATE,
-        "other": "old stuff",
-    },
-    {
-        "source_table_full_name": "a.b.source",
-        "source_column_name": "src_col",
-        "target_table_full_name": "a.b.target",
-        "target_column_name": "trgt_col",
-        "event_time": _CLOSER_DATE,
-        "other": "newer stuff",
-    },
-    {
-        "source_table_full_name": "a.b.toto",
-        "source_column_name": "toto_col",
-        "target_table_full_name": "a.b.tata",
-        "target_column_name": "tata_col",
-        "event_time": _OLDER_DATE,
-    },
-    {
-        "source_table_full_name": "a.b.source",
-        "source_column_name": "a.b.source",
-        "target_table_full_name": None,
-        "target_column_name": None,
-        "event_time": _CLOSER_DATE,
-    },
-]
-
-
-def test_valid_lineage():
-    table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
-
-    assert len(table_links) == 1
-    assert table_links[0]["source_table_full_name"] == "a.b.source"
-    assert table_links[0]["target_table_full_name"] == "a.b.target"
-    assert table_links[0]["event_time"] == _CLOSER_DATE
-    assert table_links[0]["other"] == "more recent stuff"
-
-
-def test_LineageLinks_add():
-    deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
-    for link in _COLUMN_LINEAGES:
-        deduplicated_lineage.add(link)
-
-    lineage = deduplicated_lineage.lineage
-    assert len(lineage) == 2
-    assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
-    assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
-    assert (
-        lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
-        == "newer stuff"
-    )
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/LICENCE
File without changes
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/WHEEL
File without changes
{castor_extractor-0.24.36.dist-info → castor_extractor-0.24.40.dist-info}/entry_points.txt
File without changes