castor-extractor 0.24.35__py3-none-any.whl → 0.24.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +14 -0
- castor_extractor/commands/upload.py +10 -0
- castor_extractor/uploader/__init__.py +1 -0
- castor_extractor/uploader/constant.py +5 -1
- castor_extractor/uploader/enums.py +8 -0
- castor_extractor/uploader/settings.py +2 -0
- castor_extractor/uploader/upload.py +26 -5
- castor_extractor/uploader/upload_test.py +5 -3
- castor_extractor/visualization/sigma/assets.py +3 -0
- castor_extractor/visualization/sigma/client/client.py +66 -5
- castor_extractor/visualization/sigma/client/endpoints.py +17 -0
- castor_extractor/visualization/sigma/client/sources_transformer.py +94 -0
- castor_extractor/visualization/sigma/client/sources_transformer_test.py +101 -0
- castor_extractor/visualization/sigma/extract.py +17 -1
- {castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/METADATA +15 -1
- {castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/RECORD +19 -16
- {castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/LICENCE +0 -0
- {castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/WHEEL +0 -0
- {castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

@@ -1,5 +1,19 @@
 # Changelog

+## 0.24.38 - 2025-08-07
+
+* Uploader: Support US and EU zones
+
+## 0.24.37 - 2025-08-06
+
+* Sigma: extract data models, dataset sources and workbook sources
+
+## 0.24.36 - 2025-08-04
+
+* Sigma:
+  * Refresh token before lineage extraction
+  * Disregard 403 errors during lineage extraction
+
 ## 0.24.35 - 2025-07-29

 * Coalesce - Fix pagination issue

castor_extractor/commands/upload.py
CHANGED

@@ -3,6 +3,7 @@ from argparse import ArgumentParser

 from castor_extractor.uploader import ( # type: ignore
     FileType,
+    Zone,
     upload_any,
 )
 from castor_extractor.utils import parse_filled_arguments # type: ignore

@@ -40,6 +41,15 @@ def _args() -> ArgumentParser:
         ),
         choices=supported_file_type,
     )
+    supported_zones = [zone.value for zone in Zone]
+    parser.add_argument(
+        "-z",
+        "--zone",
+        help="geographic zone to upload, currently supported are {}, defaults to EU".format(
+            supported_zones,
+        ),
+        choices=supported_zones,
+    )
     return parser

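The new --zone flag feeds the zone field of the uploader settings. A minimal sketch of the equivalent programmatic call, assuming placeholder credentials (token, source id and file path below are illustrative only):

    from castor_extractor.uploader import FileType, Zone, upload_any

    upload_any(
        token="<api-token>",                               # placeholder
        source_id="399a8b22-3187-11ec-8d3d-0242ac130003",  # placeholder UUID
        file_type=FileType.VIZ,
        file_path="./export.csv",                          # placeholder path
        zone=Zone.US,  # omit to keep the EU default (ingest.castordoc.com)
    )
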
castor_extractor/uploader/constant.py
CHANGED

@@ -1,9 +1,13 @@
 from enum import Enum

 from ..utils import RetryStrategy
+from .enums import Zone

 # url of the gcs proxy
-
+INGEST_URLS = {
+    Zone.EU: "https://ingest.castordoc.com",
+    Zone.US: "https://ingest.us.castordoc.com",
+}

 RETRY_BASE_MS = 10_000
 RETRY_JITTER_MS = 1_000

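The companion castor_extractor/uploader/enums.py (+8 lines in the file list above) is not rendered in this diff. Judging by how Zone is used here and in settings.py, it is presumably a small enum along these lines; the member values are an assumption, only the EU and US members are confirmed:

    from enum import Enum


    class Zone(Enum):
        # assumed values; the diff only confirms that EU and US members exist
        EU = "eu"
        US = "us"
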
castor_extractor/uploader/settings.py
CHANGED

@@ -4,6 +4,7 @@ from pydantic import UUID4, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict

 from .constant import FileType
+from .enums import Zone

 UPLOADER_ENV_PREFIX = "CASTOR_UPLOADER_"

@@ -22,3 +23,4 @@ class UploaderSettings(BaseSettings):
     file_type: FileType
     source_id: UUID4
     token: str = Field(repr=False)
+    zone: Optional[Zone] = Zone.EU

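Because UploaderSettings is a pydantic BaseSettings using the CASTOR_UPLOADER_ prefix, the new zone field should also be configurable from the environment; a small sketch under that assumption (the string value for the US zone is assumed):

    import os

    # pydantic-settings maps fields through the env prefix, so this is
    # presumably equivalent to passing zone=Zone.US explicitly
    os.environ["CASTOR_UPLOADER_ZONE"] = "us"
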
castor_extractor/uploader/upload.py
CHANGED

@@ -10,13 +10,14 @@ import requests

 from ..utils.retry import retry
 from .constant import (
-
+    INGEST_URLS,
     PATH_TEMPLATES,
     RETRY_BASE_MS,
     RETRY_JITTER_MS,
     RETRY_STRATEGY,
     FileType,
 )
+from .enums import Zone
 from .env import get_blob_env
 from .settings import UploaderSettings
 from .utils import iter_files

@@ -33,6 +34,7 @@ def _path_and_url(
     source_id: UUID,
     file_type: FileType,
     file_path: str,
+    zone: Zone,
 ) -> tuple[str, str]:
     now = datetime.utcnow()
     timestamp = int(now.timestamp())

@@ -44,7 +46,7 @@ def _path_and_url(
         filename=filename,
     )

-    url = f"{
+    url = f"{INGEST_URLS[zone]}/{path}"

     return path, url

@@ -61,13 +63,16 @@ def _upload(
     source_id: UUID,
     file_path: str,
     file_type: FileType,
+    zone: Optional[Zone] = Zone.EU,
 ) -> None:
     """
     Upload the given file to Google Cloud Storage (GCS)
     - Don't call GCS API directly
     - Call the ingestion proxy which handles authorisation and uploading
     """
-
+    if not zone:
+        zone = Zone.EU
+    path, url = _path_and_url(source_id, file_type, file_path, zone)
     headers = _headers(token)
     timeout, max_retries = get_blob_env()

@@ -97,6 +102,7 @@ def _upload(
 def upload_manifest(
     token: str,
     source_id: UUID,
+    zone: Optional[Zone],
     file_path: Optional[str] = None,
 ) -> None:
     """

@@ -106,13 +112,20 @@ def upload_manifest(
     """
     if not file_path:
         raise ValueError("file path is needed to upload a manifest")
-    _upload(
+    _upload(
+        token=token,
+        source_id=source_id,
+        file_path=file_path,
+        file_type=FileType.DBT,
+        zone=zone,
+    )


 def upload(
     token: str,
     source_id: UUID,
     file_type: FileType,
+    zone: Optional[Zone],
     file_path: Optional[str] = None,
     directory_path: Optional[str] = None,
 ) -> None:

@@ -133,7 +146,13 @@ def upload(
         raise ValueError(message)

     for file_ in files:
-        _upload(
+        _upload(
+            token=token,
+            source_id=source_id,
+            file_path=file_,
+            file_type=file_type,
+            zone=zone,
+        )


 def upload_any(**kwargs) -> None:

@@ -156,6 +175,7 @@ def upload_any(**kwargs) -> None:
             token=settings.token,
             source_id=settings.source_id,
             file_path=settings.file_path,
+            zone=settings.zone,
         )
         return None

@@ -165,4 +185,5 @@ def upload_any(**kwargs) -> None:
         file_type=file_type,
         file_path=settings.file_path,
         directory_path=settings.directory_path,
+        zone=settings.zone,
     )

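A short sketch of how the zone parameter selects the ingestion endpoint when pushing a dbt manifest; token, source id and path are placeholders:

    from uuid import UUID

    from castor_extractor.uploader.enums import Zone
    from castor_extractor.uploader.upload import upload_manifest

    # with zone=Zone.US the file goes through https://ingest.us.castordoc.com,
    # with Zone.EU (or zone=None) through https://ingest.castordoc.com
    upload_manifest(
        token="<api-token>",
        source_id=UUID("399a8b22-3187-11ec-8d3d-0242ac130003"),
        zone=Zone.US,
        file_path="./target/manifest.json",
    )
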
castor_extractor/uploader/upload_test.py
CHANGED

@@ -1,6 +1,7 @@
 from uuid import UUID

-from .constant import
+from .constant import INGEST_URLS, FileType
+from .enums import Zone
 from .upload import _path_and_url

@@ -8,7 +9,8 @@ def test__path():
     source_id = UUID("399a8b22-3187-11ec-8d3d-0242ac130003")
     file_type = FileType.VIZ
     file_path = "filename"
+    zone = Zone.EU

-    path, url = _path_and_url(source_id, file_type, file_path)
+    path, url = _path_and_url(source_id, file_type, file_path, zone)
     assert path == f"visualization-{source_id}/{file_path}"
-    assert url == f"{
+    assert url == f"{INGEST_URLS[Zone.EU]}/{path}"

castor_extractor/visualization/sigma/assets.py
CHANGED

@@ -4,10 +4,13 @@ from ...types import ExternalAsset
 class SigmaAsset(ExternalAsset):
     """Sigma assets"""

+    DATAMODELS = "datamodels"
     DATASETS = "datasets"
+    DATASET_SOURCES = "dataset_sources"
     ELEMENTS = "elements"
     FILES = "files"
     LINEAGES = "lineages"
     MEMBERS = "members"
     QUERIES = "queries"
     WORKBOOKS = "workbooks"
+    WORKBOOK_SOURCES = "workbook_sources"

castor_extractor/visualization/sigma/client/client.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 from collections.abc import Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial

@@ -24,6 +25,9 @@ from .pagination import (
     SIGMA_QUERIES_PAGINATION_LIMIT,
     SigmaPagination,
 )
+from .sources_transformer import SigmaSourcesTransformer
+
+logger = logging.getLogger(__name__)

 _CONTENT_TYPE = "application/x-www-form-urlencoded"

@@ -54,6 +58,13 @@ SIGMA_SAFE_MODE = RequestSafeMode(
     max_errors=_VOLUME_IGNORED,
     status_codes=_IGNORED_ERROR_CODES,
 )
+SIGMA_SAFE_MODE_LINEAGE = RequestSafeMode(
+    max_errors=_VOLUME_IGNORED,
+    status_codes=(
+        *_IGNORED_ERROR_CODES,
+        HTTPStatus.FORBIDDEN,
+    ),
+)
 _THREADS_LINEAGE = 10 # empirically found; hit the rate limit with 20 workers
 _RETRY_NUMBER = 1
 _RETRY_BASE_MS = 60_000

@@ -128,6 +139,12 @@ class SigmaClient(APIClient):
             params={"limit": limit},
         )

+    def _get_all_datamodels(self) -> Iterator[dict]:
+        request = self._get_paginated(
+            endpoint=SigmaEndpointFactory.datamodels()
+        )
+        yield from fetch_all_pages(request, SigmaPagination)
+
     def _get_all_datasets(self) -> Iterator[dict]:
         request = self._get_paginated(endpoint=SigmaEndpointFactory.datasets())
         yield from fetch_all_pages(request, SigmaPagination)

@@ -210,18 +227,35 @@ class SigmaClient(APIClient):
         return contexts

     def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+        """
+        The safe mode is temporarily modified to include 403 errors.
+
+        Due to concurrency issues, we force a refresh of the token in hopes that
+        the lineage extraction takes less than the token expiration time of
+        1 hour.
+        """
+        safe_mode = self._safe_mode
+        self._safe_mode = SIGMA_SAFE_MODE_LINEAGE
+
         lineage_context = self._lineage_context(elements)

+        self._auth.refresh_token()
+
         with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
             results = executor.map(self._get_lineage, lineage_context)

         for lineage in results:
+            if not lineage.lineage:
+                continue
+
             yield {
                 **lineage.lineage,
                 "workbook_id": lineage.context.workbook_id,
                 "element_id": lineage.context.element_id,
             }

+        self._safe_mode = safe_mode
+
     @staticmethod
     def _yield_deduplicated_queries(
         queries: Iterable[dict], workbook_id: str

@@ -251,18 +285,36 @@ class SigmaClient(APIClient):

         yield from self._yield_deduplicated_queries(queries, workbook_id)

+    def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+        yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
+
+    def _get_all_workbook_sources(
+        self, workbooks: list[dict]
+    ) -> Iterator[dict]:
+        yield from SigmaSourcesTransformer(self).get_workbook_sources(workbooks)
+
     def fetch(
         self,
         asset: SigmaAsset,
-
+        datasets: Optional[list[dict]] = None,
         elements: Optional[list[dict]] = None,
+        workbooks: Optional[list[dict]] = None,
     ) -> Iterator[dict]:
         """Returns the needed metadata for the queried asset"""
-        if asset == SigmaAsset.
+        if asset == SigmaAsset.DATAMODELS:
+            yield from self._get_all_datamodels()
+
+        elif asset == SigmaAsset.DATASETS:
             yield from self._get_all_datasets()

+        elif asset == SigmaAsset.DATASET_SOURCES:
+            if datasets is None:
+                raise ValueError("Missing datasets to extract dataset sources")
+
+            yield from self._get_all_dataset_sources(datasets)
+
         elif asset == SigmaAsset.ELEMENTS:
-            if
+            if workbooks is None:
                 raise ValueError("Missing workbooks to extract elements")

             yield from self._get_all_elements(workbooks)

@@ -271,15 +323,16 @@ class SigmaClient(APIClient):
             yield from self._get_all_files()

         elif asset == SigmaAsset.LINEAGES:
-            if
+            if elements is None:
                 raise ValueError("Missing elements to extract lineage")
+
             yield from self._get_all_lineages(elements)

         elif asset == SigmaAsset.MEMBERS:
             yield from self._get_all_members()

         elif asset == SigmaAsset.QUERIES:
-            if
+            if workbooks is None:
                 raise ValueError("Missing workbooks to extract queries")

             yield from self._get_all_queries(workbooks)

@@ -287,5 +340,13 @@ class SigmaClient(APIClient):
         elif asset == SigmaAsset.WORKBOOKS:
             yield from self._get_all_workbooks()

+        elif asset == SigmaAsset.WORKBOOK_SOURCES:
+            if workbooks is None:
+                raise ValueError(
+                    "Missing workbooks to extract workbook sources"
+                )
+
+            yield from self._get_all_workbook_sources(workbooks)
+
         else:
             raise ValueError(f"This asset {asset} is unknown")

castor_extractor/visualization/sigma/client/endpoints.py
CHANGED

@@ -1,6 +1,7 @@
 class SigmaEndpointFactory:
     """Wrapper class around all endpoints we're using"""

+    DATAMODELS = "dataModels"
     DATASETS = "datasets"
     FILES = "files"
     MEMBERS = "members"

@@ -10,10 +11,22 @@ class SigmaEndpointFactory:
     def authentication(cls) -> str:
         return "v2/auth/token"

+    @classmethod
+    def connection_path(cls, inode_id: str) -> str:
+        return f"v2/connections/paths/{inode_id}"
+
+    @classmethod
+    def datamodels(cls) -> str:
+        return f"v2/{cls.DATAMODELS}"
+
     @classmethod
     def datasets(cls) -> str:
         return f"v2/{cls.DATASETS}"

+    @classmethod
+    def dataset_sources(cls, dataset_id: str) -> str:
+        return f"v2/{cls.DATASETS}/{dataset_id}/sources"
+
     @classmethod
     def elements(cls, workbook_id: str, page_id: str) -> str:
         return f"v2/{cls.WORKBOOKS}/{workbook_id}/pages/{page_id}/elements"

@@ -41,3 +54,7 @@ class SigmaEndpointFactory:
     @classmethod
     def workbooks(cls) -> str:
         return f"v2/{cls.WORKBOOKS}"
+
+    @classmethod
+    def workbook_sources(cls, workbook_id: str) -> str:
+        return f"v2/{cls.WORKBOOKS}/{workbook_id}/sources"

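For reference, the endpoint strings the new factory methods resolve to; the workbook_sources value assumes the existing WORKBOOKS constant is "workbooks", which is not shown in this hunk:

    from castor_extractor.visualization.sigma.client.endpoints import (
        SigmaEndpointFactory,
    )

    SigmaEndpointFactory.datamodels()                # "v2/dataModels"
    SigmaEndpointFactory.dataset_sources("ds-1")     # "v2/datasets/ds-1/sources"
    SigmaEndpointFactory.connection_path("inode-1")  # "v2/connections/paths/inode-1"
    SigmaEndpointFactory.workbook_sources("wb-1")    # presumably "v2/workbooks/wb-1/sources"
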
castor_extractor/visualization/sigma/client/sources_transformer.py
ADDED

@@ -0,0 +1,94 @@
+import logging
+from typing import TYPE_CHECKING, Callable, Iterator
+
+from .endpoints import SigmaEndpointFactory
+
+if TYPE_CHECKING:
+    from .client import SigmaClient
+
+logger = logging.getLogger(__name__)
+
+
+class SigmaSourcesTransformer:
+    """Retrieves asset sources and enhances them with additional information."""
+
+    def __init__(self, api_client: "SigmaClient"):
+        self.api_client = api_client
+
+    def _map_table_id_to_connection_path(
+        self, all_sources: list
+    ) -> dict[str, dict]:
+        """Maps a table id to its connection and path information."""
+        logger.info("Mapping table ids to connection and path information")
+
+        unique_table_ids = {
+            source["inodeId"]
+            for asset_sources in all_sources
+            for source in asset_sources["sources"]
+            if source["type"] == "table"
+        }
+
+        return {
+            table_id: self.api_client._get(
+                endpoint=SigmaEndpointFactory.connection_path(table_id)
+            )
+            for table_id in unique_table_ids
+        }
+
+    @staticmethod
+    def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+        """
+        Combines a single table source with its connection and path information.
+        """
+        if source["type"] != "table":
+            return source
+
+        path_info = table_to_path.get(source["inodeId"], {})
+        source["connectionId"] = path_info.get("connectionId")
+        source["path"] = path_info.get("path")
+        return source
+
+    def _transform_sources(
+        self, all_sources: list, table_to_path: dict
+    ) -> Iterator[dict]:
+        """
+        Yields all sources, with table sources being enhanced with additional information.
+        """
+        logger.info("Merging sources with table information")
+
+        for asset_sources in all_sources:
+            enhanced_sources = [
+                self._enhance_table_source(source, table_to_path)
+                for source in asset_sources["sources"]
+            ]
+
+            yield {
+                "asset_id": asset_sources["asset_id"],
+                "sources": enhanced_sources,
+            }
+
+    def _get_all_sources(
+        self, endpoint: Callable[[str], str], asset_ids: set[str]
+    ) -> Iterator[dict]:
+        """Returns transformed sources for the given assets"""
+        all_sources = []
+
+        for asset_id in asset_ids:
+            sources = self.api_client._get(endpoint=endpoint(asset_id))
+            all_sources.append({"asset_id": asset_id, "sources": sources})
+
+        table_to_path = self._map_table_id_to_connection_path(all_sources)
+
+        yield from self._transform_sources(all_sources, table_to_path)
+
+    def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
+        asset_ids = {dataset["datasetId"] for dataset in datasets}
+        yield from self._get_all_sources(
+            endpoint=SigmaEndpointFactory.dataset_sources, asset_ids=asset_ids
+        )
+
+    def get_workbook_sources(self, workbooks: list[dict]) -> Iterator[dict]:
+        asset_ids = {workbook["workbookId"] for workbook in workbooks}
+        yield from self._get_all_sources(
+            endpoint=SigmaEndpointFactory.workbook_sources, asset_ids=asset_ids
+        )

castor_extractor/visualization/sigma/client/sources_transformer_test.py
ADDED

@@ -0,0 +1,101 @@
+from unittest.mock import Mock
+
+from .sources_transformer import SigmaSourcesTransformer
+
+_ALL_SOURCES = [
+    {
+        "asset_id": "asset1",
+        "sources": [
+            {"type": "dataset", "inodeId": "1234"},  # non-table source
+            {"type": "table", "inodeId": "table1"},
+            {"type": "table", "inodeId": "table2"},
+        ],
+    },
+    {
+        "asset_id": "asset2",
+        "sources": [
+            {"type": "table", "inodeId": "table1"},  # repeated source
+        ],
+    },
+]
+
+
+_TABLE_TO_PATH = {
+    "table1": {
+        "connectionId": "conn1",
+        "path": ["db", "schema", "table1"],
+    },
+    "table2": {
+        "connectionId": "conn2",
+        "path": ["db", "schema", "table2"],
+    },
+}
+
+
+def test__map_table_id_to_connection_path():
+    transformer = SigmaSourcesTransformer(api_client=Mock())
+
+    def mock_get(endpoint):
+        if "table1" in endpoint:
+            return _TABLE_TO_PATH["table1"]
+        elif "table2" in endpoint:
+            return _TABLE_TO_PATH["table2"]
+        else:
+            raise ValueError(f"Unexpected endpoint: {endpoint}")
+
+    transformer.api_client._get.side_effect = mock_get
+
+    result = transformer._map_table_id_to_connection_path(_ALL_SOURCES)
+
+    assert len(result) == 2
+    assert result["table1"] == {
+        "connectionId": "conn1",
+        "path": ["db", "schema", "table1"],
+    }
+    assert result["table2"] == {
+        "connectionId": "conn2",
+        "path": ["db", "schema", "table2"],
+    }
+    assert transformer.api_client._get.call_count == 2
+
+
+def test__transform_sources():
+    transformer = SigmaSourcesTransformer(api_client=Mock())
+
+    result = list(transformer._transform_sources(_ALL_SOURCES, _TABLE_TO_PATH))
+
+    assert len(result) == 2
+
+    asset_1_results = result[0]
+    assert len(asset_1_results["sources"]) == 3
+    actual_sources = sorted(
+        asset_1_results["sources"], key=lambda x: x["inodeId"]
+    )
+    expected_sources = [
+        {"type": "dataset", "inodeId": "1234"},
+        {
+            "type": "table",
+            "inodeId": "table1",
+            "connectionId": "conn1",
+            "path": ["db", "schema", "table1"],
+        },
+        {
+            "type": "table",
+            "inodeId": "table2",
+            "connectionId": "conn2",
+            "path": ["db", "schema", "table2"],
+        },
+    ]
+    expected_sources = sorted(expected_sources, key=lambda x: x["inodeId"])
+    assert actual_sources == expected_sources
+
+    asset_2_results = result[1]
+    assert asset_2_results["asset_id"] == "asset2"
+    assert asset_2_results["sources"] == [
+        {
+            "type": "table",
+            "inodeId": "table1",
+            "connectionId": "conn1",
+            "path": ["db", "schema", "table1"],
+        }
+    ]

castor_extractor/visualization/sigma/extract.py
CHANGED

@@ -22,14 +22,30 @@ def iterate_all_data(
 ) -> Iterable[tuple[SigmaAsset, Union[list, Iterator, dict]]]:
     """Iterate over the extracted data from Sigma"""

+    logger.info("Extracting DATA MODELS from API")
+    datamodels = client.fetch(SigmaAsset.DATAMODELS)
+    yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+
     logger.info("Extracting DATASETS from API")
-    datasets = client.fetch(SigmaAsset.DATASETS)
+    datasets = list(client.fetch(SigmaAsset.DATASETS))
     yield SigmaAsset.DATASETS, list(deep_serialize(datasets))

+    logger.info("Extracting DATASET SOURCES from API")
+    dataset_sources = client.fetch(
+        SigmaAsset.DATASET_SOURCES, datasets=datasets
+    )
+    yield SigmaAsset.DATASET_SOURCES, list(deep_serialize(dataset_sources))
+
     logger.info("Extracting WORKBOOKS from API")
     workbooks = list(client.fetch(SigmaAsset.WORKBOOKS))
     yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbooks))

+    logger.info("Extracting WORKBOOK SOURCES from API")
+    workbook_sources = client.fetch(
+        SigmaAsset.WORKBOOK_SOURCES, workbooks=workbooks
+    )
+    yield SigmaAsset.WORKBOOKS, list(deep_serialize(workbook_sources))
+
     logger.info("Extracting FILES from API")
     files = client.fetch(SigmaAsset.FILES)
     yield SigmaAsset.FILES, list(deep_serialize(files))

{castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.24.35
+Version: 0.24.38
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA

@@ -215,6 +215,20 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp

 # Changelog

+## 0.24.38 - 2025-08-07
+
+* Uploader: Support US and EU zones
+
+## 0.24.37 - 2025-08-06
+
+* Sigma: extract data models, dataset sources and workbook sources
+
+## 0.24.36 - 2025-08-04
+
+* Sigma:
+  * Refresh token before lineage extraction
+  * Disregard 403 errors during lineage extraction
+
 ## 0.24.35 - 2025-07-29

 * Coalesce - Fix pagination issue

{castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=cdsC0cY-q3t1K8a-kXhK3OY6y-yrF8uICKb8OqJ3SJo,19185
 Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
 DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254

@@ -29,7 +29,7 @@ castor_extractor/commands/extract_strategy.py,sha256=Q-pUymatPrBFGXobhyUPzFph0-t
 castor_extractor/commands/extract_tableau.py,sha256=LNtI29LbVk1vp4RNrn89GmdW6R_7QBYunRmkowDhbco,1982
 castor_extractor/commands/extract_thoughtspot.py,sha256=caAYJlH-vK7u5IUB6OKXxcaWfLgc7d_XqnFDWK6YNS4,639
 castor_extractor/commands/file_check.py,sha256=TJx76Ymd0QCECmq35zRJMkPE8DJtSInB28MuSXWk8Ao,2644
-castor_extractor/commands/upload.py,sha256=
+castor_extractor/commands/upload.py,sha256=sqpEF_qqCNvT_niIrM6jPhzLaFVjtYwpc2iZw540F20,1633
 castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
 castor_extractor/file_checker/column.py,sha256=6bJhcW1snYwgHKkqlS0Ak7XLHZr4YBwO46JCIlnQNKg,3086
 castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935

@@ -86,13 +86,14 @@ castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZ
 castor_extractor/transformation/dbt/client_test.py,sha256=RLL7y_pLDv2QBM03qBht8yYEooeT_woRADHcb8vgBQ4,4535
 castor_extractor/transformation/dbt/credentials.py,sha256=pGq7GqFQTw9TwN1DXSHC-0yJ2H6B_wMAbHyQTLqJVh0,543
 castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
-castor_extractor/uploader/__init__.py,sha256=
-castor_extractor/uploader/constant.py,sha256=
+castor_extractor/uploader/__init__.py,sha256=xe3QHmHb35TILEhr7__nI_0t0tDolpQuujUyd84YcjI,111
+castor_extractor/uploader/constant.py,sha256=ZmQtFx9nnR0GSLZ9k41upzV3ub4FJCUIyojIEVh-qIg,956
+castor_extractor/uploader/enums.py,sha256=s5KVeBZWRDbDu-qOnrJhTSkSqzh0gxv0W1Z4cUsXfb8,109
 castor_extractor/uploader/env.py,sha256=5KiWHV-WTHfF68T_vzI-ypKAxzy9b9fnz2y4T3lH6QY,871
 castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
-castor_extractor/uploader/settings.py,sha256=
-castor_extractor/uploader/upload.py,sha256=
-castor_extractor/uploader/upload_test.py,sha256=
+castor_extractor/uploader/settings.py,sha256=sUZpg9eHemM99DMrBW8bnlMuoTmCmLCKq-D0OCuQbGA,649
+castor_extractor/uploader/upload.py,sha256=b2g9vWWjXWbt8Ms7brTc7OK_I7Z-1VSibNbppGoB2oQ,4764
+castor_extractor/uploader/upload_test.py,sha256=UgN7TnT9Chn6KVzRcAX0Tuvp7-tps3ugxGitlgb9TSY,462
 castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
 castor_extractor/utils/__init__.py,sha256=z_BdKTUyuug3I5AzCuSGrAVskfLax4_olfORIjhZw_M,1691
 castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283

@@ -269,14 +270,16 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
 castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
 castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
 castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
-castor_extractor/visualization/sigma/assets.py,sha256=
+castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
 castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
-castor_extractor/visualization/sigma/client/client.py,sha256=
+castor_extractor/visualization/sigma/client/client.py,sha256=VU0BHlug3tCpGA1je0PjEy4hU4TKhCH9UUGi8LRmNy8,11422
 castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
 castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
-castor_extractor/visualization/sigma/client/endpoints.py,sha256=
+castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
 castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
-castor_extractor/visualization/sigma/
+castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=mRupzxjtjDqELIouHF0egBkgslDmn5Y4uqO_sbUGCNs,3244
+castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
+castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
 castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
 castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
 castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80

@@ -428,8 +431,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=4RgeSkHDWTWRyU2iLx
 castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
 castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
 castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
+castor_extractor-0.24.38.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.24.38.dist-info/METADATA,sha256=iCWUVbgDFS721szJ8kUGMA58Va3Roq3WmyGinZgnHMw,26638
+castor_extractor-0.24.38.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+castor_extractor-0.24.38.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+castor_extractor-0.24.38.dist-info/RECORD,,

{castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/LICENCE
File without changes

{castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/WHEEL
File without changes

{castor_extractor-0.24.35.dist-info → castor_extractor-0.24.38.dist-info}/entry_points.txt
File without changes