castor-extractor 0.24.55__py3-none-any.whl → 0.24.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
  # Changelog
 
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8

castor_extractor/visualization/sigma/assets.py CHANGED
@@ -5,11 +5,11 @@ class SigmaAsset(ExternalAsset):
      """Sigma assets"""
 
      DATAMODELS = "datamodels"
+     DATAMODEL_SOURCES = "datamodel_sources"
      DATASETS = "datasets"
      DATASET_SOURCES = "dataset_sources"
      ELEMENTS = "elements"
      FILES = "files"
-     LINEAGES = "lineages"
      MEMBERS = "members"
      QUERIES = "queries"
      WORKBOOKS = "workbooks"

castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,17 +1,13 @@
  import logging
  from collections.abc import Iterator
- from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from http import HTTPStatus
  from typing import Callable, Iterable, Optional
 
- from pydantic import BaseModel
-
  from ....utils import (
      APIClient,
      RequestSafeMode,
      fetch_all_pages,
-     retry,
  )
  from ..assets import SigmaAsset
  from .authentication import SigmaBearerAuth
@@ -55,38 +51,12 @@ SIGMA_SAFE_MODE = RequestSafeMode(
      max_errors=_VOLUME_IGNORED,
      status_codes=_IGNORED_ERROR_CODES,
  )
- SIGMA_SAFE_MODE_LINEAGE = RequestSafeMode(
-     max_errors=_VOLUME_IGNORED,
-     status_codes=(
-         *_IGNORED_ERROR_CODES,
-         HTTPStatus.FORBIDDEN,
-     ),
- )
- _THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
  _RETRY_NUMBER = 1
  _RETRY_BASE_MS = 60_000
 
 
- class LineageContext(BaseModel):
-     """all info needed to build the endpoint for lineage retrieval"""
-
-     workbook_id: str
-     element_id: str
-
-
- class Lineage(BaseModel):
-     """holds response from lineage API and context used to retrieve it"""
-
-     lineage: dict
-     context: LineageContext
-
-
  class SigmaClient(APIClient):
-     def __init__(
-         self,
-         credentials: SigmaCredentials,
-         safe_mode: Optional[RequestSafeMode] = None,
-     ):
+     def __init__(self, credentials: SigmaCredentials):
          auth = SigmaBearerAuth(
              host=credentials.host,
              token_payload=credentials.token_payload,
@@ -96,7 +66,7 @@ class SigmaClient(APIClient):
              auth=auth,
              headers=_SIGMA_HEADERS,
             timeout=_SIGMA_TIMEOUT_S,
-             safe_mode=safe_mode or SIGMA_SAFE_MODE,
+             safe_mode=SIGMA_SAFE_MODE,
          )
 
      def _get_paginated(
@@ -175,68 +145,6 @@ class SigmaClient(APIClient):
              page=page, workbook_id=workbook_id
          )
 
-     @retry(
-         (ConnectionError,),
-         max_retries=_RETRY_NUMBER,
-         base_ms=_RETRY_BASE_MS,
-         log_exc_info=True,
-     )
-     def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
-         """
-         return the lineage from API and other ids needed to characterize
-         lineage in castor
-         """
-         workbook_id = lineage_context.workbook_id
-         element_id = lineage_context.element_id
-         endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
-         return Lineage(lineage=self._get(endpoint), context=lineage_context)
-
-     @staticmethod
-     def _lineage_context(elements: list[dict]) -> list[LineageContext]:
-         """
-         Helper function to prepare context for lineage retrieval.
-         Elements without associated columns are skipped.
-         """
-         contexts: list[LineageContext] = []
-         for element in elements:
-             if element.get("columns") is None:
-                 continue
-
-             context = LineageContext(
-                 workbook_id=element["workbook_id"],
-                 element_id=element["elementId"],
-             )
-             contexts.append(context)
-         return contexts
-
-     def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
-         """
-         The safe mode is temporarily modified to include 403 errors.
-
-         Due to concurrency issues, we force a refresh of the token in hopes that
-         the lineage extraction takes less than the token expiration time of
-         1 hour.
-         """
-         safe_mode = self._safe_mode
-         self._safe_mode = SIGMA_SAFE_MODE_LINEAGE
-
-         lineage_context = self._lineage_context(elements)
-
-         with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
-             results = executor.map(self._get_lineage, lineage_context)
-
-         for lineage in results:
-             if not lineage.lineage:
-                 continue
-
-             yield {
-                 **lineage.lineage,
-                 "workbook_id": lineage.context.workbook_id,
-                 "element_id": lineage.context.element_id,
-             }
-
-         self._safe_mode = safe_mode
-
      @staticmethod
      def _yield_deduplicated_queries(
          queries: Iterable[dict], workbook_id: str
@@ -266,6 +174,13 @@ class SigmaClient(APIClient):
 
          yield from self._yield_deduplicated_queries(queries, workbook_id)
 
+     def _get_all_datamodel_sources(
+         self, datamodels: list[dict]
+     ) -> Iterator[dict]:
+         yield from SigmaSourcesTransformer(
+             self, table_id_key="tableId"
+         ).get_datamodel_sources(datamodels)
+
      def _get_all_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          yield from SigmaSourcesTransformer(self).get_dataset_sources(datasets)
 
@@ -277,14 +192,22 @@ class SigmaClient(APIClient):
      def fetch(
          self,
          asset: SigmaAsset,
+         datamodels: Optional[list[dict]] = None,
          datasets: Optional[list[dict]] = None,
-         elements: Optional[list[dict]] = None,
          workbooks: Optional[list[dict]] = None,
      ) -> Iterator[dict]:
          """Returns the needed metadata for the queried asset"""
          if asset == SigmaAsset.DATAMODELS:
              yield from self._get_all_datamodels()
 
+         elif asset == SigmaAsset.DATAMODEL_SOURCES:
+             if datamodels is None:
+                 raise ValueError(
+                     "Missing data models to extract data model sources"
+                 )
+
+             yield from self._get_all_datamodel_sources(datamodels)
+
          elif asset == SigmaAsset.DATASETS:
              yield from self._get_all_datasets()
 
@@ -303,12 +226,6 @@ class SigmaClient(APIClient):
          elif asset == SigmaAsset.FILES:
              yield from self._get_all_files()
 
-         elif asset == SigmaAsset.LINEAGES:
-             if elements is None:
-                 raise ValueError("Missing elements to extract lineage")
-
-             yield from self._get_all_lineages(elements)
-
          elif asset == SigmaAsset.MEMBERS:
              yield from self._get_all_members()
 
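
As orientation for the reworked `fetch` signature above: data models must be fetched and materialised before their sources can be requested, otherwise `fetch` raises `ValueError`. A minimal, hypothetical driver (assuming `SigmaClient` and `SigmaAsset` are imported from the sigma modules shown in this diff):

```python
def dump_datamodel_sources(client: "SigmaClient") -> list[dict]:
    """Hypothetical helper: list data models first, then fetch their sources."""
    # fetch() raises ValueError for DATAMODEL_SOURCES when `datamodels` is missing,
    # so the data models are materialised before the second call.
    datamodels = list(client.fetch(SigmaAsset.DATAMODELS))
    return list(
        client.fetch(SigmaAsset.DATAMODEL_SOURCES, datamodels=datamodels)
    )
```

This mirrors the ordering used in `extract.py` further down in the diff.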

castor_extractor/visualization/sigma/client/endpoints.py CHANGED
@@ -19,6 +19,10 @@ class SigmaEndpointFactory:
      def datamodels(cls) -> str:
          return f"v2/{cls.DATAMODELS}"
 
+     @classmethod
+     def datamodel_sources(cls, datamodel_id: str) -> str:
+         return f"v2/{cls.DATAMODELS}/{datamodel_id}/sources"
+
      @classmethod
      def datasets(cls) -> str:
          return f"v2/{cls.DATASETS}"
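
For illustration, the path produced by the new `datamodel_sources` classmethod, assuming `SigmaEndpointFactory.DATAMODELS` holds the literal `"datamodels"` (as the existing `datamodels()` endpoint suggests); the id is a placeholder:

```python
# Sketch only: shows the URL shape built by the new factory method.
endpoint = SigmaEndpointFactory.datamodel_sources("a1b2c3")
assert endpoint == "v2/datamodels/a1b2c3/sources"
```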

castor_extractor/visualization/sigma/client/pagination.py CHANGED
@@ -10,7 +10,7 @@ SIGMA_QUERIES_PAGINATION_LIMIT = 50
 
 
  class SigmaPagination(PaginationModel):
-     next_page: Optional[str] = "0"
+     next_page: Optional[str] = None
      entries: list = Field(default_factory=list)
 
      model_config = ConfigDict(
@@ -27,3 +27,23 @@ class SigmaPagination(PaginationModel):
 
      def page_results(self) -> list:
          return self.entries
+
+
+ class SigmaTokenPagination(PaginationModel):
+     next_page_token: Optional[str] = ""  # noqa: S105
+     entries: list = Field(default_factory=list)
+
+     model_config = ConfigDict(
+         alias_generator=to_camel,
+         populate_by_name=True,
+         from_attributes=True,
+     )
+
+     def is_last(self) -> bool:
+         return not self.next_page_token
+
+     def next_page_payload(self) -> dict:
+         return {"pageToken": self.next_page_token}
+
+     def page_results(self) -> list:
+         return self.entries
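
The `SigmaPagination` change (a default `next_page` of `None` instead of `"0"`) and the new token-based `SigmaTokenPagination` correspond to the changelog's "fix pagination" entry. The actual paging loop lives in the shared `fetch_all_pages` utility, whose internals are not part of this diff; the sketch below only illustrates the contract the new model encodes, with `call` standing in for a single paginated GET request:

```python
from typing import Callable, Iterator


def iterate_token_pages(call: Callable[[dict], dict]) -> Iterator[dict]:
    """Sketch of token pagination as encoded by SigmaTokenPagination:
    keep requesting with {"pageToken": <nextPageToken>} until the token is empty."""
    payload: dict = {}
    while True:
        # camelCase keys (nextPageToken, entries) are accepted via the to_camel alias
        page = SigmaTokenPagination(**call(payload))
        yield from page.page_results()
        if page.is_last():  # empty or missing nextPageToken ends the loop
            return
        payload = page.next_page_payload()
```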

castor_extractor/visualization/sigma/client/sources_transformer.py CHANGED
@@ -2,8 +2,9 @@ import logging
  from http import HTTPStatus
  from typing import TYPE_CHECKING, Callable, Iterator
 
- from ....utils import retry_request
+ from ....utils import fetch_all_pages, retry_request
  from .endpoints import SigmaEndpointFactory
+ from .pagination import SigmaTokenPagination
 
  if TYPE_CHECKING:
      from .client import SigmaClient
@@ -17,8 +18,11 @@ SIGMA_CONNECTION_PATH_SLEEP_MS = 30_000  # 30 seconds
  class SigmaSourcesTransformer:
      """Retrieves asset sources and enhances them with additional information."""
 
-     def __init__(self, api_client: "SigmaClient"):
+     def __init__(
+         self, api_client: "SigmaClient", table_id_key: str = "inodeId"
+     ):
          self.api_client = api_client
+         self.table_id_key = table_id_key
 
      @retry_request(
          status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
@@ -38,9 +42,9 @@ class SigmaSourcesTransformer:
          logger.info("Mapping table ids to connection and path information")
 
          unique_table_ids = {
-             source["inodeId"]
+             source[self.table_id_key]
              for asset_sources in all_sources
-             for source in asset_sources["sources"]
+             for source in asset_sources.get("sources", [])
              if source["type"] == "table"
          }
 
@@ -49,15 +53,14 @@ class SigmaSourcesTransformer:
              for table_id in unique_table_ids
          }
 
-     @staticmethod
-     def _enhance_table_source(source: dict, table_to_path: dict) -> dict:
+     def _enhance_table_source(self, source: dict, table_to_path: dict) -> dict:
          """
         Combines a single table source with its connection and path information.
          """
          if source["type"] != "table":
              return source
 
-         path_info = table_to_path.get(source["inodeId"], {})
+         path_info = table_to_path.get(source[self.table_id_key], {})
          source["connectionId"] = path_info.get("connectionId")
          source["path"] = path_info.get("path")
          return source
@@ -82,19 +85,35 @@ class SigmaSourcesTransformer:
          }
 
      def _get_all_sources(
-         self, endpoint: Callable[[str], str], asset_ids: set[str]
+         self,
+         endpoint: Callable[[str], str],
+         asset_ids: set[str],
+         with_pagination: bool = False,
      ) -> Iterator[dict]:
          """Returns transformed sources for the given assets"""
          all_sources = []
 
          for asset_id in asset_ids:
-             sources = self.api_client._get(endpoint=endpoint(asset_id))
+             endpoint_url = endpoint(asset_id)
+             if with_pagination:
+                 request = self.api_client._get_paginated(endpoint=endpoint_url)
+                 sources = list(fetch_all_pages(request, SigmaTokenPagination))
+             else:
+                 sources = self.api_client._get(endpoint=endpoint_url)
              all_sources.append({"asset_id": asset_id, "sources": sources})
 
          table_to_path = self._map_table_id_to_connection_path(all_sources)
 
          yield from self._transform_sources(all_sources, table_to_path)
 
+     def get_datamodel_sources(self, datamodels: list[dict]) -> Iterator[dict]:
+         asset_ids = {datamodel["dataModelId"] for datamodel in datamodels}
+         yield from self._get_all_sources(
+             endpoint=SigmaEndpointFactory.datamodel_sources,
+             asset_ids=asset_ids,
+             with_pagination=True,
+         )
+
      def get_dataset_sources(self, datasets: list[dict]) -> Iterator[dict]:
          asset_ids = {dataset["datasetId"] for dataset in datasets}
          yield from self._get_all_sources(

castor_extractor/visualization/sigma/extract.py CHANGED
@@ -23,8 +23,14 @@ def iterate_all_data(
      """Iterate over the extracted data from Sigma"""
 
      logger.info("Extracting DATA MODELS from API")
-     datamodels = client.fetch(SigmaAsset.DATAMODELS)
-     yield SigmaAsset.DATASETS, list(deep_serialize(datamodels))
+     datamodels = list(client.fetch(SigmaAsset.DATAMODELS))
+     yield SigmaAsset.DATASETS, deep_serialize(datamodels)
+
+     logger.info("Extracting DATAMODEL SOURCES from API")
+     datamodel_sources = client.fetch(
+         SigmaAsset.DATAMODEL_SOURCES, datamodels=datamodels
+     )
+     yield SigmaAsset.DATAMODEL_SOURCES, list(deep_serialize(datamodel_sources))
 
      logger.info("Extracting DATASETS from API")
      datasets = list(client.fetch(SigmaAsset.DATASETS))
@@ -62,10 +68,6 @@
      elements = list(client.fetch(SigmaAsset.ELEMENTS, workbooks=workbooks))
      yield SigmaAsset.ELEMENTS, list(deep_serialize(elements))
 
-     logging.info("Extracting LINEAGES data from API")
-     lineages = client.fetch(SigmaAsset.LINEAGES, elements=elements)
-     yield SigmaAsset.LINEAGES, list(deep_serialize(lineages))
-
 
  def extract_all(**kwargs) -> None:
      """

METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.24.55
+ Version: 0.24.57
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -215,6 +215,17 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
 
  # Changelog
 
+ ## 0.24.57 - 2025-09-24
+
+ * Sigma:
+   * fix pagination
+   * remove redundant element lineages endpoint
+   * extract data model sources
+
+ ## 0.24.56 - 2025-09-24
+
+ * bump dependencies
+
  ## 0.24.55 - 2025-09-19
 
  * Fix encoding in LocalStorage - force to utf-8

RECORD CHANGED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=y8BAidkUDrMoQLEfu3LJLiqxoEUzI5hJZs4CUN_e1H0,20711
+ CHANGELOG.md,sha256=-WezbaTjM4tDXii_RVXSYDz39xuZYqWUsabdyqoh2Kc,20889
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -270,17 +270,17 @@ castor_extractor/visualization/salesforce_reporting/client/rest.py,sha256=AqL1DT
  castor_extractor/visualization/salesforce_reporting/client/soql.py,sha256=ytZnX6zE-NoS_Kz12KghMcCM4ukPwhMj6U0rQZ_8Isk,1621
  castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLGf4HDTFqhVTQAvv_OrKxc8waycfBKdsVAc,1359
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
- castor_extractor/visualization/sigma/assets.py,sha256=uKGKDaeY1ejc7XGh4eFaNp2ygG7hgca132xsX4eCwKQ,380
+ castor_extractor/visualization/sigma/assets.py,sha256=iVZqi7XtNgSOVXy0jgeHZonVOeXi7jyikor8ztbECBc,398
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
  castor_extractor/visualization/sigma/client/authentication.py,sha256=gHukrpfboIjZc_O9CcuDtrl6U-StH0J73VY2J74Bm9o,2279
- castor_extractor/visualization/sigma/client/client.py,sha256=De0xWJfUssfrwzyMNh8D2IIouUQzcS0qLUQrUYtjVkY,10827
+ castor_extractor/visualization/sigma/client/client.py,sha256=uUEZoTa1WU5bJEjOrgzWqSiJMKgbru5HPBEPazyu1Hc,8272
  castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44jnrJ-0_A5Y6ZGyDkMf9Ml3eEP8dNkY,581
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
- castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
- castor_extractor/visualization/sigma/client/pagination.py,sha256=1yLpCNps5FnDiPcXCcgHu23cxg15Gfc6FvE3AJleb2c,728
- castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=n-5mZWSvzfTwpM5VP_bwlcxcaAwCKEEbpMCG_1KRVP4,3748
+ castor_extractor/visualization/sigma/client/endpoints.py,sha256=by9VIFml2whlzQT66f2m56RYBsqPrWdAmIP4JkTaBV4,1799
+ castor_extractor/visualization/sigma/client/pagination.py,sha256=9kCYQpO7hAH2qvYmnVjnGVUDLkpkEM6BgYlv-JTY8AE,1241
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=2f7REl70wYitopftMtYQU-E8kISVck67i7rGYgf3tkk,4552
  castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
- castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
+ castor_extractor/visualization/sigma/extract.py,sha256=iRmRUzSnq_ObG9fxpOI5Rs07EKKT-VRLcyiti5-8D4c,2986
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
  castor_extractor/visualization/strategy/assets.py,sha256=yFXF_dX01patC0HQ1eU7Jo_4DZ4m6IJEg0uCB71tMoI,480
  castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJkDfX-RSJn3HF2ceQ0Yx1PLCfB3BBo,80
@@ -434,8 +434,8 @@ castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=MAlnTis43E3Amu1e1Oz
  castor_extractor/warehouse/sqlserver/queries/view_ddl.sql,sha256=9rynvx6MWg3iZzrWPB7haZfVKEPkxulzryE2g19x804,315
  castor_extractor/warehouse/sqlserver/query.py,sha256=c8f7_SEMR17DhbtzuYphWqWDQ0sCRy-nR442RRBZVYw,1773
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.24.55.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.24.55.dist-info/METADATA,sha256=MhFCdByqa4_T7A4-Mb96-ISq07W6BP7M-RHgjSfI8iY,28172
- castor_extractor-0.24.55.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.24.55.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
- castor_extractor-0.24.55.dist-info/RECORD,,
+ castor_extractor-0.24.57.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.24.57.dist-info/METADATA,sha256=uSN01JxGlu1gIF4bpBnZtHM3tLQKfU9qT0uimCqtrjI,28350
+ castor_extractor-0.24.57.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.24.57.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+ castor_extractor-0.24.57.dist-info/RECORD,,