castor-extractor 0.24.34__py3-none-any.whl → 0.24.35__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.24.35 - 2025-07-29
4
+
5
+ * Coalesce - Fix pagination issue
6
+
3
7
  ## 0.24.34 - 2025-07-02
4
8
 
5
9
  * SQLServer: multiple databases
@@ -1,31 +1,47 @@
1
1
  import logging
2
+ from functools import partial
2
3
  from http import HTTPStatus
3
- from typing import Iterator, Optional
4
+ from typing import Callable, Optional
4
5
 
5
- from requests import ConnectionError
6
+ from pydantic import ValidationError
6
7
 
7
8
  from ....utils import (
8
9
  APIClient,
9
10
  BearerAuth,
10
11
  RequestSafeMode,
11
12
  SerializedAsset,
13
+ fetch_all_pages,
12
14
  )
13
15
  from ..assets import CoalesceAsset, CoalesceQualityAsset
14
16
  from .credentials import CoalesceCredentials
15
17
  from .endpoint import (
16
18
  CoalesceEndpointFactory,
17
19
  )
18
- from .type import NodeIDToNamesMapping
19
- from .utils import column_names_per_node, is_test, test_names_per_node
20
+ from .pagination import CoalescePagination
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ COALESCE_PAGE_SIZE = 300
26
+ COALESCE_PAGE_SIZE_RUN_RESULTS = 1_000
27
+
28
+ COALESCE_TIMEOUT_SECONDS = 90
20
29
 
21
- _LIMIT_MAX = 1_000
22
30
  _MAX_ERRORS = 200
23
31
 
24
- logger = logging.getLogger(__name__)
32
+ COALESCE_SAFE_MODE = RequestSafeMode(
33
+ status_codes=(HTTPStatus.INTERNAL_SERVER_ERROR,),
34
+ max_errors=_MAX_ERRORS,
35
+ )
25
36
 
26
37
 
27
- def _run_result_payload(result: dict, query_result: dict) -> dict:
38
+ def _run_result_payload(
39
+ environment_id: str,
40
+ result: dict,
41
+ query_result: dict,
42
+ ) -> dict:
28
43
  return {
44
+ "environment_id": environment_id,
29
45
  "node_id": result["nodeID"],
30
46
  "node_name": result["name"],
31
47
  "test_name": query_result["name"],
@@ -37,13 +53,6 @@ def _run_result_payload(result: dict, query_result: dict) -> dict:
37
53
  }
38
54
 
39
55
 
40
- COALESCE_SAFE_MODE = RequestSafeMode(
41
- status_codes=(HTTPStatus.INTERNAL_SERVER_ERROR,),
42
- max_errors=_MAX_ERRORS,
43
- )
44
- COALESCE_TIMEOUT_SECONDS = 90
45
-
46
-
47
56
  class CoalesceBearerAuth(BearerAuth):
48
57
  """Bearer Authentication for Coalesce"""
49
58
 
@@ -69,65 +78,74 @@ class CoalesceClient(APIClient):
69
78
  timeout=COALESCE_TIMEOUT_SECONDS,
70
79
  )
71
80
 
72
- def _fetch_environments(self) -> Iterator[dict]:
73
- endpoint = CoalesceEndpointFactory.environments()
74
- result = self._get(endpoint=endpoint)
75
- return result["data"]
76
-
77
- def _node_details(self, environment_id: int, node_id: str) -> dict:
78
- endpoint = CoalesceEndpointFactory.nodes(
79
- environment_id=environment_id, node_id=node_id
81
+ def _get_paginated(
82
+ self,
83
+ endpoint: str,
84
+ limit: int = COALESCE_PAGE_SIZE,
85
+ params: Optional[dict] = None,
86
+ ) -> Callable:
87
+ return partial(
88
+ self._get,
89
+ retry_on_timeout=False, # explained in the docstring
90
+ endpoint=endpoint,
91
+ params={
92
+ "limit": limit,
93
+ **(params or dict()),
94
+ },
80
95
  )
81
- return self._get(endpoint=endpoint)
96
+
97
+ def _fetch_environments(self) -> SerializedAsset:
98
+ endpoint = CoalesceEndpointFactory.environments()
99
+ request = self._get_paginated(endpoint=endpoint)
100
+ result = fetch_all_pages(request, CoalescePagination)
101
+ return list(result)
82
102
 
83
103
  def _fetch_env_nodes(self, environment_id: int) -> SerializedAsset:
84
104
  endpoint = CoalesceEndpointFactory.nodes(environment_id=environment_id)
85
- result = self._get(endpoint=endpoint)
86
- nodes: list[dict] = []
87
- for node in result["data"]:
88
- try:
89
- details = self._node_details(environment_id, node["id"])
90
- nodes.append({**node, **details})
91
- except ConnectionError as e:
92
- node_id = node["id"]
93
- message = f"ConnectionError, environment: {environment_id}, node: {node_id}"
94
- logger.warning(message)
95
- raise e
96
- return nodes
105
+ request = self._get_paginated(
106
+ endpoint=endpoint,
107
+ params={"detail": "true"},
108
+ )
109
+ result = fetch_all_pages(request, CoalescePagination)
110
+ return [
111
+ {
112
+ **node,
113
+ "environment_id": environment_id,
114
+ }
115
+ for node in result
116
+ ]
97
117
 
98
118
  def _fetch_all_nodes(self) -> SerializedAsset:
119
+ environments = self._fetch_environments()
120
+ total = len(environments)
99
121
  nodes: list[dict] = []
100
- for environment in self._fetch_environments():
101
- environment_id = environment["id"]
102
- nodes.extend(self._fetch_env_nodes(environment_id))
122
+
123
+ for index, env in enumerate(environments):
124
+ env_id = env["id"]
125
+ logger.info(f"Fetching nodes for env #{env_id} - {index}/{total}")
126
+ try:
127
+ nodes.extend(self._fetch_env_nodes(env_id))
128
+ except ValidationError as e:
129
+ # 500 Server Error: Internal Server Error on Coalesce API
130
+ logger.warning(
131
+ f"Skipping nodes for {env_id} due to the following Error: {e}"
132
+ )
133
+ logger.info(f"{len(nodes)} nodes extracted so far")
103
134
  return nodes
104
135
 
105
136
  def _fetch_runs(self, starting_from: str) -> SerializedAsset:
106
- """
107
- fetch runs, per environment;
108
- we break per environment to lower the chance of exceeding the 1k limit
109
- """
110
- runs: list[dict] = []
111
- for environment in self._fetch_environments():
112
- environment_id = environment["id"]
113
- runs.extend(
114
- self._fetch_recent_runs_per_env(environment_id, starting_from)
115
- )
116
- return runs
117
-
118
- def _fetch_recent_runs_per_env(
119
- self, environment_id: int, starting_from: str
120
- ) -> SerializedAsset:
121
137
  endpoint = CoalesceEndpointFactory.runs()
122
138
  params = {
123
- "environmentID": environment_id,
124
- "limit": _LIMIT_MAX,
125
139
  "orderBy": "runEndTime",
126
140
  "orderByDirection": "asc",
127
141
  "startingFrom": starting_from,
128
142
  }
129
- result = self._get(endpoint=endpoint, params=params)
130
- return result["data"]
143
+ request = self._get_paginated(
144
+ endpoint=endpoint,
145
+ params=params,
146
+ limit=COALESCE_PAGE_SIZE_RUN_RESULTS,
147
+ )
148
+ return list(fetch_all_pages(request, CoalescePagination))
131
149
 
132
150
  def _fetch_run_results(self, run_id: str) -> SerializedAsset:
133
151
  endpoint = CoalesceEndpointFactory.run_results(run_id)
@@ -136,51 +154,37 @@ class CoalesceClient(APIClient):
136
154
 
137
155
  def _run_results_by_run(
138
156
  self,
157
+ environment_id: str,
139
158
  run_id: str,
140
- test_names: NodeIDToNamesMapping,
141
- column_names: NodeIDToNamesMapping,
142
159
  ) -> SerializedAsset:
143
160
  run_results: list[dict] = []
144
161
  for result in self._fetch_run_results(run_id):
145
- node_id = result["nodeID"]
146
162
  for query_result in result["queryResults"]:
147
- _is_test = is_test(
163
+ if query_result["type"] != "sqlTest":
164
+ continue
165
+ run_result = _run_result_payload(
166
+ environment_id,
167
+ result,
148
168
  query_result,
149
- node_id,
150
- test_names,
151
- column_names,
152
169
  )
153
- if not _is_test:
154
- continue
155
- run_result = _run_result_payload(result, query_result)
156
170
  run_results.append(run_result)
157
171
  return run_results
158
172
 
159
- def _run_results_by_env(
160
- self, environment_id: int, starting_from: str
173
+ def _fetch_all_run_results(
174
+ self,
175
+ starting_from: str,
161
176
  ) -> SerializedAsset:
162
177
  run_results: list[dict] = []
163
- nodes = self._fetch_env_nodes(environment_id)
164
- test_names = test_names_per_node(nodes)
165
- column_names = column_names_per_node(nodes)
166
- runs = self._fetch_recent_runs_per_env(environment_id, starting_from)
167
178
 
168
- for run in runs:
169
- run_id = run["id"]
170
- _results = self._run_results_by_run(
171
- run_id, test_names, column_names
172
- )
173
- run_results.extend(_results)
174
- return run_results
175
-
176
- def _fetch_all_run_results(self, starting_from: str) -> SerializedAsset:
177
- run_results: list[dict] = []
178
-
179
- for environment in self._fetch_environments():
180
- environment_id = environment["id"]
181
- _results = self._run_results_by_env(environment_id, starting_from)
182
- run_results.extend(_results)
179
+ runs = self._fetch_runs(starting_from)
180
+ total = len(runs)
183
181
 
182
+ for index, run in enumerate(runs):
183
+ logger.info(f"Extracting run results ({index}/{total})")
184
+ run_id = run["id"]
185
+ environment_id = run["environmentID"]
186
+ current_results = self._run_results_by_run(environment_id, run_id)
187
+ run_results.extend(current_results)
184
188
  return run_results
185
189
 
186
190
  def fetch(
@@ -0,0 +1,26 @@
1
+ from typing import Optional, Union
2
+
3
+ from ....utils import PaginationModel
4
+
5
+
6
+ class CoalescePagination(PaginationModel):
7
+ """
8
+ Class to handle paginated results for Coalesce
9
+ See their documentation here
10
+ https://docs.coalesce.io/docs/api
11
+ """
12
+
13
+ data: list
14
+ next: Union[Optional[str], Optional[int]] = None
15
+
16
+ def is_last(self) -> bool:
17
+ """Stopping condition for the pagination"""
18
+ return self.next is None
19
+
20
+ def next_page_payload(self):
21
+ """Payload enabling to generate the request for the next page"""
22
+ return {"startingFrom": self.next}
23
+
24
+ def page_results(self) -> list:
25
+ """List of results of the current page"""
26
+ return self.data
@@ -66,16 +66,19 @@ def fetch_all_pages(
66
66
  """
67
67
  page_number = 1
68
68
  response_payload = request()
69
+
69
70
  paginated_response = pagination_model(**response_payload)
71
+
70
72
  while not paginated_response.is_last():
71
73
  logger.debug(f"Fetching page number {page_number}")
72
74
  yield from paginated_response.page_results()
73
75
  next_page_parameters = paginated_response.next_page_parameters()
74
- new_request = partial(request, **next_page_parameters)
76
+ request_with_pagination = partial(request, **next_page_parameters)
75
77
  if rate_limit:
76
78
  sleep(rate_limit)
77
79
  paginated_response = pagination_model(
78
- current_page_payload=next_page_parameters, **new_request()
80
+ current_page_payload=next_page_parameters,
81
+ **request_with_pagination(),
79
82
  )
80
83
  page_number += 1
81
84
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.24.34
3
+ Version: 0.24.35
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -215,6 +215,10 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
215
215
 
216
216
  # Changelog
217
217
 
218
+ ## 0.24.35 - 2025-07-29
219
+
220
+ * Coalesce - Fix pagination issue
221
+
218
222
  ## 0.24.34 - 2025-07-02
219
223
 
220
224
  * SQLServer: multiple databases
@@ -1,4 +1,4 @@
1
- CHANGELOG.md,sha256=4PQMZjH-5BKSERREUHivWM7KKl_PpIDieFYH2PeRmGQ,18840
1
+ CHANGELOG.md,sha256=1S9O_c1LH8T4P78akRxlFS8Tv0i9Jgswy7V9zvd_UQw,18900
2
2
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
3
3
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
4
4
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -76,12 +76,10 @@ castor_extractor/transformation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
76
76
  castor_extractor/transformation/coalesce/__init__.py,sha256=CW_qdtEfwgJRsCyBlk5hNlxwEO-VV6mBXZvkRbND_J8,112
77
77
  castor_extractor/transformation/coalesce/assets.py,sha256=pzccYPP66c9PAnVroemx7-6MeRHw7Ft1OlTC6jIamAA,363
78
78
  castor_extractor/transformation/coalesce/client/__init__.py,sha256=VRmVpH29rOghtDQnCN7dAdA0dI0Lxseu4BC8rnwM9dU,80
79
- castor_extractor/transformation/coalesce/client/client.py,sha256=7EVJDDxnIm5_uMHLFZ2PD6JzfebVglKST9IiURwn4vs,6524
79
+ castor_extractor/transformation/coalesce/client/client.py,sha256=3YB82ibaumeSRd510mlrPXKsWefV3lHQQVis9oEK-LQ,6133
80
80
  castor_extractor/transformation/coalesce/client/credentials.py,sha256=jbJxjbdPspf-dzYKfeb7oqL_8TXd1nvkJrjAcdAnLPc,548
81
81
  castor_extractor/transformation/coalesce/client/endpoint.py,sha256=0uLh7dpA1vsR9qr_50SEYV_-heQE4BwED9oNMgYsL-w,1272
82
- castor_extractor/transformation/coalesce/client/type.py,sha256=oiiVP9NL0ijTXyQmaB8aJVYckc7m-m8ZgMyNIAduUKE,43
83
- castor_extractor/transformation/coalesce/client/utils.py,sha256=jbxh3OCbYm3fKZD1QfqX5zm1ZD_jFIrpUQsX8paRP7g,1627
84
- castor_extractor/transformation/coalesce/client/utils_test.py,sha256=Q00Y1n0Q_sZ0LFnYn98yDGFumBsifzVJSc7_3PSBMfI,1543
82
+ castor_extractor/transformation/coalesce/client/pagination.py,sha256=zynyWCMEzUQ7HA1Q5AP4BAOmxRQI6NA5jCPEo0lHn44,705
85
83
  castor_extractor/transformation/dbt/__init__.py,sha256=LHQROlMqYWCc7tcmhdjXtROFpJqUvCg9jPC8avHgD4I,107
86
84
  castor_extractor/transformation/dbt/assets.py,sha256=JY1nKEGySZ84wNoe7dnizwAYw2q0t8NVaIfqhB2rSw0,148
87
85
  castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZA2DrgjP7Gi5epAPE,5684
@@ -108,7 +106,7 @@ castor_extractor/utils/client/api/auth.py,sha256=lq0K3UEl1vwIIa_vKTdlpIQPdE5K1-5
108
106
  castor_extractor/utils/client/api/auth_test.py,sha256=LlyXytnatg6ZzR4Zkvzk0BH99FYhHX7qn_nyr2MSnDI,1305
109
107
  castor_extractor/utils/client/api/client.py,sha256=qmj7KoNqt6F-cmpdaMiz_aVxzwMCgbDNcgzXSbCdu1Y,5183
110
108
  castor_extractor/utils/client/api/client_test.py,sha256=FM3ZxsLLfMOBn44cXX6FIgnA31-5TTNIyp9D4LBwtXE,1222
111
- castor_extractor/utils/client/api/pagination.py,sha256=ph5TYqPiyFGgygsIhCATAHPIQ9UJNZyiTcqlyRdGEno,2460
109
+ castor_extractor/utils/client/api/pagination.py,sha256=tNL89bvgnMJd0ajJA07wTTReH3PJOQm3xsa93SKHFss,2499
112
110
  castor_extractor/utils/client/api/pagination_test.py,sha256=jCOgXFXrH-jrCxe2dfk80ZksJF-EtmpJPU11BGabsqk,1385
113
111
  castor_extractor/utils/client/api/safe_request.py,sha256=5pvI2WPRDtitX9F1aYcXTIMPNmDikRK9dKTD3ctoeoQ,1774
114
112
  castor_extractor/utils/client/api/safe_request_test.py,sha256=LqS5FBxs6lLLcTkcgxIoLb6OinxShHXR5y4CWZpwmwg,2005
@@ -430,8 +428,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=4RgeSkHDWTWRyU2iLx
430
428
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
431
429
  castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
432
430
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
433
- castor_extractor-0.24.34.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
434
- castor_extractor-0.24.34.dist-info/METADATA,sha256=-xB8vdjxDHFkDYbyAlL8L-nEbQMqs44GVzN5wgvKfjs,26293
435
- castor_extractor-0.24.34.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
436
- castor_extractor-0.24.34.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
437
- castor_extractor-0.24.34.dist-info/RECORD,,
431
+ castor_extractor-0.24.35.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
432
+ castor_extractor-0.24.35.dist-info/METADATA,sha256=-vrfKzS5B3r2qL7tjFjFBR-AizzuVIexEVJHCci7Z5s,26353
433
+ castor_extractor-0.24.35.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
434
+ castor_extractor-0.24.35.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
435
+ castor_extractor-0.24.35.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- NodeIDToNamesMapping = dict[str, set[str]]
@@ -1,52 +0,0 @@
1
- from ....utils import SerializedAsset
2
- from .type import NodeIDToNamesMapping
3
-
4
- _NULL_SUFFIX = ": Null"
5
- _UNIQUE_SUFFIX = ": Unique"
6
-
7
-
8
- def is_test(
9
- query_result: dict,
10
- node_id: str,
11
- test_names: NodeIDToNamesMapping,
12
- column_names: NodeIDToNamesMapping,
13
- ) -> bool:
14
- """
15
- checks whether a query result is a test result or not.
16
-
17
- all this implementation can soon be replaced by checking whether
18
- query_result['type'] == 'sqlTest', which should be GA Apr 28th 2025
19
- """
20
- # test scoped on the node (table)
21
- result_name = query_result["name"]
22
- if result_name in test_names.get(node_id, {}):
23
- return True
24
-
25
- # test scoped on the column
26
- if result_name.endswith(_NULL_SUFFIX) or result_name.endswith(
27
- _UNIQUE_SUFFIX
28
- ):
29
- column_name = result_name.split(":")[0]
30
- if column_name in column_names.get(node_id, {}):
31
- return True
32
- return False
33
-
34
-
35
- def test_names_per_node(nodes: SerializedAsset) -> NodeIDToNamesMapping:
36
- """mapping nodeID: set(testName)"""
37
- mapping: dict[str, set[str]] = {}
38
- for node in nodes:
39
- node_id = node["id"]
40
- tests = node.get("metadata", {}).get("appliedNodeTests", [])
41
- mapping[node_id] = {test["name"] for test in tests}
42
- return mapping
43
-
44
-
45
- def column_names_per_node(nodes: SerializedAsset) -> NodeIDToNamesMapping:
46
- """mapping nodeID: set(columnNames)"""
47
- mapping: dict[str, set[str]] = {}
48
- for node in nodes:
49
- node_id = node["id"]
50
- columns = node.get("metadata", {}).get("columns", [])
51
- mapping[node_id] = {column["name"] for column in columns}
52
- return mapping
@@ -1,54 +0,0 @@
1
- from .utils import is_test
2
-
3
-
4
- def test_is_test():
5
- test_names = {"some-uuid": {"check-mirrors", "check-seatbelt"}}
6
- column_names = {"some-uuid": {"carthago", "delenda", "est"}}
7
-
8
- happy_node_test = is_test(
9
- query_result={"name": "check-mirrors"},
10
- node_id="some-uuid",
11
- test_names=test_names,
12
- column_names=column_names,
13
- )
14
- assert happy_node_test is True
15
-
16
- unknown_node_test = is_test(
17
- query_result={"name": "check-engine"},
18
- node_id="some-uuid",
19
- test_names=test_names,
20
- column_names=column_names,
21
- )
22
- assert unknown_node_test is False
23
-
24
- happy_column_test_unique = is_test(
25
- query_result={"name": "carthago: Unique"},
26
- node_id="some-uuid",
27
- test_names=test_names,
28
- column_names=column_names,
29
- )
30
- assert happy_column_test_unique is True
31
-
32
- happy_column_test_null = is_test(
33
- query_result={"name": "carthago: Null"},
34
- node_id="some-uuid",
35
- test_names=test_names,
36
- column_names=column_names,
37
- )
38
- assert happy_column_test_null is True
39
-
40
- unknown_column_test = is_test(
41
- query_result={"name": "rome: Unique"},
42
- node_id="some-uuid",
43
- test_names=test_names,
44
- column_names=column_names,
45
- )
46
- assert unknown_column_test is False
47
-
48
- unknown_node_id_test = is_test(
49
- query_result={"name": "whatever: Unique"},
50
- node_id="unknown-uuid",
51
- test_names=test_names,
52
- column_names=column_names,
53
- )
54
- assert unknown_node_id_test is False