castor-extractor 0.24.4__py3-none-any.whl → 0.24.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

Files changed (24):
  1. CHANGELOG.md +20 -0
  2. castor_extractor/transformation/__init__.py +0 -0
  3. castor_extractor/transformation/coalesce/__init__.py +2 -0
  4. castor_extractor/transformation/coalesce/assets.py +18 -0
  5. castor_extractor/transformation/coalesce/client/__init__.py +2 -0
  6. castor_extractor/transformation/coalesce/client/client.py +180 -0
  7. castor_extractor/transformation/coalesce/client/credentials.py +23 -0
  8. castor_extractor/transformation/coalesce/client/endpoint.py +42 -0
  9. castor_extractor/transformation/coalesce/client/type.py +1 -0
  10. castor_extractor/transformation/coalesce/client/utils.py +52 -0
  11. castor_extractor/transformation/coalesce/client/utils_test.py +54 -0
  12. castor_extractor/utils/__init__.py +1 -0
  13. castor_extractor/utils/batch.py +16 -0
  14. castor_extractor/utils/batch_test.py +27 -0
  15. castor_extractor/visualization/domo/client/client.py +10 -4
  16. castor_extractor/visualization/tableau/client/client_metadata_api.py +52 -19
  17. castor_extractor/visualization/tableau/client/client_metadata_api_test.py +31 -0
  18. castor_extractor/visualization/tableau/client/gql_queries.py +1 -1
  19. castor_extractor/warehouse/databricks/format.py +1 -1
  20. {castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/METADATA +23 -3
  21. {castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/RECORD +24 -11
  22. {castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/LICENCE +0 -0
  23. {castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/WHEEL +0 -0
  24. {castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/entry_points.txt +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
 # Changelog
 
+## 0.24.9 - 2025-04-16
+
+* Introduce API client for **Coalesce**
+
+## 0.24.8 - 2025-04-16
+
+* Tableau - remove duplicates introduced by `offset` pagination
+
+## 0.24.7 - 2025-04-07
+
+* Tableau - switch from `cursor` to `offset` pagination to mitigate timeout issues
+
+## 0.24.6 - 2025-04-03
+
+* Domo - extract cards metadata by batch to prevent hitting the URL max length
+
+## 0.24.5 - 2025-04-02
+
+* Bump dependencies: google-cloud-storage
+
 ## 0.24.4 - 2025-03-19
 
 * Snowflake:

castor_extractor/transformation/__init__.py ADDED
File without changes

castor_extractor/transformation/coalesce/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .assets import CoalesceAsset, CoalesceQualityAsset
+from .client import CoalesceClient, CoalesceCredentials

castor_extractor/transformation/coalesce/assets.py ADDED
@@ -0,0 +1,18 @@
+from ...types import ExternalAsset
+
+
+class CoalesceAsset(ExternalAsset):
+    """Coalesce assets"""
+
+    NODES = "nodes"
+
+
+class CoalesceQualityAsset(ExternalAsset):
+    """
+    Coalesce Quality Assets
+    Remark: having a dedicated Enum for Quality simplifies the process of
+    searching pushed files
+    """
+
+    NODES = "nodes"
+    RUN_RESULTS = "run_results"

castor_extractor/transformation/coalesce/client/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .client import CoalesceClient
+from .credentials import CoalesceCredentials

castor_extractor/transformation/coalesce/client/client.py ADDED
@@ -0,0 +1,180 @@
+from http import HTTPStatus
+from typing import Iterator, Optional
+
+from ....utils import APIClient, BearerAuth, RequestSafeMode, SerializedAsset
+from ..assets import CoalesceAsset, CoalesceQualityAsset
+from .credentials import CoalesceCredentials
+from .endpoint import (
+    CoalesceEndpointFactory,
+)
+from .type import NodeIDToNamesMapping
+from .utils import column_names_per_node, is_test, test_names_per_node
+
+_LIMIT_MAX = 1_000
+_MAX_ERRORS = 50
+
+
+def _run_result_payload(result: dict, query_result: dict) -> dict:
+    return {
+        "node_id": result["nodeID"],
+        "node_name": result["name"],
+        "test_name": query_result["name"],
+        "start_time": query_result["startTime"],
+        "end_time": query_result["endTime"],
+        "status": query_result["status"],
+        "success": query_result["success"],
+        "isRunning": query_result["isRunning"],
+    }
+
+
+COALESCE_SAFE_MODE = RequestSafeMode(
+    status_codes=(HTTPStatus.INTERNAL_SERVER_ERROR,),
+    max_errors=_MAX_ERRORS,
+)
+COALESCE_TIMEOUT_SECONDS = 90
+
+
+class CoalesceBearerAuth(BearerAuth):
+    """Bearer Authentication for Coalesce"""
+
+    def fetch_token(self) -> Optional[str]:
+        pass
+
+    def __init__(self, token: str):
+        self._token = token
+
+
+class CoalesceClient(APIClient):
+    """REST API client to extract data from Coalesce"""
+
+    def __init__(
+        self,
+        credentials: CoalesceCredentials,
+    ):
+        auth = CoalesceBearerAuth(token=credentials.token)
+        super().__init__(
+            host=credentials.host,
+            auth=auth,
+            safe_mode=COALESCE_SAFE_MODE,
+            timeout=COALESCE_TIMEOUT_SECONDS,
+        )
+
+    def _fetch_environments(self) -> Iterator[dict]:
+        endpoint = CoalesceEndpointFactory.environments()
+        result = self._get(endpoint=endpoint)
+        return result["data"]
+
+    def _node_details(self, environment_id: int, node_id: str) -> dict:
+        endpoint = CoalesceEndpointFactory.nodes(
+            environment_id=environment_id, node_id=node_id
+        )
+        return self._get(endpoint=endpoint)
+
+    def _fetch_env_nodes(self, environment_id: int) -> SerializedAsset:
+        endpoint = CoalesceEndpointFactory.nodes(environment_id=environment_id)
+        result = self._get(endpoint=endpoint)
+        nodes: list[dict] = []
+        for node in result["data"]:
+            details = self._node_details(environment_id, node["id"])
+            nodes.append({**node, **details})
+        return nodes
+
+    def _fetch_all_nodes(self) -> SerializedAsset:
+        nodes: list[dict] = []
+        for environment in self._fetch_environments():
+            environment_id = environment["id"]
+            nodes.extend(self._fetch_env_nodes(environment_id))
+        return nodes
+
+    def _fetch_runs(self, starting_from: str) -> SerializedAsset:
+        """
+        fetch runs, per environment;
+        we break per environment to lower the chance of exceeding the 1k limit
+        """
+        runs: list[dict] = []
+        for environment in self._fetch_environments():
+            environment_id = environment["id"]
+            runs.extend(
+                self._fetch_recent_runs_per_env(environment_id, starting_from)
+            )
+        return runs
+
+    def _fetch_recent_runs_per_env(
+        self, environment_id: int, starting_from: str
+    ) -> SerializedAsset:
+        endpoint = CoalesceEndpointFactory.runs()
+        params = {
+            "environmentID": environment_id,
+            "limit": _LIMIT_MAX,
+            "orderBy": "runEndTime",
+            "orderByDirection": "asc",
+            "startingFrom": starting_from,
+        }
+        result = self._get(endpoint=endpoint, params=params)
+        return result["data"]
+
+    def _fetch_run_results(self, run_id: str) -> SerializedAsset:
+        endpoint = CoalesceEndpointFactory.run_results(run_id)
+        result = self._get(endpoint=endpoint)
+        return result["data"]
+
+    def _run_results_by_run(
+        self,
+        run_id: str,
+        test_names: NodeIDToNamesMapping,
+        column_names: NodeIDToNamesMapping,
+    ) -> SerializedAsset:
+        run_results: list[dict] = []
+        for result in self._fetch_run_results(run_id):
+            node_id = result["nodeID"]
+            for query_result in result["queryResults"]:
+                _is_test = is_test(
+                    query_result,
+                    node_id,
+                    test_names,
+                    column_names,
+                )
+                if not _is_test:
+                    continue
+                run_result = _run_result_payload(result, query_result)
+                run_results.append(run_result)
+        return run_results
+
+    def _run_results_by_env(
+        self, environment_id: int, starting_from: str
+    ) -> SerializedAsset:
+        run_results: list[dict] = []
+        nodes = self._fetch_env_nodes(environment_id)
+        test_names = test_names_per_node(nodes)
+        column_names = column_names_per_node(nodes)
+        runs = self._fetch_recent_runs_per_env(environment_id, starting_from)
+
+        for run in runs:
+            run_id = run["id"]
+            _results = self._run_results_by_run(
+                run_id, test_names, column_names
+            )
+            run_results.extend(_results)
+        return run_results
+
+    def _fetch_all_run_results(self, starting_from: str) -> SerializedAsset:
+        run_results: list[dict] = []
+
+        for environment in self._fetch_environments():
+            environment_id = environment["id"]
+            _results = self._run_results_by_env(environment_id, starting_from)
+            run_results.extend(_results)
+
+        return run_results
+
+    def fetch(
+        self, asset: CoalesceAsset, starting_from=None
+    ) -> SerializedAsset:
+        """Extract the given Coalesce Asset"""
+        if asset in (CoalesceAsset.NODES, CoalesceQualityAsset.NODES):
+            return self._fetch_all_nodes()
+        elif asset == CoalesceQualityAsset.RUN_RESULTS:
+            return self._fetch_all_run_results(starting_from=starting_from)
+        raise AssertionError(
+            f"Asset {asset} is not supported by CoalesceClient"
+        )
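
Taken together, a minimal usage sketch of the new client (the imports follow the package exports shown above; the host URL, the token placeholder, and the starting_from date format are assumptions, not documented values):

# Hedged sketch: driving the Coalesce client introduced in 0.24.9.
from castor_extractor.transformation.coalesce import (
    CoalesceAsset,
    CoalesceClient,
    CoalesceCredentials,
    CoalesceQualityAsset,
)

credentials = CoalesceCredentials(
    host="https://app.coalescesoftware.io",  # assumed host value
    token="<api-token>",  # placeholder
)
client = CoalesceClient(credentials)

# Extract all nodes across environments
nodes = client.fetch(CoalesceAsset.NODES)

# Extract recent test run results; ISO date format is an assumption
run_results = client.fetch(
    CoalesceQualityAsset.RUN_RESULTS,
    starting_from="2025-04-01",
)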

castor_extractor/transformation/coalesce/client/credentials.py ADDED
@@ -0,0 +1,23 @@
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+CASTOR_ENV_PREFIX = "CASTOR_COALESCE_"
+
+
+class CoalesceCredentials(BaseSettings):
+    """Class to handle Coalesce rest API permissions"""
+
+    model_config = SettingsConfigDict(
+        env_prefix=CASTOR_ENV_PREFIX,
+        extra="ignore",
+        populate_by_name=True,
+    )
+
+    host: str
+    token: str = Field(repr=False)
+
+    @property
+    def token_payload(self) -> dict[str, str]:
+        return {
+            "client_secret": self.token,
+        }
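
Because CoalesceCredentials extends pydantic's BaseSettings with env_prefix=CASTOR_ENV_PREFIX, the fields can presumably also be resolved from the environment instead of constructor arguments; a sketch with placeholder values:

# Sketch: resolving CoalesceCredentials from environment variables.
import os

from castor_extractor.transformation.coalesce import CoalesceCredentials

os.environ["CASTOR_COALESCE_HOST"] = "https://app.coalescesoftware.io"  # placeholder
os.environ["CASTOR_COALESCE_TOKEN"] = "<api-token>"  # placeholder

credentials = CoalesceCredentials()  # host and token read from the env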

castor_extractor/transformation/coalesce/client/endpoint.py ADDED
@@ -0,0 +1,42 @@
+from typing import Optional
+
+
+class CoalesceEndpointFactory:
+    """Provide endpoints to hit Coalesce API"""
+
+    @classmethod
+    def environments(cls, environment_id: Optional[int] = None) -> str:
+        """
+        When specified, concatenate environment_id at the end to fetch details.
+        Otherwise, list existing environments.
+        """
+        base = "api/v1/environments"
+        if environment_id:
+            return base + f"/{environment_id}"
+        return base
+
+    @classmethod
+    def nodes(cls, environment_id: int, node_id: Optional[str] = None) -> str:
+        """
+        When specified, concatenate node_id at the end to fetch details.
+        Otherwise, list existing nodes in the given environment.
+        """
+        base = f"api/v1/environments/{environment_id}/nodes"
+        if node_id:
+            return base + f"/{node_id}"
+        return base
+
+    @classmethod
+    def runs(cls) -> str:
+        """
+        Get runs (additional filtering can be done in the body)
+        """
+        base = "api/v1/runs"
+        return base
+
+    @classmethod
+    def run_results(cls, run_id: str) -> str:
+        """
+        get run results (including success/fail for tests), given a run id
+        """
+        return f"api/v1/runs/{run_id}/results"
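
For reference, the factory resolves to paths like these (the IDs are made up):

# Paths produced by CoalesceEndpointFactory, with made-up IDs
from castor_extractor.transformation.coalesce.client.endpoint import (
    CoalesceEndpointFactory,
)

assert CoalesceEndpointFactory.environments() == "api/v1/environments"
assert CoalesceEndpointFactory.environments(7) == "api/v1/environments/7"
assert CoalesceEndpointFactory.nodes(7) == "api/v1/environments/7/nodes"
assert CoalesceEndpointFactory.nodes(7, "node-uuid") == "api/v1/environments/7/nodes/node-uuid"
assert CoalesceEndpointFactory.run_results("run-1") == "api/v1/runs/run-1/results"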

castor_extractor/transformation/coalesce/client/type.py ADDED
@@ -0,0 +1 @@
+NodeIDToNamesMapping = dict[str, set[str]]

castor_extractor/transformation/coalesce/client/utils.py ADDED
@@ -0,0 +1,52 @@
+from ....utils import SerializedAsset
+from .type import NodeIDToNamesMapping
+
+_NULL_SUFFIX = ": Null"
+_UNIQUE_SUFFIX = ": Unique"
+
+
+def is_test(
+    query_result: dict,
+    node_id: str,
+    test_names: NodeIDToNamesMapping,
+    column_names: NodeIDToNamesMapping,
+) -> bool:
+    """
+    checks whether a query result is a test result or not.
+
+    all this implementation can soon be replaced by checking whether
+    query_result['type'] == 'sqlTest', which should be GA Apr 28th 2025
+    """
+    # test scoped on the node (table)
+    result_name = query_result["name"]
+    if result_name in test_names.get(node_id, {}):
+        return True
+
+    # test scoped on the column
+    if result_name.endswith(_NULL_SUFFIX) or result_name.endswith(
+        _UNIQUE_SUFFIX
+    ):
+        column_name = result_name.split(":")[0]
+        if column_name in column_names.get(node_id, {}):
+            return True
+    return False
+
+
+def test_names_per_node(nodes: SerializedAsset) -> NodeIDToNamesMapping:
+    """mapping nodeID: set(testName)"""
+    mapping: dict[str, set[str]] = {}
+    for node in nodes:
+        node_id = node["id"]
+        tests = node.get("metadata", {}).get("appliedNodeTests", [])
+        mapping[node_id] = {test["name"] for test in tests}
+    return mapping
+
+
+def column_names_per_node(nodes: SerializedAsset) -> NodeIDToNamesMapping:
+    """mapping nodeID: set(columnNames)"""
+    mapping: dict[str, set[str]] = {}
+    for node in nodes:
+        node_id = node["id"]
+        columns = node.get("metadata", {}).get("columns", [])
+        mapping[node_id] = {column["name"] for column in columns}
+    return mapping

castor_extractor/transformation/coalesce/client/utils_test.py ADDED
@@ -0,0 +1,54 @@
+from .utils import is_test
+
+
+def test_is_test():
+    test_names = {"some-uuid": {"check-mirrors", "check-seatbelt"}}
+    column_names = {"some-uuid": {"carthago", "delenda", "est"}}
+
+    happy_node_test = is_test(
+        query_result={"name": "check-mirrors"},
+        node_id="some-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert happy_node_test is True
+
+    unknown_node_test = is_test(
+        query_result={"name": "check-engine"},
+        node_id="some-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert unknown_node_test is False
+
+    happy_column_test_unique = is_test(
+        query_result={"name": "carthago: Unique"},
+        node_id="some-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert happy_column_test_unique is True
+
+    happy_column_test_null = is_test(
+        query_result={"name": "carthago: Null"},
+        node_id="some-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert happy_column_test_null is True
+
+    unknown_column_test = is_test(
+        query_result={"name": "rome: Unique"},
+        node_id="some-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert unknown_column_test is False
+
+    unknown_node_id_test = is_test(
+        query_result={"name": "whatever: Unique"},
+        node_id="unknown-uuid",
+        test_names=test_names,
+        column_names=column_names,
+    )
+    assert unknown_node_id_test is False

castor_extractor/utils/__init__.py CHANGED
@@ -1,4 +1,5 @@
 from .argument_parser import parse_filled_arguments
+from .batch import batch_of_length
 from .client import (
     AbstractSourceClient,
     APIClient,

castor_extractor/utils/batch.py ADDED
@@ -0,0 +1,16 @@
+from typing import Iterator, List, TypeVar
+
+T = TypeVar("T")
+
+
+def batch_of_length(
+    elements: List[T],
+    batch_size: int,
+) -> Iterator[List[T]]:
+    """
+    Split the given elements into smaller chunks
+    """
+    assert batch_size > 1, "batch size must be greater or equal to 1"
+    element_count = len(elements)
+    for index in range(0, element_count, batch_size):
+        yield elements[index : min((index + batch_size), element_count)]

castor_extractor/utils/batch_test.py ADDED
@@ -0,0 +1,27 @@
+import pytest
+
+from .batch import batch_of_length
+
+
+def test_batch_of_length():
+    elements = ["a", "b", "c", "d", "e", "f", "g", "h"]
+    result = list(batch_of_length(elements, 3))
+    assert result == [
+        ["a", "b", "c"],
+        ["d", "e", "f"],
+        ["g", "h"],
+    ]
+
+    result = list(batch_of_length(elements, 1000))
+    assert result == [
+        elements,
+    ]
+
+    result = list(batch_of_length(elements, 7))
+    assert result == [
+        ["a", "b", "c", "d", "e", "f", "g"],
+        ["h"],
+    ]
+
+    with pytest.raises(AssertionError):
+        list(batch_of_length(elements, -12))

castor_extractor/visualization/domo/client/client.py CHANGED
@@ -9,6 +9,7 @@ import requests
 from ....utils import (
     RequestSafeMode,
     at_midnight,
+    batch_of_length,
     current_date,
     empty_iterator,
     handle_response,
@@ -48,6 +49,8 @@ _RETRY_BASE_MS = 10 * 60 * 1000 # 10 minutes
 
 _PARENT_FOLDER = "/Dashboards"
 
+_CARDS_BATCH_SIZE = 100
+
 logger = logging.getLogger(__name__)
 
 
@@ -156,16 +159,19 @@ class DomoClient:
 
         return all_results
 
+    def _cards_metadata(self, card_ids: list[int]) -> Iterator[dict]:
+        # batch to avoid hitting the URL max length
+        for batch_card_ids in batch_of_length(card_ids, _CARDS_BATCH_SIZE):
+            endpoint = self._endpoint_factory.cards_metadata(batch_card_ids)
+            yield from self._get_element(endpoint)
+
     def _datasources(self, card_ids: list[int]) -> RawData:
         """Yields all distinct datasources associated to the given cards"""
         if not card_ids:
            return empty_iterator()
 
-        endpoint = self._endpoint_factory.cards_metadata(card_ids)
-        cards_metadata = self._get_element(endpoint)
-
         processed: set[str] = set()
-        for card in cards_metadata:
+        for card in self._cards_metadata(card_ids):
            for datasource in card["datasources"]:
                 id_ = datasource["dataSourceId"]
                 if id_ in processed:
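
The batching behaviour comes from the new batch_of_length helper: each request now serializes at most 100 card IDs into the cards_metadata URL. A quick sketch of the effect (the 350 fake card IDs are illustrative):

# Sketch: 350 made-up card ids become four requests of at most 100 ids each,
# so the ids serialized into the cards_metadata URL stay bounded.
from castor_extractor.utils import batch_of_length

card_ids = list(range(350))
sizes = [len(batch) for batch in batch_of_length(card_ids, 100)]  # 100 == _CARDS_BATCH_SIZE
assert sizes == [100, 100, 100, 50]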

castor_extractor/visualization/tableau/client/client_metadata_api.py CHANGED
@@ -1,3 +1,4 @@
+import logging
 from collections.abc import Iterator
 from typing import Optional
 
@@ -9,15 +10,14 @@ from ..constants import DEFAULT_PAGE_SIZE
 from .errors import TableauApiError, TableauApiTimeout
 from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
 
+logger = logging.getLogger(__name__)
+
 # increase the value when extraction is too slow
 # decrease the value when timeouts arise
 _CUSTOM_PAGE_SIZE: dict[TableauAsset, int] = {
-    # for some clients, extraction of columns tend to hit the node limit
-    # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
-    # the workaround is to reduce pagination
-    TableauAsset.COLUMN: 50,
     # fields are light but volumes are bigger
     TableauAsset.FIELD: 1000,
+    # tables are sometimes heavy
     TableauAsset.TABLE: 50,
 }
 
@@ -51,8 +51,9 @@ def _check_errors(answer: dict) -> None:
 
 def gql_query_scroll(
     server,
-    query: str,
     resource: str,
+    fields: str,
+    page_size: int,
 ) -> Iterator[SerializedAsset]:
     """
     Iterate over GQL query results, handling pagination and cursor
@@ -67,26 +68,58 @@
         max_retries=_RETRY_COUNT,
         base_ms=_RETRY_BASE_MS,
     )
-    def _call(cursor: Optional[str]) -> dict:
-        # If cursor is defined it must be quoted else use null token
-        token = "null" if cursor is None else f'"{cursor}"'
-        query_ = query.replace("AFTER_TOKEN_SIGNAL", token)
-        answer = server.metadata.query(query_)
+    def _call(first: int, offset: int) -> dict:
+        query = QUERY_TEMPLATE.format(
+            resource=resource,
+            fields=fields,
+            first=first,
+            offset=offset,
+        )
+        answer = server.metadata.query(query)
         _check_errors(answer)
         return answer["data"][f"{resource}Connection"]
 
-    cursor = None
+    current_offset = 0
     while True:
-        payload = _call(cursor)
+        payload = _call(first=page_size, offset=current_offset)
         yield payload["nodes"]
 
-        page_info = payload["pageInfo"]
-        if page_info["hasNextPage"]:
-            cursor = page_info["endCursor"]
-        else:
+        current_offset += len(payload["nodes"])
+        total = payload["totalCount"]
+        logger.info(f"Extracted {current_offset}/{total} {resource}")
+
+        if not payload["pageInfo"]["hasNextPage"]:
            break
 
 
+def _deduplicate(result_pages: Iterator[SerializedAsset]) -> SerializedAsset:
+    """
+    Sometimes assets are duplicated, which triggers UniqueViolation errors
+    during store_all down the line.
+
+    We suspect the offset pagination to be the root cause, because we had no
+    problem until recently, when we switched from cursor pagination to offset
+    pagination (for performance reasons)
+    https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_examples.html#pagination
+
+    This is a straightforward solution to remove these duplicates directly at
+    extraction.
+    We don't show warnings because duplicates are expected, and we keep only
+    the first occurrence since those duplicates are probably identical.
+    """
+    deduplicated: SerializedAsset = []
+    seen_ids: set[str] = set()
+    for page in result_pages:
+        for asset in page:
+            asset_id = asset["id"]
+            if asset_id in seen_ids:
+                # skip duplicate
+                continue
+            deduplicated.append(asset)
+            seen_ids.add(asset_id)
+    return deduplicated
+
+
 class TableauClientMetadataApi:
     """
     Calls the MetadataAPI, using graphQL
@@ -107,13 +140,13 @@ class TableauClientMetadataApi:
         fields: str,
         page_size: int = DEFAULT_PAGE_SIZE,
     ) -> SerializedAsset:
-        query = QUERY_TEMPLATE.format(
+        result_pages = gql_query_scroll(
+            self._server,
             resource=resource,
             fields=fields,
             page_size=page_size,
         )
-        result_pages = gql_query_scroll(self._server, query, resource)
-        return [asset for page in result_pages for asset in page]
+        return _deduplicate(result_pages)
 
     def _page_size(self, asset: TableauAsset) -> int:
         return (

castor_extractor/visualization/tableau/client/client_metadata_api_test.py ADDED
@@ -0,0 +1,31 @@
+from .client_metadata_api import _deduplicate
+
+
+def test__deduplicate():
+    result_pages = iter(
+        [
+            [
+                {"id": 1, "name": "workbook_1"},
+                {"id": 2, "name": "workbook_2"},
+            ],
+            [
+                {"id": 1, "name": "workbook_1"},
+                {"id": 3, "name": "workbook_3"},
+                {"id": 4, "name": "workbook_4"},
+            ],
+            [
+                {"id": 4, "name": "workbook_4"},
+                {"id": 5, "name": "workbook_5"},
+                {"id": 5, "name": "workbook_5"},
+                {"id": 5, "name": "workbook_5"},
+            ],
+            [
+                {"id": 1, "name": "workbook_1"},
+                {"id": 3, "name": "workbook_3"},
+            ],
+        ]
+    )
+    deduplicated = _deduplicate(result_pages)
+    assert len(deduplicated) == 5
+    deduplicated_keys = {item["id"] for item in deduplicated}
+    assert deduplicated_keys == {1, 2, 3, 4, 5}

castor_extractor/visualization/tableau/client/gql_queries.py CHANGED
@@ -2,7 +2,7 @@ from ..assets import TableauAsset
 
 QUERY_TEMPLATE = """
 {{
-  {resource}Connection(first: {page_size}, after: AFTER_TOKEN_SIGNAL) {{
+  {resource}Connection(first: {first}, offset: {offset}) {{
     nodes {{ {fields}
     }}
     pageInfo {{
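
For reference, a query rendered through the new offset-based template would look roughly like this (the resource and field names are illustrative, and the rendered output shown in the comment is abridged):

# Sketch: rendering the offset-based template with made-up values
from castor_extractor.visualization.tableau.client.gql_queries import QUERY_TEMPLATE

query = QUERY_TEMPLATE.format(
    resource="workbooks", fields="id name", first=50, offset=100
)
# -> { workbooksConnection(first: 50, offset: 100) { nodes { id name } pageInfo { ... } } }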

castor_extractor/warehouse/databricks/format.py CHANGED
@@ -168,7 +168,7 @@ class DatabricksFormatter:
             "schema_name": None,
             "query_text": q["query_text"],
             "user_id": q["user_id"],
-            "user_name": q["user_name"],
+            "user_name": q.get("user_name"),
             "start_time": start_time,
             "end_time": end_time,
         }
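
The switch to dict.get makes the formatter tolerant of query payloads that omit user_name; a minimal illustration with a made-up payload:

# Made-up Databricks query payload without a user_name key
q = {"query_text": "SELECT 1", "user_id": 42}
q.get("user_name")  # None, where q["user_name"] would raise KeyError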

{castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.24.4
+Version: 0.24.9
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
@@ -35,7 +35,7 @@ Requires-Dist: google-api-core (>=2.1.1,<3.0.0)
 Requires-Dist: google-api-python-client (>=2.121.0,<3.0.0) ; extra == "lookerstudio" or extra == "all"
 Requires-Dist: google-auth (>=2,<3)
 Requires-Dist: google-cloud-core (>=2.1.0,<3.0.0)
-Requires-Dist: google-cloud-storage (>=2,<3)
+Requires-Dist: google-cloud-storage (>=3.1.0,<4.0.0)
 Requires-Dist: google-resumable-media (>=2.0.3,<3.0.0)
 Requires-Dist: googleapis-common-protos (>=1.53.0,<2.0.0)
 Requires-Dist: looker-sdk (>=25.0.0,<26.0.0) ; extra == "looker" or extra == "all"
@@ -51,7 +51,7 @@ Requires-Dist: pymssql (>=2.2.11,<3.0.0) ; extra == "sqlserver" or extra == "all"
 Requires-Dist: pymysql[rsa] (>=1.1.0,<2.0.0) ; extra == "mysql" or extra == "all"
 Requires-Dist: python-dateutil (>=2.0.0,<=3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
-Requires-Dist: setuptools (>=75.6)
+Requires-Dist: setuptools (>=78.1)
 Requires-Dist: snowflake-connector-python (>=3.4.0,<4.0.0) ; extra == "snowflake" or extra == "all"
 Requires-Dist: snowflake-sqlalchemy (!=1.2.5,<2.0.0) ; extra == "snowflake" or extra == "all"
 Requires-Dist: sqlalchemy (>=1.4,<1.5)
@@ -210,6 +210,26 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:support@castordoc.com)
 
 # Changelog
 
+## 0.24.9 - 2025-04-16
+
+* Introduce API client for **Coalesce**
+
+## 0.24.8 - 2025-04-16
+
+* Tableau - remove duplicates introduced by `offset` pagination
+
+## 0.24.7 - 2025-04-07
+
+* Tableau - switch from `cursor` to `offset` pagination to mitigate timeout issues
+
+## 0.24.6 - 2025-04-03
+
+* Domo - extract cards metadata by batch to prevent hitting the URL max length
+
+## 0.24.5 - 2025-04-02
+
+* Bump dependencies: google-cloud-storage
+
 ## 0.24.4 - 2025-03-19
 
 * Snowflake:

{castor_extractor-0.24.4.dist-info → castor_extractor-0.24.9.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=1Y5FmmQDspwZaOhKjnJosP2sNd898LeTOmVIMTBt9Bw,16387
+CHANGELOG.md,sha256=UKD2ldg9s00KOoVfWjnyB_m50R0fnpPLbpmkZHKoOQM,16821
 Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
 DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -68,6 +68,16 @@ castor_extractor/quality/soda/client/client.py,sha256=Gd3GaachWx5ZEH_nqgTxiBIbUq
 castor_extractor/quality/soda/client/credentials.py,sha256=R1g7nHpJlQ5hBjtUFN06QjjWAouQtb_V-je7cAXXIA4,514
 castor_extractor/quality/soda/client/endpoints.py,sha256=x3B-XlnDF8NJMuk-81N72_6HA-YZEzA895khLyj0j54,228
 castor_extractor/quality/soda/client/pagination.py,sha256=_7caQUNDNPGRufnZNrfYBN3oVXsk99_2wYr67I0ehAs,530
+castor_extractor/transformation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+castor_extractor/transformation/coalesce/__init__.py,sha256=CW_qdtEfwgJRsCyBlk5hNlxwEO-VV6mBXZvkRbND_J8,112
+castor_extractor/transformation/coalesce/assets.py,sha256=pzccYPP66c9PAnVroemx7-6MeRHw7Ft1OlTC6jIamAA,363
+castor_extractor/transformation/coalesce/client/__init__.py,sha256=VRmVpH29rOghtDQnCN7dAdA0dI0Lxseu4BC8rnwM9dU,80
+castor_extractor/transformation/coalesce/client/client.py,sha256=yrPzIk-6VN4MDHwti3Yxy3PCfHmxE6znjuehl_-dYTg,6151
+castor_extractor/transformation/coalesce/client/credentials.py,sha256=jbJxjbdPspf-dzYKfeb7oqL_8TXd1nvkJrjAcdAnLPc,548
+castor_extractor/transformation/coalesce/client/endpoint.py,sha256=0uLh7dpA1vsR9qr_50SEYV_-heQE4BwED9oNMgYsL-w,1272
+castor_extractor/transformation/coalesce/client/type.py,sha256=oiiVP9NL0ijTXyQmaB8aJVYckc7m-m8ZgMyNIAduUKE,43
+castor_extractor/transformation/coalesce/client/utils.py,sha256=jbxh3OCbYm3fKZD1QfqX5zm1ZD_jFIrpUQsX8paRP7g,1627
+castor_extractor/transformation/coalesce/client/utils_test.py,sha256=Q00Y1n0Q_sZ0LFnYn98yDGFumBsifzVJSc7_3PSBMfI,1543
 castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
 castor_extractor/uploader/__init__.py,sha256=A4bq_SrEtKAsl0r_D_duSTvL5WIQjVfsMy7tDx9IKg0,87
 castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
@@ -77,9 +87,11 @@ castor_extractor/uploader/settings.py,sha256=3MvOX-UFRqrLZoiT7wYn9jUGro7NX4RCafY
 castor_extractor/uploader/upload.py,sha256=PSQfkO_7LSE0WBo9Tm_hlS2ONepKeB0cBFdJXySnues,4310
 castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
 castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
-castor_extractor/utils/__init__.py,sha256=X7WOOgrpGf7Vh8r-7eNGjuC0rKs0g9GTO3d7hZ18gwo,1550
+castor_extractor/utils/__init__.py,sha256=KQkr_CmxWG0Vpu7CaqjbJkffUeEWcyeA9Cbm394Hygk,1585
 castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283
 castor_extractor/utils/argument_parser_test.py,sha256=wnyLFJ74iEiPxxLSbwFtckR7FIHxsFOVU38ljs9gqRA,633
+castor_extractor/utils/batch.py,sha256=SFlLmJgVjV2nVhIrjVIEp8wJ9du4dKKHq8YVYubnwQQ,448
+castor_extractor/utils/batch_test.py,sha256=84JYXOxiTkZFAceVh0mzN6VtKxcqoFPbxkZfIDyLGlg,606
 castor_extractor/utils/client/__init__.py,sha256=h5gm8UNNCCkAqhjYK5f6BY7k0cHFOyAvkmlktqwpir0,392
 castor_extractor/utils/client/abstract.py,sha256=CWF7_afNpEZ3jor-22wXbKIvM20ukHkaDy_uknKz8B0,2075
 castor_extractor/utils/client/api/__init__.py,sha256=vlG7WXznYgLTn3XyMGsyUkgRkup8FbKM14EXJ8mv-b0,264
@@ -146,7 +158,7 @@ castor_extractor/visualization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 castor_extractor/visualization/domo/__init__.py,sha256=1axOCPm4RpdIyUt9LQEvlMvbOPllW8rk63h6EjVgJ0Y,111
 castor_extractor/visualization/domo/assets.py,sha256=bK1urFR2tnlWkVkkhR32mAKMoKbESNlop-CNGx-65PY,206
 castor_extractor/visualization/domo/client/__init__.py,sha256=Do0fU4B8Hhlhahcv734gnJl_ryCztfTBDea7XNCKfB8,72
-castor_extractor/visualization/domo/client/client.py,sha256=vOMBY5dY6N3v55YJPdh9aoiddXnuLnGoFHLE5BeUKSg,9662
+castor_extractor/visualization/domo/client/client.py,sha256=bgzXWUm-UnTIwgyJKaJkoHzQpDYwWCGCe97MsMFw6ng,9930
 castor_extractor/visualization/domo/client/credentials.py,sha256=4gnsk4Tpt3ggdUYbvyNPJEXeCyTy12s-X24P5hFdULg,873
 castor_extractor/visualization/domo/client/endpoints.py,sha256=eIE9oeZ_cmJSWWDuyxh6JaAOs3y5bTJQQ265HYgpulE,2775
 castor_extractor/visualization/domo/client/pagination.py,sha256=ukVkHVzoH4mfZ29H9YcnC2YrdVolP10wv25J6Q3ehRw,821
@@ -264,12 +276,13 @@ castor_extractor/visualization/tableau/__init__.py,sha256=eFI_1hjdkxyUiAYiy3szwy
 castor_extractor/visualization/tableau/assets.py,sha256=HbCRd8VCj1WBEeqg9jwnygnT7xOFJ6PQD7Lq7sV-XR0,635
 castor_extractor/visualization/tableau/client/__init__.py,sha256=P8RKFKOC63WkH5hdEytJOwHS9vzQ8GXreLfXZetmMP8,78
 castor_extractor/visualization/tableau/client/client.py,sha256=zzqhzIqKyJygo4ZNGk6cZh0e6Z9R1W5T0P9un52KC1M,7626
-castor_extractor/visualization/tableau/client/client_metadata_api.py,sha256=fIBsSbRTypBABsCoigO2dkKsw4Eu3GrsEPTDfjY8A80,4303
+castor_extractor/visualization/tableau/client/client_metadata_api.py,sha256=fARj7xroHfMd4nlo5CJK5jPok5UsHznOQpIpNaECVHw,5274
+castor_extractor/visualization/tableau/client/client_metadata_api_test.py,sha256=lbsq5mLtqeNc5EsmCw9Mvl8qcvMsTcJTepHwy1ToyvA,969
 castor_extractor/visualization/tableau/client/client_rest_api.py,sha256=x4dNw4PPJdalTlGowwkANwqiS2ZhGxzpQytkHq3KbpY,3988
 castor_extractor/visualization/tableau/client/client_tsc.py,sha256=VI_PJyd1ty3HSYXHHQjshmG2ziowIbrwJRonRPCHbks,1820
 castor_extractor/visualization/tableau/client/credentials.py,sha256=uQICIgeXmLZfOroTgZt7PuKNKTyqQllRGSTcOmIfrKU,1893
 castor_extractor/visualization/tableau/client/errors.py,sha256=ecT8Tit5VtzrOBB9ykblA0nvd75j5-_QDFupjV48zJQ,300
-castor_extractor/visualization/tableau/client/gql_queries.py,sha256=NISarYh33Ij7DhYxqjTdv681AHYpbft8kPwVUQbAZ7U,2190
+castor_extractor/visualization/tableau/client/gql_queries.py,sha256=XJAfhpMZ5S7-AhfpOaoHMHCAdil-l5e5xB-CH4NC38M,2177
 castor_extractor/visualization/tableau/client/rest_fields.py,sha256=ZKYYuMxg9PXhczVXaD4rXNk7dYyWJ1_bVM8FLEXju7s,888
 castor_extractor/visualization/tableau/constants.py,sha256=lHGB50FgVNO2nXeIhkvQKivD8ZFBIjDrflgD5cTXKJw,104
 castor_extractor/visualization/tableau/extract.py,sha256=FnjmmUdNA9MEf3S5Tw37x6ZXxVsK8R3YnVk1UVYbaZk,1423
@@ -315,7 +328,7 @@ castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQ
 castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
 castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
 castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
-castor_extractor/warehouse/databricks/format.py,sha256=FUBMrFFWSa_lX5PtixJCDR3eRYycqeMw0oKHt7AkA4o,6732
+castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
 castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
 castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
 castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
@@ -403,8 +416,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
 castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
 castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
 castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
-castor_extractor-0.24.4.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
-castor_extractor-0.24.4.dist-info/METADATA,sha256=eY2TPP3IDq9an2JJzoZcN-_rG5DJIGzbJOqEtGBhzd4,23543
-castor_extractor-0.24.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-castor_extractor-0.24.4.dist-info/entry_points.txt,sha256=FQNShG4w4nRO95_bZnagh7FQ2oiZ-40bdt8ZdTW1-uI,1731
-castor_extractor-0.24.4.dist-info/RECORD,,
+castor_extractor-0.24.9.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.24.9.dist-info/METADATA,sha256=JDqbNB2dwsOO7_5PKUWP0r4FL217fi7OIEbVaOPljDQ,23985
+castor_extractor-0.24.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+castor_extractor-0.24.9.dist-info/entry_points.txt,sha256=FQNShG4w4nRO95_bZnagh7FQ2oiZ-40bdt8ZdTW1-uI,1731
+castor_extractor-0.24.9.dist-info/RECORD,,