castor-extractor 0.19.0__py3-none-any.whl → 0.19.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +29 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +1 -1
- castor_extractor/knowledge/notion/client/client.py +44 -80
- castor_extractor/knowledge/notion/client/client_test.py +9 -4
- castor_extractor/knowledge/notion/client/constants.py +1 -0
- castor_extractor/knowledge/notion/client/endpoints.py +1 -1
- castor_extractor/knowledge/notion/client/pagination.py +9 -5
- castor_extractor/quality/soda/assets.py +1 -1
- castor_extractor/quality/soda/client/client.py +30 -83
- castor_extractor/quality/soda/client/credentials.py +0 -11
- castor_extractor/quality/soda/client/endpoints.py +3 -6
- castor_extractor/quality/soda/client/pagination.py +25 -0
- castor_extractor/utils/__init__.py +13 -2
- castor_extractor/utils/client/__init__.py +14 -0
- castor_extractor/utils/client/api/__init__.py +5 -0
- castor_extractor/utils/client/api/auth.py +76 -0
- castor_extractor/utils/client/api/auth_test.py +49 -0
- castor_extractor/utils/client/api/client.py +153 -0
- castor_extractor/utils/client/api/client_test.py +47 -0
- castor_extractor/utils/client/api/pagination.py +83 -0
- castor_extractor/utils/client/api/pagination_test.py +51 -0
- castor_extractor/utils/{safe_request_test.py → client/api/safe_request_test.py} +4 -1
- castor_extractor/utils/client/api/utils.py +9 -0
- castor_extractor/utils/client/api/utils_test.py +16 -0
- castor_extractor/utils/collection.py +34 -2
- castor_extractor/utils/collection_test.py +17 -3
- castor_extractor/utils/pager/__init__.py +0 -1
- castor_extractor/utils/retry.py +44 -0
- castor_extractor/utils/retry_test.py +26 -1
- castor_extractor/utils/salesforce/client.py +44 -49
- castor_extractor/utils/salesforce/client_test.py +2 -2
- castor_extractor/utils/salesforce/pagination.py +33 -0
- castor_extractor/visualization/domo/client/client.py +10 -5
- castor_extractor/visualization/domo/client/credentials.py +1 -1
- castor_extractor/visualization/domo/client/endpoints.py +19 -7
- castor_extractor/visualization/looker/api/credentials.py +1 -1
- castor_extractor/visualization/metabase/client/api/client.py +26 -11
- castor_extractor/visualization/metabase/client/api/credentials.py +1 -1
- castor_extractor/visualization/metabase/client/db/credentials.py +1 -1
- castor_extractor/visualization/mode/client/credentials.py +1 -1
- castor_extractor/visualization/qlik/client/engine/credentials.py +1 -1
- castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
- castor_extractor/visualization/sigma/client/client.py +106 -111
- castor_extractor/visualization/sigma/client/credentials.py +11 -1
- castor_extractor/visualization/sigma/client/endpoints.py +1 -1
- castor_extractor/visualization/sigma/client/pagination.py +22 -18
- castor_extractor/visualization/tableau/tests/unit/rest_api/auth_test.py +0 -1
- castor_extractor/visualization/tableau/tests/unit/rest_api/credentials_test.py +0 -3
- castor_extractor/visualization/tableau_revamp/assets.py +11 -0
- castor_extractor/visualization/tableau_revamp/client/client.py +71 -151
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +95 -0
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +128 -0
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +66 -0
- castor_extractor/visualization/tableau_revamp/client/{tsc_fields.py → rest_fields.py} +15 -2
- castor_extractor/visualization/tableau_revamp/constants.py +0 -2
- castor_extractor/visualization/tableau_revamp/extract.py +5 -11
- castor_extractor/warehouse/databricks/api_client.py +239 -0
- castor_extractor/warehouse/databricks/api_client_test.py +15 -0
- castor_extractor/warehouse/databricks/client.py +37 -490
- castor_extractor/warehouse/databricks/client_test.py +1 -99
- castor_extractor/warehouse/databricks/endpoints.py +28 -0
- castor_extractor/warehouse/databricks/lineage.py +141 -0
- castor_extractor/warehouse/databricks/lineage_test.py +34 -0
- castor_extractor/warehouse/databricks/pagination.py +22 -0
- castor_extractor/warehouse/databricks/sql_client.py +90 -0
- castor_extractor/warehouse/databricks/utils.py +44 -1
- castor_extractor/warehouse/databricks/utils_test.py +58 -1
- castor_extractor/warehouse/mysql/client.py +0 -2
- castor_extractor/warehouse/salesforce/client.py +12 -59
- castor_extractor/warehouse/salesforce/pagination.py +34 -0
- castor_extractor/warehouse/sqlserver/client.py +0 -1
- castor_extractor-0.19.6.dist-info/METADATA +903 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/RECORD +77 -60
- castor_extractor/utils/client/api.py +0 -87
- castor_extractor/utils/client/api_test.py +0 -24
- castor_extractor/utils/pager/pager_on_token.py +0 -52
- castor_extractor/utils/pager/pager_on_token_test.py +0 -73
- castor_extractor/visualization/sigma/client/client_test.py +0 -54
- castor_extractor-0.19.0.dist-info/METADATA +0 -207
- /castor_extractor/utils/{safe_request.py → client/api/safe_request.py} +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/LICENCE +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/WHEEL +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
2
4
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
5
|
|
|
4
6
|
CASTOR_ENV_PREFIX = "CASTOR_SIGMA_"
|
|
@@ -17,3 +19,11 @@ class SigmaCredentials(BaseSettings):
|
|
|
17
19
|
client_id: str
|
|
18
20
|
host: str
|
|
19
21
|
grant_type: str = "client_credentials"
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def token_payload(self) -> Dict[str, str]:
|
|
25
|
+
return {
|
|
26
|
+
"grant_type": self.grant_type,
|
|
27
|
+
"client_id": self.client_id,
|
|
28
|
+
"client_secret": self.api_token,
|
|
29
|
+
}
|
|
@@ -1,24 +1,28 @@
|
|
|
1
|
-
import logging
|
|
2
1
|
from typing import Optional
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
from pydantic import ConfigDict, Field
|
|
4
|
+
from pydantic.alias_generators import to_camel
|
|
5
5
|
|
|
6
|
+
from ....utils import PaginationModel
|
|
6
7
|
|
|
7
|
-
|
|
8
|
-
"""This is a wrapper around Sigma's pagination system"""
|
|
8
|
+
SIGMA_API_LIMIT = 500 # default number of records per page
|
|
9
9
|
|
|
10
|
-
def __init__(
|
|
11
|
-
self,
|
|
12
|
-
next_page: Optional[str],
|
|
13
|
-
entries: Optional[list] = None,
|
|
14
|
-
total: Optional[int] = 0,
|
|
15
|
-
):
|
|
16
|
-
self.next_page = next_page
|
|
17
|
-
self.entries = entries or []
|
|
18
|
-
self.total = total
|
|
19
10
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
11
|
+
class SigmaPagination(PaginationModel):
|
|
12
|
+
next_page: Optional[str] = "0"
|
|
13
|
+
entries: list = Field(default_factory=list)
|
|
14
|
+
|
|
15
|
+
model_config = ConfigDict(
|
|
16
|
+
alias_generator=to_camel,
|
|
17
|
+
populate_by_name=True,
|
|
18
|
+
from_attributes=True,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
def is_last(self) -> bool:
|
|
22
|
+
return self.next_page is None
|
|
23
|
+
|
|
24
|
+
def next_page_payload(self) -> dict:
|
|
25
|
+
return {"page": self.next_page}
|
|
26
|
+
|
|
27
|
+
def page_results(self) -> list:
|
|
28
|
+
return self.entries
|
|
@@ -10,9 +10,20 @@ class TableauRevampAsset(ExternalAsset):
|
|
|
10
10
|
DASHBOARD = "dashboards"
|
|
11
11
|
DATASOURCE = "datasources"
|
|
12
12
|
FIELD = "fields"
|
|
13
|
+
METRIC = "metrics"
|
|
14
|
+
METRIC_DEFINITION = "metrics_definitions"
|
|
13
15
|
PROJECT = "projects"
|
|
14
16
|
SHEET = "sheets"
|
|
17
|
+
SUBSCRIPTION = "subscriptions"
|
|
15
18
|
TABLE = "tables"
|
|
16
19
|
USAGE = "usage"
|
|
17
20
|
USER = "users"
|
|
18
21
|
WORKBOOK = "workbooks"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# assets that are only available for clients using Tableau Pulse
|
|
25
|
+
TABLEAU_PULSE_ASSETS = (
|
|
26
|
+
TableauRevampAsset.METRIC,
|
|
27
|
+
TableauRevampAsset.METRIC_DEFINITION,
|
|
28
|
+
TableauRevampAsset.SUBSCRIPTION,
|
|
29
|
+
)
|
|
@@ -1,59 +1,39 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Dict, Iterator, List, Optional
|
|
3
2
|
|
|
4
3
|
import tableauserverclient as TSC # type: ignore
|
|
5
|
-
from tableauserverclient import Pager
|
|
6
4
|
|
|
7
5
|
from ....utils import SerializedAsset
|
|
8
|
-
from ..assets import TableauRevampAsset
|
|
9
|
-
from ..constants import
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
)
|
|
6
|
+
from ..assets import TABLEAU_PULSE_ASSETS, TableauRevampAsset
|
|
7
|
+
from ..constants import DEFAULT_TIMEOUT_SECONDS
|
|
8
|
+
from .client_metadata_api import TableauClientMetadataApi
|
|
9
|
+
from .client_rest_api import TableauClientRestApi
|
|
10
|
+
from .client_tsc import TableauClientTSC
|
|
14
11
|
from .credentials import TableauRevampCredentials
|
|
15
|
-
from .errors import TableauApiError
|
|
16
|
-
from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
|
|
17
|
-
from .tsc_fields import TSC_FIELDS
|
|
18
12
|
|
|
19
13
|
logger = logging.getLogger(__name__)
|
|
20
14
|
|
|
21
|
-
# these assets must be extracted via TableauServerClient
|
|
15
|
+
# these assets must be extracted via TableauServerClient (TSC)
|
|
22
16
|
_TSC_ASSETS = (
|
|
23
|
-
# only users who published content can be extracted from MetadataAPI
|
|
24
|
-
TableauRevampAsset.USER,
|
|
25
17
|
# projects are not available in Metadata API
|
|
26
18
|
TableauRevampAsset.PROJECT,
|
|
27
19
|
# view counts are not available in Metadata API
|
|
28
20
|
TableauRevampAsset.USAGE,
|
|
21
|
+
# only users who published content can be extracted from MetadataAPI
|
|
22
|
+
TableauRevampAsset.USER,
|
|
29
23
|
)
|
|
30
24
|
|
|
31
|
-
#
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# fields are light but volumes are bigger
|
|
39
|
-
TableauRevampAsset.FIELD: 1000,
|
|
40
|
-
TableauRevampAsset.TABLE: 50,
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _pick_fields(
|
|
45
|
-
data: Pager,
|
|
46
|
-
asset: TableauRevampAsset,
|
|
47
|
-
) -> SerializedAsset:
|
|
48
|
-
fields = TSC_FIELDS[asset]
|
|
49
|
-
|
|
50
|
-
def _pick(row: dict):
|
|
51
|
-
return {field: getattr(row, field) for field in fields}
|
|
25
|
+
# these assets must be extracted via the REST API
|
|
26
|
+
_REST_API_ASSETS = (
|
|
27
|
+
# Tableau Pulse assets are only available in REST API
|
|
28
|
+
TableauRevampAsset.METRIC,
|
|
29
|
+
TableauRevampAsset.METRIC_DEFINITION,
|
|
30
|
+
TableauRevampAsset.SUBSCRIPTION,
|
|
31
|
+
)
|
|
52
32
|
|
|
53
|
-
|
|
33
|
+
logging.getLogger("tableau.endpoint").setLevel(logging.WARNING)
|
|
54
34
|
|
|
55
35
|
|
|
56
|
-
def
|
|
36
|
+
def _merge_datasources(
|
|
57
37
|
datasources: SerializedAsset,
|
|
58
38
|
tsc_datasources: SerializedAsset,
|
|
59
39
|
) -> SerializedAsset:
|
|
@@ -71,14 +51,19 @@ def _enrich_datasources_with_tsc(
|
|
|
71
51
|
# embedded datasources cannot be accessed via URL => no webpage_url
|
|
72
52
|
continue
|
|
73
53
|
luid = datasource["luid"]
|
|
74
|
-
tsc_datasource = mapping
|
|
54
|
+
tsc_datasource = mapping.get(luid)
|
|
55
|
+
if not tsc_datasource:
|
|
56
|
+
# it happens that a datasource is in Metadata API but not in TSC
|
|
57
|
+
datasource["projectLuid"] = None
|
|
58
|
+
datasource["webpageUrl"] = None
|
|
59
|
+
continue
|
|
75
60
|
datasource["projectLuid"] = tsc_datasource["project_id"]
|
|
76
61
|
datasource["webpageUrl"] = tsc_datasource["webpage_url"]
|
|
77
62
|
|
|
78
63
|
return datasources
|
|
79
64
|
|
|
80
65
|
|
|
81
|
-
def
|
|
66
|
+
def _merge_workbooks(
|
|
82
67
|
workbooks: SerializedAsset,
|
|
83
68
|
tsc_workbooks: SerializedAsset,
|
|
84
69
|
) -> SerializedAsset:
|
|
@@ -104,61 +89,44 @@ def _enrich_workbooks_with_tsc(
|
|
|
104
89
|
return workbooks
|
|
105
90
|
|
|
106
91
|
|
|
107
|
-
def
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# If cursor is defined it must be quoted else use null token
|
|
116
|
-
token = "null" if cursor is None else f'"{cursor}"'
|
|
117
|
-
query_ = query.replace("AFTER_TOKEN_SIGNAL", token)
|
|
118
|
-
answer = server.metadata.query(query_)
|
|
119
|
-
if "errors" in answer:
|
|
120
|
-
raise TableauApiError(answer["errors"])
|
|
121
|
-
return answer["data"][f"{resource}Connection"]
|
|
122
|
-
|
|
123
|
-
cursor = None
|
|
124
|
-
while True:
|
|
125
|
-
payload = _call(cursor)
|
|
126
|
-
yield payload["nodes"]
|
|
127
|
-
|
|
128
|
-
page_info = payload["pageInfo"]
|
|
129
|
-
if page_info["hasNextPage"]:
|
|
130
|
-
cursor = page_info["endCursor"]
|
|
131
|
-
else:
|
|
132
|
-
break
|
|
92
|
+
def _server(
|
|
93
|
+
server_url: str,
|
|
94
|
+
timeout_sec: int,
|
|
95
|
+
) -> TSC.Server:
|
|
96
|
+
options = {"verify": True, "timeout": timeout_sec}
|
|
97
|
+
server = TSC.Server(server_url, use_server_version=True)
|
|
98
|
+
server.add_http_options(options)
|
|
99
|
+
return server
|
|
133
100
|
|
|
134
101
|
|
|
135
102
|
class TableauRevampClient:
|
|
136
103
|
"""
|
|
137
104
|
Connect to Tableau's API and extract assets.
|
|
138
105
|
|
|
139
|
-
Relies on TableauServerClient overlay
|
|
106
|
+
Relies on TableauServerClient (TSC) overlay for authentication
|
|
140
107
|
https://tableau.github.io/server-client-python/docs/
|
|
141
|
-
- for connection
|
|
142
|
-
- to extract Users (Metadata
|
|
143
|
-
|
|
144
|
-
Calls the MetadataAPI, using graphQL
|
|
145
|
-
https://help.tableau.com/current/api/metadata_api/en-us/reference/index.html
|
|
146
108
|
"""
|
|
147
109
|
|
|
148
110
|
def __init__(
|
|
149
111
|
self,
|
|
150
112
|
credentials: TableauRevampCredentials,
|
|
151
113
|
timeout_sec: int = DEFAULT_TIMEOUT_SECONDS,
|
|
114
|
+
with_pulse: bool = False,
|
|
152
115
|
):
|
|
153
116
|
self._credentials = credentials
|
|
154
|
-
self._server =
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
self.
|
|
158
|
-
self.
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
117
|
+
self._server = _server(credentials.server_url, timeout_sec)
|
|
118
|
+
self._with_pulse = with_pulse
|
|
119
|
+
|
|
120
|
+
self._client_metadata = TableauClientMetadataApi(server=self._server)
|
|
121
|
+
self._client_rest = TableauClientRestApi(server=self._server)
|
|
122
|
+
self._client_tsc = TableauClientTSC(server=self._server)
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
def base_url(self) -> str:
|
|
126
|
+
return self._credentials.server_url
|
|
127
|
+
|
|
128
|
+
@property
|
|
129
|
+
def name(self) -> str:
|
|
162
130
|
return "Tableau/API"
|
|
163
131
|
|
|
164
132
|
def _user_password_login(self) -> None:
|
|
@@ -203,98 +171,50 @@ class TableauRevampClient:
|
|
|
203
171
|
"Invalid credentials: either user/password or PAT must be provided",
|
|
204
172
|
)
|
|
205
173
|
|
|
206
|
-
def base_url(self) -> str:
|
|
207
|
-
return self._credentials.server_url
|
|
208
|
-
|
|
209
|
-
def _fetch_from_tsc(
|
|
210
|
-
self,
|
|
211
|
-
asset: TableauRevampAsset,
|
|
212
|
-
) -> SerializedAsset:
|
|
213
|
-
if asset == TableauRevampAsset.DATASOURCE:
|
|
214
|
-
data = TSC.Pager(self._server.datasources)
|
|
215
|
-
|
|
216
|
-
elif asset == TableauRevampAsset.PROJECT:
|
|
217
|
-
data = TSC.Pager(self._server.projects)
|
|
218
|
-
|
|
219
|
-
elif asset == TableauRevampAsset.USAGE:
|
|
220
|
-
data = TSC.Pager(self._server.views, usage=True)
|
|
221
|
-
|
|
222
|
-
elif asset == TableauRevampAsset.USER:
|
|
223
|
-
data = TSC.Pager(self._server.users)
|
|
224
|
-
|
|
225
|
-
elif asset == TableauRevampAsset.WORKBOOK:
|
|
226
|
-
data = TSC.Pager(self._server.workbooks)
|
|
227
|
-
|
|
228
|
-
else:
|
|
229
|
-
raise AssertionError(f"Fetching from TSC not supported for {asset}")
|
|
230
|
-
|
|
231
|
-
return _pick_fields(data, asset)
|
|
232
|
-
|
|
233
|
-
def _run_graphql_query(
|
|
234
|
-
self,
|
|
235
|
-
resource: str,
|
|
236
|
-
fields: str,
|
|
237
|
-
page_size: int = DEFAULT_PAGE_SIZE,
|
|
238
|
-
) -> SerializedAsset:
|
|
239
|
-
query = QUERY_TEMPLATE.format(
|
|
240
|
-
resource=resource,
|
|
241
|
-
fields=fields,
|
|
242
|
-
page_size=page_size,
|
|
243
|
-
)
|
|
244
|
-
result_pages = gql_query_scroll(self._server, query, resource)
|
|
245
|
-
return [asset for page in result_pages for asset in page]
|
|
246
|
-
|
|
247
|
-
def _fetch_fields(self) -> SerializedAsset:
|
|
248
|
-
result: SerializedAsset = []
|
|
249
|
-
page_size = _CUSTOM_PAGE_SIZE[TableauRevampAsset.FIELD]
|
|
250
|
-
for resource, fields in FIELDS_QUERIES:
|
|
251
|
-
current = self._run_graphql_query(resource, fields, page_size)
|
|
252
|
-
result.extend(current)
|
|
253
|
-
return result
|
|
254
|
-
|
|
255
|
-
def _fetch_from_metadata_api(
|
|
256
|
-
self,
|
|
257
|
-
asset: TableauRevampAsset,
|
|
258
|
-
) -> SerializedAsset:
|
|
259
|
-
if asset == TableauRevampAsset.FIELD:
|
|
260
|
-
return self._fetch_fields()
|
|
261
|
-
|
|
262
|
-
page_size = _CUSTOM_PAGE_SIZE.get(asset) or DEFAULT_PAGE_SIZE
|
|
263
|
-
resource, fields = GQL_QUERIES[asset]
|
|
264
|
-
return self._run_graphql_query(resource, fields, page_size)
|
|
265
|
-
|
|
266
174
|
def _fetch_datasources(self) -> SerializedAsset:
|
|
267
175
|
asset = TableauRevampAsset.DATASOURCE
|
|
268
176
|
|
|
269
|
-
datasources = self.
|
|
270
|
-
|
|
177
|
+
datasources = self._client_metadata.fetch(asset)
|
|
178
|
+
tsc_datasources = self._client_tsc.fetch(asset)
|
|
271
179
|
|
|
272
|
-
return
|
|
180
|
+
return _merge_datasources(datasources, tsc_datasources)
|
|
273
181
|
|
|
274
182
|
def _fetch_workbooks(self) -> SerializedAsset:
|
|
275
183
|
asset = TableauRevampAsset.WORKBOOK
|
|
276
184
|
|
|
277
|
-
workbooks = self.
|
|
278
|
-
workbook_projects = self.
|
|
185
|
+
workbooks = self._client_metadata.fetch(asset)
|
|
186
|
+
workbook_projects = self._client_tsc.fetch(asset)
|
|
279
187
|
|
|
280
|
-
return
|
|
188
|
+
return _merge_workbooks(workbooks, workbook_projects)
|
|
281
189
|
|
|
282
|
-
def fetch(
|
|
190
|
+
def fetch(
|
|
191
|
+
self,
|
|
192
|
+
asset: TableauRevampAsset,
|
|
193
|
+
) -> SerializedAsset:
|
|
283
194
|
"""
|
|
284
195
|
Extract the given Tableau Asset
|
|
285
196
|
"""
|
|
197
|
+
if asset in TABLEAU_PULSE_ASSETS and not self._with_pulse:
|
|
198
|
+
logger.info(f"Skipping asset {asset} - Tableau Pulse de-activated")
|
|
199
|
+
return []
|
|
200
|
+
|
|
201
|
+
logger.info(f"Extracting {asset.name}...")
|
|
286
202
|
|
|
287
203
|
if asset == TableauRevampAsset.DATASOURCE:
|
|
288
|
-
#
|
|
204
|
+
# two APIs are required to extract datasources
|
|
289
205
|
return self._fetch_datasources()
|
|
290
206
|
|
|
291
207
|
if asset == TableauRevampAsset.WORKBOOK:
|
|
292
|
-
#
|
|
208
|
+
# two APIs are required to extract workbooks
|
|
293
209
|
return self._fetch_workbooks()
|
|
294
210
|
|
|
295
211
|
if asset in _TSC_ASSETS:
|
|
296
212
|
# some assets can only be extracted via TSC
|
|
297
|
-
return self.
|
|
213
|
+
return self._client_tsc.fetch(asset)
|
|
214
|
+
|
|
215
|
+
if asset in _REST_API_ASSETS:
|
|
216
|
+
# some assets can only be extracted via REST API
|
|
217
|
+
return self._client_rest.fetch(asset)
|
|
298
218
|
|
|
299
|
-
#
|
|
300
|
-
return self.
|
|
219
|
+
# other assets can be extracted via Metadata API
|
|
220
|
+
return self._client_metadata.fetch(asset)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from typing import Dict, Iterator, Optional
|
|
2
|
+
|
|
3
|
+
import tableauserverclient as TSC # type: ignore
|
|
4
|
+
|
|
5
|
+
from ....utils import SerializedAsset
|
|
6
|
+
from ..assets import TableauRevampAsset
|
|
7
|
+
from ..constants import DEFAULT_PAGE_SIZE
|
|
8
|
+
from .errors import TableauApiError
|
|
9
|
+
from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
|
|
10
|
+
|
|
11
|
+
# increase the value when extraction is too slow
|
|
12
|
+
# decrease the value when timeouts arise
|
|
13
|
+
_CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
|
|
14
|
+
# for some clients, extraction of columns tends to hit the node limit
|
|
15
|
+
# https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
|
|
16
|
+
# the workaround is to reduce pagination
|
|
17
|
+
TableauRevampAsset.COLUMN: 50,
|
|
18
|
+
# fields are light but volumes are bigger
|
|
19
|
+
TableauRevampAsset.FIELD: 1000,
|
|
20
|
+
TableauRevampAsset.TABLE: 50,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def gql_query_scroll(
|
|
25
|
+
server,
|
|
26
|
+
query: str,
|
|
27
|
+
resource: str,
|
|
28
|
+
) -> Iterator[SerializedAsset]:
|
|
29
|
+
"""Iterate over GQL query results, handling pagination and cursor"""
|
|
30
|
+
|
|
31
|
+
def _call(cursor: Optional[str]) -> dict:
|
|
32
|
+
# If cursor is defined it must be quoted else use null token
|
|
33
|
+
token = "null" if cursor is None else f'"{cursor}"'
|
|
34
|
+
query_ = query.replace("AFTER_TOKEN_SIGNAL", token)
|
|
35
|
+
answer = server.metadata.query(query_)
|
|
36
|
+
if "errors" in answer:
|
|
37
|
+
raise TableauApiError(answer["errors"])
|
|
38
|
+
return answer["data"][f"{resource}Connection"]
|
|
39
|
+
|
|
40
|
+
cursor = None
|
|
41
|
+
while True:
|
|
42
|
+
payload = _call(cursor)
|
|
43
|
+
yield payload["nodes"]
|
|
44
|
+
|
|
45
|
+
page_info = payload["pageInfo"]
|
|
46
|
+
if page_info["hasNextPage"]:
|
|
47
|
+
cursor = page_info["endCursor"]
|
|
48
|
+
else:
|
|
49
|
+
break
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TableauClientMetadataApi:
|
|
53
|
+
"""
|
|
54
|
+
Calls the MetadataAPI, using graphQL
|
|
55
|
+
https://help.tableau.com/current/api/metadata_api/en-us/reference/index.html
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def __init__(
|
|
59
|
+
self,
|
|
60
|
+
server: TSC.Server,
|
|
61
|
+
):
|
|
62
|
+
self._server = server
|
|
63
|
+
|
|
64
|
+
def _call(
|
|
65
|
+
self,
|
|
66
|
+
resource: str,
|
|
67
|
+
fields: str,
|
|
68
|
+
page_size: int = DEFAULT_PAGE_SIZE,
|
|
69
|
+
) -> SerializedAsset:
|
|
70
|
+
query = QUERY_TEMPLATE.format(
|
|
71
|
+
resource=resource,
|
|
72
|
+
fields=fields,
|
|
73
|
+
page_size=page_size,
|
|
74
|
+
)
|
|
75
|
+
result_pages = gql_query_scroll(self._server, query, resource)
|
|
76
|
+
return [asset for page in result_pages for asset in page]
|
|
77
|
+
|
|
78
|
+
def _fetch_fields(self) -> SerializedAsset:
|
|
79
|
+
result: SerializedAsset = []
|
|
80
|
+
page_size = _CUSTOM_PAGE_SIZE[TableauRevampAsset.FIELD]
|
|
81
|
+
for resource, fields in FIELDS_QUERIES:
|
|
82
|
+
current = self._call(resource, fields, page_size)
|
|
83
|
+
result.extend(current)
|
|
84
|
+
return result
|
|
85
|
+
|
|
86
|
+
def fetch(
|
|
87
|
+
self,
|
|
88
|
+
asset: TableauRevampAsset,
|
|
89
|
+
) -> SerializedAsset:
|
|
90
|
+
if asset == TableauRevampAsset.FIELD:
|
|
91
|
+
return self._fetch_fields()
|
|
92
|
+
|
|
93
|
+
page_size = _CUSTOM_PAGE_SIZE.get(asset) or DEFAULT_PAGE_SIZE
|
|
94
|
+
resource, fields = GQL_QUERIES[asset]
|
|
95
|
+
return self._call(resource, fields, page_size)
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
import tableauserverclient as TSC # type: ignore
|
|
6
|
+
|
|
7
|
+
from ....utils import SerializedAsset, deduplicate
|
|
8
|
+
from ..assets import TableauRevampAsset
|
|
9
|
+
from .rest_fields import REST_FIELDS
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
_PULSE_API = "api/-/pulse"
|
|
14
|
+
|
|
15
|
+
_METRICS_URL = "{base}/pulse/site/{site}/metrics/{definition_id}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _pick(
|
|
19
|
+
data: SerializedAsset,
|
|
20
|
+
asset: TableauRevampAsset,
|
|
21
|
+
) -> SerializedAsset:
|
|
22
|
+
keys = REST_FIELDS[asset]
|
|
23
|
+
return [{key: row[key] for key in keys} for row in data]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TableauClientRestApi:
|
|
27
|
+
"""
|
|
28
|
+
Extract Tableau Assets using REST API
|
|
29
|
+
https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref.htm
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
def __init__(
|
|
33
|
+
self,
|
|
34
|
+
server: TSC.Server,
|
|
35
|
+
):
|
|
36
|
+
self._server = server
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def timeout_sec(self) -> int:
|
|
40
|
+
return self._server.http_options["timeout"]
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def headers(self) -> Dict[str, str]:
|
|
44
|
+
return {"x-tableau-auth": self._server.auth_token}
|
|
45
|
+
|
|
46
|
+
def _get_site_name(self) -> str:
|
|
47
|
+
site_id = self._server.site_id
|
|
48
|
+
site = self._server.sites.get_by_id(site_id)
|
|
49
|
+
return site.content_url
|
|
50
|
+
|
|
51
|
+
def _get(
|
|
52
|
+
self,
|
|
53
|
+
url: str,
|
|
54
|
+
page_token: Optional[str] = None,
|
|
55
|
+
) -> Dict:
|
|
56
|
+
if page_token:
|
|
57
|
+
url += f"?page_token={page_token}"
|
|
58
|
+
|
|
59
|
+
logger.debug(f"Calling REST API: {url}")
|
|
60
|
+
response = requests.get(
|
|
61
|
+
url,
|
|
62
|
+
headers=self.headers,
|
|
63
|
+
timeout=self.timeout_sec,
|
|
64
|
+
)
|
|
65
|
+
response.raise_for_status()
|
|
66
|
+
return response.json()
|
|
67
|
+
|
|
68
|
+
def _call(
|
|
69
|
+
self,
|
|
70
|
+
path: str,
|
|
71
|
+
target: str,
|
|
72
|
+
) -> SerializedAsset:
|
|
73
|
+
base = self._server.server_address.strip("/")
|
|
74
|
+
url = f"{base}/{path}/{target}"
|
|
75
|
+
|
|
76
|
+
next_page_token = None
|
|
77
|
+
data = []
|
|
78
|
+
|
|
79
|
+
while True:
|
|
80
|
+
response = self._get(url, next_page_token)
|
|
81
|
+
data += response[target]
|
|
82
|
+
next_page_token = response.get("next_page_token")
|
|
83
|
+
if not next_page_token:
|
|
84
|
+
break
|
|
85
|
+
|
|
86
|
+
return data
|
|
87
|
+
|
|
88
|
+
def _compute_metric_url(self, data: SerializedAsset) -> None:
|
|
89
|
+
site = self._get_site_name()
|
|
90
|
+
base_url = self._server.server_address.strip("/")
|
|
91
|
+
for row in data:
|
|
92
|
+
row["metadata"]["url"] = _METRICS_URL.format(
|
|
93
|
+
base=base_url,
|
|
94
|
+
site=site,
|
|
95
|
+
definition_id=row["metadata"]["id"],
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
def _fetch_metrics(self, definitions: SerializedAsset) -> SerializedAsset:
|
|
99
|
+
metrics = []
|
|
100
|
+
for definition in definitions:
|
|
101
|
+
definition_id = definition["metadata"]["id"]
|
|
102
|
+
path = f"{_PULSE_API}/definitions/{definition_id}"
|
|
103
|
+
metrics += self._call(path=path, target="metrics")
|
|
104
|
+
|
|
105
|
+
# for some reason, the REST API sometimes sends the same metric twice
|
|
106
|
+
return deduplicate("id", metrics)
|
|
107
|
+
|
|
108
|
+
def fetch(
|
|
109
|
+
self,
|
|
110
|
+
asset: TableauRevampAsset,
|
|
111
|
+
) -> SerializedAsset:
|
|
112
|
+
if asset == TableauRevampAsset.SUBSCRIPTION:
|
|
113
|
+
# https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref_pulse.htm#PulseSubscriptionService_ListSubscriptions
|
|
114
|
+
data = self._call(path=_PULSE_API, target="subscriptions")
|
|
115
|
+
|
|
116
|
+
elif asset == TableauRevampAsset.METRIC_DEFINITION:
|
|
117
|
+
# https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref_pulse.htm#MetricQueryService_ListDefinitions
|
|
118
|
+
data = self._call(path=_PULSE_API, target="definitions")
|
|
119
|
+
self._compute_metric_url(data)
|
|
120
|
+
|
|
121
|
+
elif asset == TableauRevampAsset.METRIC:
|
|
122
|
+
# https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref_pulse.htm#MetricQueryService_ListMetrics
|
|
123
|
+
definitions = self._call(path=_PULSE_API, target="definitions")
|
|
124
|
+
data = self._fetch_metrics(definitions)
|
|
125
|
+
else:
|
|
126
|
+
raise AssertionError(f"Unsupported asset {asset} for REST API")
|
|
127
|
+
|
|
128
|
+
return _pick(data, asset)
|