castor-extractor 0.16.5__py3-none-any.whl → 0.16.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +17 -0
- castor_extractor/utils/__init__.py +2 -1
- castor_extractor/utils/collection.py +32 -0
- castor_extractor/utils/collection_test.py +60 -0
- castor_extractor/utils/time.py +9 -1
- castor_extractor/utils/time_test.py +8 -1
- castor_extractor/visualization/domo/client/client.py +28 -43
- castor_extractor/visualization/domo/client/client_test.py +1 -23
- castor_extractor/visualization/domo/client/endpoints.py +13 -6
- castor_extractor/visualization/domo/client/pagination.py +4 -0
- castor_extractor/visualization/looker/api/client.py +21 -17
- castor_extractor/visualization/looker/api/sdk.py +10 -58
- castor_extractor/visualization/looker/api/utils.py +1 -1
- castor_extractor/visualization/looker/extract.py +2 -1
- castor_extractor/visualization/looker/multithreading.py +1 -1
- castor_extractor/visualization/tableau_revamp/__init__.py +3 -0
- castor_extractor/visualization/tableau_revamp/assets.py +18 -0
- castor_extractor/visualization/tableau_revamp/client/__init__.py +2 -0
- castor_extractor/visualization/tableau_revamp/client/client.py +297 -0
- castor_extractor/visualization/tableau_revamp/client/credentials.py +104 -0
- castor_extractor/visualization/tableau_revamp/client/errors.py +3 -0
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +134 -0
- castor_extractor/visualization/tableau_revamp/client/tsc_fields.py +34 -0
- castor_extractor/visualization/tableau_revamp/constants.py +5 -0
- castor_extractor/visualization/tableau_revamp/extract.py +53 -0
- castor_extractor/warehouse/databricks/client.py +12 -5
- castor_extractor/warehouse/databricks/client_test.py +22 -3
- castor_extractor/warehouse/databricks/format.py +5 -1
- {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/METADATA +6 -3
- {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/RECORD +33 -22
- {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/LICENCE +0 -0
- {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/WHEEL +0 -0
- {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Iterator, List, Optional
|
|
3
|
+
|
|
4
|
+
import tableauserverclient as TSC # type: ignore
|
|
5
|
+
from tableauserverclient import Pager
|
|
6
|
+
|
|
7
|
+
from ....utils import SerializedAsset
|
|
8
|
+
from ..assets import TableauRevampAsset
|
|
9
|
+
from ..constants import (
|
|
10
|
+
DEFAULT_PAGE_SIZE,
|
|
11
|
+
DEFAULT_TIMEOUT_SECONDS,
|
|
12
|
+
TABLEAU_SERVER_VERSION,
|
|
13
|
+
)
|
|
14
|
+
from .credentials import TableauRevampCredentials
|
|
15
|
+
from .errors import TableauApiError
|
|
16
|
+
from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
|
|
17
|
+
from .tsc_fields import TSC_FIELDS
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# these assets must be extracted via TableauServerClient
# (`TableauRevampClient.fetch` checks membership in this tuple to pick the API)
_TSC_ASSETS = (
    # only users who published content can be extracted from MetadataAPI
    TableauRevampAsset.USER,
    # projects are not available in Metadata API
    TableauRevampAsset.PROJECT,
    # view counts are not available in Metadata API
    TableauRevampAsset.USAGE,
)

# speed up extraction: fields and columns are smaller but volumes are bigger
# (assets missing from this mapping fall back to DEFAULT_PAGE_SIZE)
_CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
    TableauRevampAsset.FIELD: 1000,
    TableauRevampAsset.COLUMN: 1000,
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _pick_fields(
    data: Pager,
    asset: TableauRevampAsset,
) -> SerializedAsset:
    """
    Turn the items yielded by a TSC pager into dictionaries, keeping only
    the attributes listed in TSC_FIELDS for the given asset.
    """
    wanted = TSC_FIELDS[asset]

    picked = []
    for item in data:
        # items are read with getattr: every listed name must be an attribute
        picked.append({name: getattr(item, name) for name in wanted})
    return picked
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _enrich_datasources_with_tsc(
    datasources: SerializedAsset,
    tsc_datasources: SerializedAsset,
) -> SerializedAsset:
    """
    Enrich datasources with fields coming from TableauServerClient:
    - project_luid
    - webpage_url

    Only rows whose `__typename` is "PublishedDatasource" are enriched.
    The rows of `datasources` are updated in place and the same list is
    returned.

    A published datasource that cannot be found in the TSC rows does not
    abort the extraction: a warning is logged and both fields are set to
    None (same policy as `_enrich_workbooks_with_tsc`).
    """

    # TSC exposes the luid under the "id" key
    mapping = {row["id"]: row for row in tsc_datasources}

    for datasource in datasources:
        if datasource["__typename"] != "PublishedDatasource":
            # embedded datasources are bound to workbooks => no project
            # embedded datasources cannot be accessed via URL => no webpage_url
            continue

        luid = datasource["luid"]
        tsc_datasource = mapping.get(luid)
        if not tsc_datasource:
            # the two APIs may disagree: do not crash on a missing luid
            logger.warning(f"Datasource {luid} was not found in TSC")
            datasource["projectLuid"] = None
            datasource["webpageUrl"] = None
            continue

        datasource["projectLuid"] = tsc_datasource["project_id"]
        datasource["webpageUrl"] = tsc_datasource["webpage_url"]

    return datasources
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _enrich_workbooks_with_tsc(
    workbooks: SerializedAsset,
    tsc_workbooks: SerializedAsset,
) -> SerializedAsset:
    """
    Add the `projectLuid` key to each workbook, using the rows fetched
    from TableauServerClient (matched on workbook luid == TSC id).

    Workbooks are updated in place; the same list is returned.
    """
    tsc_by_luid = {}
    for row in tsc_workbooks:
        tsc_by_luid[row["id"]] = row

    for workbook in workbooks:
        luid = workbook["luid"]
        match = tsc_by_luid.get(luid)
        if match:
            workbook["projectLuid"] = match["project_id"]
        else:
            # it happens that a workbook is in Metadata API but not in TSC
            # in this case, we push the workbook with default project
            logger.warning(f"Workbook {luid} was not found in TSC")
            workbook["projectLuid"] = None

    return workbooks
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def gql_query_scroll(
    server,
    query: str,
    resource: str,
) -> Iterator[SerializedAsset]:
    """
    Yield the result pages of a GQL query, one list of nodes per page.

    The AFTER_TOKEN_SIGNAL marker of `query` is replaced before each call
    by the cursor of the previous page (or `null` for the first page).
    Raises TableauApiError when the answer contains an "errors" entry.
    """
    connection_key = f"{resource}Connection"
    after: Optional[str] = None
    has_next = True

    while has_next:
        # a defined cursor must be quoted, otherwise the null token is used
        after_token = f'"{after}"' if after is not None else "null"
        answer = server.metadata.query(
            query.replace("AFTER_TOKEN_SIGNAL", after_token),
        )
        if "errors" in answer:
            raise TableauApiError(answer["errors"])

        connection = answer["data"][connection_key]
        yield connection["nodes"]

        page_info = connection["pageInfo"]
        has_next = page_info["hasNextPage"]
        if has_next:
            after = page_info["endCursor"]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class TableauRevampClient:
    """
    Connect to Tableau's API and extract assets.

    Two access paths are combined:
    - TableauServerClient (TSC), used to sign in and to fetch the assets
      listed in _TSC_ASSETS
      https://tableau.github.io/server-client-python/docs/
    - the Metadata API (GraphQL), used for the remaining assets
      https://help.tableau.com/current/api/metadata_api/en-us/reference/index.html

    Datasources and workbooks need both: rows come from the Metadata API
    and are then enriched with TSC fields.
    """

    def __init__(
        self,
        credentials: TableauRevampCredentials,
        timeout_sec: int = DEFAULT_TIMEOUT_SECONDS,
    ):
        self._credentials = credentials

        server = TSC.Server(credentials.server_url)
        server.add_http_options({"verify": True, "timeout": timeout_sec})
        server.version = TABLEAU_SERVER_VERSION
        self._server = server

        self.errors: List[str] = []

    @staticmethod
    def name() -> str:
        """Label of this client (passed to `write_summary` by the extractor)"""
        return "Tableau/API"

    def _user_password_login(self) -> None:
        """Sign in with the user/password pair of the credentials"""
        credentials = self._credentials
        auth = TSC.TableauAuth(
            credentials.user,
            credentials.password,
            site_id=credentials.site_id,
        )
        self._server.auth.sign_in(auth)

    def _pat_login(self) -> None:
        """Sign in with the personal access token of the credentials"""
        credentials = self._credentials
        auth = TSC.PersonalAccessTokenAuth(
            credentials.token_name,
            credentials.token,
            site_id=credentials.site_id,
        )
        self._server.auth.sign_in(auth)

    def login(self) -> None:
        """
        Sign in with the first complete pair found in the credentials:
        - user/password
        - token_name/token (Personal Access Token)
        https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_auth.htm

        Raises ValueError when neither pair is complete.
        """
        credentials = self._credentials

        # user/password is checked first and wins when both pairs are set
        if credentials.user and credentials.password:
            logger.info("Logging in using user and password authentication")
            return self._user_password_login()

        if credentials.token_name and credentials.token:
            logger.info("Logging in using token authentication")
            return self._pat_login()

        message = (
            "Invalid credentials: either user/password or PAT must be provided"
        )
        raise ValueError(message)

    def base_url(self) -> str:
        """Server URL taken from the credentials"""
        return self._credentials.server_url

    def _fetch_from_tsc(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """
        Fetch the given asset through TableauServerClient and keep only
        the fields configured for it.

        Raises AssertionError for assets without a TSC endpoint here.
        """
        server = self._server

        # pagers are built lazily so that only the matching endpoint is used
        endpoints = (
            (
                TableauRevampAsset.DATASOURCE,
                lambda: TSC.Pager(server.datasources),
            ),
            (
                TableauRevampAsset.PROJECT,
                lambda: TSC.Pager(server.projects),
            ),
            (
                TableauRevampAsset.USAGE,
                lambda: TSC.Pager(server.views, usage=True),
            ),
            (
                TableauRevampAsset.USER,
                lambda: TSC.Pager(server.users),
            ),
            (
                TableauRevampAsset.WORKBOOK,
                lambda: TSC.Pager(server.workbooks),
            ),
        )

        for candidate, build_pager in endpoints:
            if asset == candidate:
                return _pick_fields(build_pager(), asset)

        raise AssertionError(f"Fetching from TSC not supported for {asset}")

    def _run_graphql_query(
        self,
        resource: str,
        fields: str,
        page_size: int = DEFAULT_PAGE_SIZE,
    ) -> SerializedAsset:
        """Run a paginated Metadata API query and return all rows as one list"""
        query = QUERY_TEMPLATE.format(
            resource=resource,
            fields=fields,
            page_size=page_size,
        )

        rows: SerializedAsset = []
        for page in gql_query_scroll(self._server, query, resource):
            rows.extend(page)
        return rows

    def _fetch_fields(self) -> SerializedAsset:
        """Concatenate the rows of every query listed in FIELDS_QUERIES"""
        page_size = _CUSTOM_PAGE_SIZE[TableauRevampAsset.FIELD]
        return [
            row
            for resource, fields in FIELDS_QUERIES
            for row in self._run_graphql_query(resource, fields, page_size)
        ]

    def _fetch_from_metadata_api(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """Fetch the given asset through the Metadata API"""
        if asset != TableauRevampAsset.FIELD:
            page_size = _CUSTOM_PAGE_SIZE.get(asset) or DEFAULT_PAGE_SIZE
            resource, fields = GQL_QUERIES[asset]
            return self._run_graphql_query(resource, fields, page_size)

        # fields are spread over several resources
        return self._fetch_fields()

    def _fetch_datasources(self) -> SerializedAsset:
        """Datasources from the Metadata API, enriched with TSC fields"""
        asset = TableauRevampAsset.DATASOURCE
        return _enrich_datasources_with_tsc(
            self._fetch_from_metadata_api(asset),
            self._fetch_from_tsc(asset),
        )

    def _fetch_workbooks(self) -> SerializedAsset:
        """Workbooks from the Metadata API, enriched with TSC fields"""
        asset = TableauRevampAsset.WORKBOOK
        return _enrich_workbooks_with_tsc(
            self._fetch_from_metadata_api(asset),
            self._fetch_from_tsc(asset),
        )

    def fetch(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """
        Extract the given Tableau Asset
        """
        # both APIs are required to extract datasources and workbooks
        if asset == TableauRevampAsset.DATASOURCE:
            return self._fetch_datasources()
        if asset == TableauRevampAsset.WORKBOOK:
            return self._fetch_workbooks()

        # some assets can only be extracted via TSC,
        # all the others come from the Metadata API
        if asset in _TSC_ASSETS:
            source = self._fetch_from_tsc
        else:
            source = self._fetch_from_metadata_api
        return source(asset)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Dict, Literal, Optional, overload
|
|
3
|
+
|
|
4
|
+
from ....utils import from_env
|
|
5
|
+
|
|
6
|
+
# NOTE(review): not referenced in the visible part of this module - confirm
# whether it is still used elsewhere
_AUTH_ERROR_MSG = "Need either user and password or token_name and token"

# To specify the default site on Tableau Server, you can use an empty string
# https://tableau.github.io/server-client-python/docs/api-ref#authentication
_DEFAULT_SERVER_SITE_ID = ""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CredentialsKey(Enum):
    """
    Value enum object for the credentials

    Each value is the name under which `get_value` looks the credential up
    in the given kwargs.
    """

    # the noqa markers silence the hardcoded-password lint rule (S105):
    # these strings are key names, not secrets
    TABLEAU_USER = "user"
    TABLEAU_PASSWORD = "password"  # noqa: S105
    TABLEAU_TOKEN_NAME = "token_name"  # noqa: S105
    TABLEAU_TOKEN = "token"  # noqa: S105
    TABLEAU_SITE_ID = "site_id"
    TABLEAU_SERVER_URL = "server_url"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Environment variable read by `get_value` for each credential when the
# value is not given in kwargs
CREDENTIALS_ENV: Dict[CredentialsKey, str] = {
    CredentialsKey.TABLEAU_USER: "CASTOR_TABLEAU_USER",
    CredentialsKey.TABLEAU_PASSWORD: "CASTOR_TABLEAU_PASSWORD",
    CredentialsKey.TABLEAU_TOKEN_NAME: "CASTOR_TABLEAU_TOKEN_NAME",
    CredentialsKey.TABLEAU_TOKEN: "CASTOR_TABLEAU_TOKEN",
    CredentialsKey.TABLEAU_SITE_ID: "CASTOR_TABLEAU_SITE_ID",
    CredentialsKey.TABLEAU_SERVER_URL: "CASTOR_TABLEAU_SERVER_URL",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@overload
def get_value(key: CredentialsKey, kwargs: dict) -> Optional[str]: ...


@overload
def get_value(
    key: CredentialsKey, kwargs: dict, optional: Literal[True]
) -> Optional[str]: ...


@overload
def get_value(
    key: CredentialsKey, kwargs: dict, optional: Literal[False]
) -> str: ...


def get_value(
    key: CredentialsKey,
    kwargs: dict,
    optional: bool = True,
) -> Optional[str]:
    """
    Look up a credential:
    - kwargs wins whenever it contains the key (whatever the value is)
    - otherwise the matching environment variable is read via `from_env`

    `optional` is forwarded to `from_env`, which is expected to raise when
    a mandatory variable is missing.
    """
    name = key.value
    if name not in kwargs:
        return from_env(CREDENTIALS_ENV[key], optional)
    return kwargs[name]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class TableauRevampCredentials:
    """
    Tableau's credentials to connect to REST API

    Holds the server URL, the site id and the two possible authentication
    pairs (user/password, token_name/token). No validation is done here:
    any of the authentication values may be None.
    """

    def __init__(
        self,
        *,
        server_url: str,
        site_id: Optional[str],
        user: Optional[str],
        password: Optional[str],
        token_name: Optional[str],
        token: Optional[str],
    ):
        self.user = user
        # a missing (or empty) site id falls back to the default site
        self.site_id = site_id or _DEFAULT_SERVER_SITE_ID
        self.server_url = server_url
        self.password = password
        self.token_name = token_name
        self.token = token

    @classmethod
    def from_env(cls, kwargs: dict) -> "TableauRevampCredentials":
        """
        Alternate constructor: each value is resolved with `get_value`
        (kwargs first, environment otherwise). Only the server URL is
        requested as mandatory.

        Instantiates `cls` so that subclasses get an instance of their own
        type.
        """
        return cls(
            server_url=get_value(
                CredentialsKey.TABLEAU_SERVER_URL,
                kwargs,
                optional=False,
            ),
            site_id=get_value(CredentialsKey.TABLEAU_SITE_ID, kwargs),
            user=get_value(CredentialsKey.TABLEAU_USER, kwargs),
            password=get_value(CredentialsKey.TABLEAU_PASSWORD, kwargs),
            token_name=get_value(CredentialsKey.TABLEAU_TOKEN_NAME, kwargs),
            token=get_value(CredentialsKey.TABLEAU_TOKEN, kwargs),
        )
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from typing import Dict, Tuple
|
|
2
|
+
|
|
3
|
+
from ..assets import TableauRevampAsset
|
|
4
|
+
|
|
5
|
+
# Skeleton of every paginated Metadata API query.
# - {resource}, {page_size} and {fields} are filled with str.format, which is
#   why literal GraphQL braces are doubled
# - AFTER_TOKEN_SIGNAL is not a format placeholder: it is substituted by the
#   page cursor for each call (see gql_query_scroll in client.py)
QUERY_TEMPLATE = """
{{
{resource}Connection(first: {page_size}, after: AFTER_TOKEN_SIGNAL) {{
nodes {{ {fields}
}}
pageInfo {{
hasNextPage
endCursor
}}
totalCount
}}
}}
"""

# The constants below are the field selections injected as {fields}

_COLUMNS_QUERY = """
downstreamDashboards { id }
downstreamFields { id }
downstreamWorkbooks { id }
id
name
table { id }
"""

_DASHBOARDS_QUERY = """
createdAt
id
name
path
tags { name }
updatedAt
workbook { id }
"""

# `__typename` and `luid` are read back by _enrich_datasources_with_tsc
_DATASOURCES_QUERY = """
__typename
downstreamDashboards { id }
downstreamWorkbooks { id }
id
name
... on PublishedDatasource {
description
luid
owner { luid }
site { name }
tags { name }
uri
}
"""

_TABLES_QUERY = """
__typename
downstreamDashboards { id }
downstreamDatasources { id }
downstreamWorkbooks { id }
id
name
... on DatabaseTable {
connectionType
fullName
schema
}
... on CustomSQLTable {
query
}
"""


# `luid` is read back by _enrich_workbooks_with_tsc
_WORKBOOKS_QUERY = """
createdAt
description
embeddedDatasources { id }
id
luid
name
owner { luid }
site { name }
tags { name }
updatedAt
uri
"""

_FIELDS_QUERY = """
__typename
datasource { id }
description
downstreamDashboards { id }
downstreamWorkbooks { id }
folderName
id
name
dataType
role
"""


# Same selection as _FIELDS_QUERY plus the `columns` sub-selection.
# This is an f-string: braces are doubled to stay literal.
_FIELDS_QUERY_WITH_COLUMNS = f"""
{_FIELDS_QUERY}
columns {{
name
table {{ name }}
}}
"""

_SHEETS_QUERY = """
containedInDashboards { id }
createdAt
id
index
name
updatedAt
upstreamFields { name }
workbook { id }
"""


# asset => (resource name, field selection); the resource name is also the
# prefix of the "<resource>Connection" entry queried by QUERY_TEMPLATE
GQL_QUERIES: Dict[TableauRevampAsset, Tuple[str, str]] = {
    TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
    TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
    TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),
    TableauRevampAsset.SHEET: ("sheets", _SHEETS_QUERY),
    TableauRevampAsset.TABLE: ("tables", _TABLES_QUERY),
    TableauRevampAsset.WORKBOOK: ("workbooks", _WORKBOOKS_QUERY),
}

# (resource name, field selection) pairs whose results are concatenated to
# build the FIELD asset (see _fetch_fields in client.py)
FIELDS_QUERIES = (
    ("binFields", _FIELDS_QUERY),
    ("calculatedFields", _FIELDS_QUERY),
    ("columnFields", _FIELDS_QUERY_WITH_COLUMNS),
    ("groupFields", _FIELDS_QUERY),
)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Dict, Set
|
|
2
|
+
|
|
3
|
+
from ..assets import TableauRevampAsset
|
|
4
|
+
|
|
5
|
+
# list of fields to pick in TSC response
# (each name is read as an attribute of the TSC item, see _pick_fields in
# client.py)
TSC_FIELDS: Dict[TableauRevampAsset, Set[str]] = {
    TableauRevampAsset.DATASOURCE: {
        "id",
        "project_id",
        "webpage_url",
    },
    TableauRevampAsset.PROJECT: {
        "description",
        "id",
        "name",
        "parent_id",
    },
    TableauRevampAsset.USAGE: {
        "name",
        "total_views",
        "workbook_id",
    },
    TableauRevampAsset.USER: {
        "email",
        "fullname",
        "id",
        "name",
        "site_role",
    },
    TableauRevampAsset.WORKBOOK: {
        "id",
        "project_id",
    },
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Iterable, Tuple
|
|
3
|
+
|
|
4
|
+
from ...utils import (
|
|
5
|
+
OUTPUT_DIR,
|
|
6
|
+
current_timestamp,
|
|
7
|
+
deep_serialize,
|
|
8
|
+
from_env,
|
|
9
|
+
get_output_filename,
|
|
10
|
+
write_errors_logs,
|
|
11
|
+
write_json,
|
|
12
|
+
write_summary,
|
|
13
|
+
)
|
|
14
|
+
from .assets import TableauRevampAsset
|
|
15
|
+
from .client import TableauRevampClient
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def iterate_all_data(
    client: TableauRevampClient,
) -> Iterable[Tuple[TableauRevampAsset, list]]:
    """
    Yield (asset, serialized rows) pairs extracted from Tableau.

    Only the USER asset is extracted here.
    """
    asset = TableauRevampAsset.USER

    logger.info("Extracting USER from Tableau API")
    users = client.fetch(asset)
    yield asset, deep_serialize(users)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_all(client: TableauRevampClient, **kwargs: str) -> None:
    """
    Extract data from Tableau and store it locally:
    - one JSON file per asset, under the output directory
    - a summary file
    - an error log file, only when the client collected errors

    The output directory is read from the `output_directory` keyword
    argument, or from the environment when it is missing or empty.
    """
    output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
    timestamp = current_timestamp()

    for asset, rows in iterate_all_data(client):
        target = get_output_filename(asset.value, output_directory, timestamp)
        write_json(target, rows)

    write_summary(
        output_directory,
        timestamp,
        base_url=client.base_url(),
        client_name=client.name(),
    )

    if not client.errors:
        return
    write_errors_logs(output_directory, timestamp, client.errors)
|
|
@@ -3,7 +3,7 @@ from datetime import date
|
|
|
3
3
|
from functools import partial
|
|
4
4
|
from typing import Any, Dict, List, Optional, Set
|
|
5
5
|
|
|
6
|
-
from ...utils import at_midnight, date_after
|
|
6
|
+
from ...utils import at_midnight, date_after, mapping_from_rows
|
|
7
7
|
from ...utils.client.api import APIClient
|
|
8
8
|
from ...utils.pager import PagerOnToken
|
|
9
9
|
from ..abstract.time_filter import TimeFilter
|
|
@@ -88,15 +88,22 @@ class DatabricksClient(APIClient):
|
|
|
88
88
|
)
|
|
89
89
|
|
|
90
90
|
@staticmethod
|
|
91
|
-
def _match_table_with_user(table: dict,
|
|
91
|
+
def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
|
|
92
92
|
table_owner_email = table.get("owner_email")
|
|
93
93
|
if not table_owner_email:
|
|
94
94
|
return table
|
|
95
|
-
owner_external_id =
|
|
95
|
+
owner_external_id = user_mapping.get(table_owner_email)
|
|
96
96
|
if not owner_external_id:
|
|
97
97
|
return table
|
|
98
98
|
return {**table, "owner_external_id": owner_external_id}
|
|
99
99
|
|
|
100
|
+
@staticmethod
def _get_user_mapping(users: List[dict]) -> dict:
    """
    Build a single lookup giving the user `id` from either the `email`
    or the `user_name` of the user rows.

    Entries built from `user_name` override those built from `email`
    when the same key appears in both.
    """
    mapping = dict(mapping_from_rows(users, "email", "id"))
    mapping.update(mapping_from_rows(users, "user_name", "id"))
    return mapping
|
|
106
|
+
|
|
100
107
|
def tables_and_columns(
|
|
101
108
|
self, schemas: List[dict], users: List[dict]
|
|
102
109
|
) -> TablesColumns:
|
|
@@ -105,11 +112,11 @@ class DatabricksClient(APIClient):
|
|
|
105
112
|
"""
|
|
106
113
|
tables: List[dict] = []
|
|
107
114
|
columns: List[dict] = []
|
|
108
|
-
|
|
115
|
+
user_mapping = self._get_user_mapping(users)
|
|
109
116
|
for schema in schemas:
|
|
110
117
|
t_to_add, c_to_add = self._tables_columns_of_schema(schema)
|
|
111
118
|
t_with_owner = [
|
|
112
|
-
self._match_table_with_user(table,
|
|
119
|
+
self._match_table_with_user(table, user_mapping)
|
|
113
120
|
for table in t_to_add
|
|
114
121
|
]
|
|
115
122
|
tables.extend(t_with_owner)
|