castor-extractor 0.16.5__py3-none-any.whl → 0.16.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

Files changed (33)
  1. CHANGELOG.md +17 -0
  2. castor_extractor/utils/__init__.py +2 -1
  3. castor_extractor/utils/collection.py +32 -0
  4. castor_extractor/utils/collection_test.py +60 -0
  5. castor_extractor/utils/time.py +9 -1
  6. castor_extractor/utils/time_test.py +8 -1
  7. castor_extractor/visualization/domo/client/client.py +28 -43
  8. castor_extractor/visualization/domo/client/client_test.py +1 -23
  9. castor_extractor/visualization/domo/client/endpoints.py +13 -6
  10. castor_extractor/visualization/domo/client/pagination.py +4 -0
  11. castor_extractor/visualization/looker/api/client.py +21 -17
  12. castor_extractor/visualization/looker/api/sdk.py +10 -58
  13. castor_extractor/visualization/looker/api/utils.py +1 -1
  14. castor_extractor/visualization/looker/extract.py +2 -1
  15. castor_extractor/visualization/looker/multithreading.py +1 -1
  16. castor_extractor/visualization/tableau_revamp/__init__.py +3 -0
  17. castor_extractor/visualization/tableau_revamp/assets.py +18 -0
  18. castor_extractor/visualization/tableau_revamp/client/__init__.py +2 -0
  19. castor_extractor/visualization/tableau_revamp/client/client.py +297 -0
  20. castor_extractor/visualization/tableau_revamp/client/credentials.py +104 -0
  21. castor_extractor/visualization/tableau_revamp/client/errors.py +3 -0
  22. castor_extractor/visualization/tableau_revamp/client/gql_queries.py +134 -0
  23. castor_extractor/visualization/tableau_revamp/client/tsc_fields.py +34 -0
  24. castor_extractor/visualization/tableau_revamp/constants.py +5 -0
  25. castor_extractor/visualization/tableau_revamp/extract.py +53 -0
  26. castor_extractor/warehouse/databricks/client.py +12 -5
  27. castor_extractor/warehouse/databricks/client_test.py +22 -3
  28. castor_extractor/warehouse/databricks/format.py +5 -1
  29. {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/METADATA +6 -3
  30. {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/RECORD +33 -22
  31. {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/LICENCE +0 -0
  32. {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/WHEEL +0 -0
  33. {castor_extractor-0.16.5.dist-info → castor_extractor-0.16.9.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,297 @@
1
+ import logging
2
+ from typing import Dict, Iterator, List, Optional
3
+
4
+ import tableauserverclient as TSC # type: ignore
5
+ from tableauserverclient import Pager
6
+
7
+ from ....utils import SerializedAsset
8
+ from ..assets import TableauRevampAsset
9
+ from ..constants import (
10
+ DEFAULT_PAGE_SIZE,
11
+ DEFAULT_TIMEOUT_SECONDS,
12
+ TABLEAU_SERVER_VERSION,
13
+ )
14
+ from .credentials import TableauRevampCredentials
15
+ from .errors import TableauApiError
16
+ from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
17
+ from .tsc_fields import TSC_FIELDS
18
+
19
# module-level logger, named after this module
logger = logging.getLogger(__name__)

# these assets must be extracted via TableauServerClient
_TSC_ASSETS = (
    # only users who published content can be extracted from MetadataAPI
    TableauRevampAsset.USER,
    # projects are not available in Metadata API
    TableauRevampAsset.PROJECT,
    # view counts are not available in Metadata API
    TableauRevampAsset.USAGE,
)

# speed up extraction: fields and columns are smaller but volumes are bigger
# (these page sizes are looked up by asset when querying the Metadata API)
_CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
    TableauRevampAsset.FIELD: 1000,
    TableauRevampAsset.COLUMN: 1000,
}
36
+
37
+
38
def _pick_fields(
    data: Pager,
    asset: TableauRevampAsset,
) -> SerializedAsset:
    """
    Turn every item yielded by the pager into a plain dict, keeping only
    the attributes listed in TSC_FIELDS for the given asset
    """
    wanted = TSC_FIELDS[asset]

    picked = []
    for item in data:
        # items are objects, not dicts: values are read as attributes
        picked.append({name: getattr(item, name) for name in wanted})
    return picked
48
+
49
+
50
def _enrich_datasources_with_tsc(
    datasources: SerializedAsset,
    tsc_datasources: SerializedAsset,
) -> SerializedAsset:
    """
    Enrich datasources with fields coming from TableauServerClient:
    - project_luid
    - webpage_url

    Only rows typed "PublishedDatasource" are enriched; the given
    datasources are updated in place and returned.
    """

    mapping = {row["id"]: row for row in tsc_datasources}

    for datasource in datasources:
        if datasource["__typename"] != "PublishedDatasource":
            # embedded datasources are bound to workbooks => no project
            # embedded datasources cannot be accessed via URL => no webpage_url
            continue

        luid = datasource["luid"]
        tsc_datasource = mapping.get(luid)
        if not tsc_datasource:
            # same tolerance as for workbooks: a datasource present in
            # Metadata API but absent from TSC must not abort the extraction
            logger.warning(f"Datasource {luid} was not found in TSC")
            datasource["projectLuid"] = None
            datasource["webpageUrl"] = None
            continue

        datasource["projectLuid"] = tsc_datasource["project_id"]
        datasource["webpageUrl"] = tsc_datasource["webpage_url"]

    return datasources
73
+
74
+
75
def _enrich_workbooks_with_tsc(
    workbooks: SerializedAsset,
    tsc_workbooks: SerializedAsset,
) -> SerializedAsset:
    """
    Add the project luid known by TableauServerClient to each workbook
    (stored under "projectLuid"); workbooks are updated in place and returned
    """

    tsc_by_luid = {row["id"]: row for row in tsc_workbooks}

    for workbook in workbooks:
        luid = workbook["luid"]
        match = tsc_by_luid.get(luid)
        if match:
            workbook["projectLuid"] = match["project_id"]
        else:
            # it happens that a workbook is in Metadata API but not in TSC
            # in this case, we push the workbook with default project
            logger.warning(f"Workbook {luid} was not found in TSC")
            workbook["projectLuid"] = None

    return workbooks
99
+
100
+
101
def gql_query_scroll(
    server,
    query: str,
    resource: str,
) -> Iterator[SerializedAsset]:
    """
    Yield the nodes of each page returned by the given GQL query.

    The AFTER_TOKEN_SIGNAL placeholder of the query is replaced, page after
    page, by the cursor of the previous page (null for the first page).
    Raises TableauApiError when the answer contains errors.
    """
    connection_key = f"{resource}Connection"

    after: Optional[str] = None
    has_next_page = True
    while has_next_page:
        # a defined cursor must be quoted, otherwise the null token is used
        token = f'"{after}"' if after is not None else "null"
        answer = server.metadata.query(
            query.replace("AFTER_TOKEN_SIGNAL", token),
        )
        if "errors" in answer:
            raise TableauApiError(answer["errors"])

        connection = answer["data"][connection_key]
        yield connection["nodes"]

        page_info = connection["pageInfo"]
        has_next_page = page_info["hasNextPage"]
        if has_next_page:
            after = page_info["endCursor"]
127
+
128
+
129
class TableauRevampClient:
    """
    Connect to Tableau's API and extract assets.

    Relies on TableauServerClient overlay:
    https://tableau.github.io/server-client-python/docs/
    - for connection
    - to extract the assets listed in _TSC_ASSETS, and to enrich
      datasources and workbooks with their project

    Calls the MetadataAPI, using graphQL
    https://help.tableau.com/current/api/metadata_api/en-us/reference/index.html
    """

    def __init__(
        self,
        credentials: TableauRevampCredentials,
        timeout_sec: int = DEFAULT_TIMEOUT_SECONDS,
    ):
        self._credentials = credentials

        server = TSC.Server(credentials.server_url)
        server.add_http_options({"verify": True, "timeout": timeout_sec})
        server.version = TABLEAU_SERVER_VERSION
        self._server = server

        # errors reported by `extract_all` once the extraction is over
        self.errors: List[str] = []

    @staticmethod
    def name() -> str:
        return "Tableau/API"

    def _user_password_login(self) -> None:
        """Sign in with the user and password of the credentials"""
        credentials = self._credentials
        auth = TSC.TableauAuth(
            credentials.user,
            credentials.password,
            site_id=credentials.site_id,
        )
        self._server.auth.sign_in(auth)

    def _pat_login(self) -> None:
        """Sign in with the personal access token of the credentials"""
        credentials = self._credentials
        auth = TSC.PersonalAccessTokenAuth(
            credentials.token_name,
            credentials.token,
            site_id=credentials.site_id,
        )
        self._server.auth.sign_in(auth)

    def login(self) -> None:
        """
        Sign in with the first authentication method the credentials allow:
        - user/password
        - token_name/token (Personal Access Token)
        https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_auth.htm

        Raises a ValueError when neither pair is fully provided
        """
        credentials = self._credentials

        if credentials.user and credentials.password:
            logger.info("Logging in using user and password authentication")
            self._user_password_login()
            return

        if credentials.token_name and credentials.token:
            logger.info("Logging in using token authentication")
            self._pat_login()
            return

        raise ValueError(
            "Invalid credentials: either user/password or PAT must be provided",
        )

    def base_url(self) -> str:
        return self._credentials.server_url

    def _fetch_from_tsc(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """
        Fetch the asset through TableauServerClient and keep only the
        fields configured for it. Raises an AssertionError for assets
        having no TSC endpoint declared here.
        """
        server = self._server

        # pagers are built lazily, only for the matching asset
        pagers = (
            (
                TableauRevampAsset.DATASOURCE,
                lambda: TSC.Pager(server.datasources),
            ),
            (
                TableauRevampAsset.PROJECT,
                lambda: TSC.Pager(server.projects),
            ),
            (
                TableauRevampAsset.USAGE,
                lambda: TSC.Pager(server.views, usage=True),
            ),
            (
                TableauRevampAsset.USER,
                lambda: TSC.Pager(server.users),
            ),
            (
                TableauRevampAsset.WORKBOOK,
                lambda: TSC.Pager(server.workbooks),
            ),
        )
        for candidate, build_pager in pagers:
            if asset == candidate:
                return _pick_fields(build_pager(), asset)

        raise AssertionError(f"Fetching from TSC not supported for {asset}")

    def _run_graphql_query(
        self,
        resource: str,
        fields: str,
        page_size: int = DEFAULT_PAGE_SIZE,
    ) -> SerializedAsset:
        """Run the paginated query on the resource and flatten its pages"""
        query = QUERY_TEMPLATE.format(
            page_size=page_size,
            fields=fields,
            resource=resource,
        )
        rows: SerializedAsset = []
        for page in gql_query_scroll(self._server, query, resource):
            rows.extend(page)
        return rows

    def _fetch_fields(self) -> SerializedAsset:
        """Fields are spread over several resources: query them all"""
        page_size = _CUSTOM_PAGE_SIZE[TableauRevampAsset.FIELD]
        return [
            row
            for resource, fields in FIELDS_QUERIES
            for row in self._run_graphql_query(resource, fields, page_size)
        ]

    def _fetch_from_metadata_api(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """Fetch the asset through the Metadata API (graphQL)"""
        if asset == TableauRevampAsset.FIELD:
            return self._fetch_fields()

        custom_size = _CUSTOM_PAGE_SIZE.get(asset)
        page_size = custom_size if custom_size else DEFAULT_PAGE_SIZE
        resource, fields = GQL_QUERIES[asset]
        return self._run_graphql_query(resource, fields, page_size)

    def _fetch_datasources(self) -> SerializedAsset:
        """Datasources from Metadata API, enriched with TSC fields"""
        asset = TableauRevampAsset.DATASOURCE
        from_metadata_api = self._fetch_from_metadata_api(asset)
        from_tsc = self._fetch_from_tsc(asset)
        return _enrich_datasources_with_tsc(from_metadata_api, from_tsc)

    def _fetch_workbooks(self) -> SerializedAsset:
        """Workbooks from Metadata API, enriched with TSC fields"""
        asset = TableauRevampAsset.WORKBOOK
        from_metadata_api = self._fetch_from_metadata_api(asset)
        from_tsc = self._fetch_from_tsc(asset)
        return _enrich_workbooks_with_tsc(from_metadata_api, from_tsc)

    def fetch(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """
        Extract the given Tableau Asset
        """
        if asset == TableauRevampAsset.DATASOURCE:
            # both APIs are required to extract datasources
            fetched = self._fetch_datasources()
        elif asset == TableauRevampAsset.WORKBOOK:
            # both APIs are required to extract workbooks
            fetched = self._fetch_workbooks()
        elif asset in _TSC_ASSETS:
            # some assets can only be extracted via TSC
            fetched = self._fetch_from_tsc(asset)
        else:
            # extract most assets via Metadata API
            fetched = self._fetch_from_metadata_api(asset)
        return fetched
@@ -0,0 +1,104 @@
1
+ from enum import Enum
2
+ from typing import Dict, Literal, Optional, overload
3
+
4
+ from ....utils import from_env
5
+
6
# NOTE(review): not referenced anywhere in the visible code — confirm usage
_AUTH_ERROR_MSG = "Need either user and password or token_name and token"

# To specify the default site on Tableau Server, you can use an empty string
# https://tableau.github.io/server-client-python/docs/api-ref#authentication
_DEFAULT_SERVER_SITE_ID = ""
11
+
12
+
13
class CredentialsKey(Enum):
    """
    Value enum object for the credentials

    Each value is the key under which the credential is looked up in kwargs
    """

    TABLEAU_USER = "user"
    TABLEAU_PASSWORD = "password"  # noqa: S105
    TABLEAU_TOKEN_NAME = "token_name"  # noqa: S105
    TABLEAU_TOKEN = "token"  # noqa: S105
    TABLEAU_SITE_ID = "site_id"
    TABLEAU_SERVER_URL = "server_url"
22
+
23
+
24
# environment variable read for each credential when it is absent from kwargs
CREDENTIALS_ENV: Dict[CredentialsKey, str] = {
    CredentialsKey.TABLEAU_USER: "CASTOR_TABLEAU_USER",
    CredentialsKey.TABLEAU_PASSWORD: "CASTOR_TABLEAU_PASSWORD",
    CredentialsKey.TABLEAU_TOKEN_NAME: "CASTOR_TABLEAU_TOKEN_NAME",
    CredentialsKey.TABLEAU_TOKEN: "CASTOR_TABLEAU_TOKEN",
    CredentialsKey.TABLEAU_SITE_ID: "CASTOR_TABLEAU_SITE_ID",
    CredentialsKey.TABLEAU_SERVER_URL: "CASTOR_TABLEAU_SERVER_URL",
}
32
+
33
+
34
@overload
def get_value(key: CredentialsKey, kwargs: dict) -> Optional[str]: ...


@overload
def get_value(
    key: CredentialsKey, kwargs: dict, optional: Literal[True]
) -> Optional[str]: ...


@overload
def get_value(
    key: CredentialsKey, kwargs: dict, optional: Literal[False]
) -> str: ...


def get_value(
    key: CredentialsKey,
    kwargs: dict,
    optional: bool = True,
) -> Optional[str]:
    """
    Look the credential up in kwargs first, then fall back on the
    environment variable declared for it in CREDENTIALS_ENV.

    The `optional` flag is forwarded as is to `from_env`.
    NOTE(review): presumably `from_env` raises on a missing variable when
    `optional` is False — confirm against utils
    """
    name = key.value
    if name not in kwargs:
        return from_env(CREDENTIALS_ENV[key], optional)
    return kwargs[name]
67
+
68
+
69
class TableauRevampCredentials:
    """
    Tableau's credentials to connect to REST API

    Holds the server url and site, plus the user/password pair and/or the
    token_name/token pair; all arguments are keyword-only.
    """

    def __init__(
        self,
        *,
        server_url: str,
        site_id: Optional[str],
        user: Optional[str],
        password: Optional[str],
        token_name: Optional[str],
        token: Optional[str],
    ):
        self.user = user
        # a missing or empty site falls back on the default site
        self.site_id = site_id or _DEFAULT_SERVER_SITE_ID
        self.server_url = server_url
        self.password = password
        self.token_name = token_name
        self.token = token

    @classmethod
    def from_env(cls, kwargs: dict) -> "TableauRevampCredentials":
        """
        Build the credentials from kwargs, falling back on the environment
        for missing keys (see `get_value`). Only server_url is mandatory.
        """
        # build through `cls` so that subclasses get an instance of their
        # own type instead of the base class
        return cls(
            server_url=get_value(
                CredentialsKey.TABLEAU_SERVER_URL,
                kwargs,
                optional=False,
            ),
            site_id=get_value(CredentialsKey.TABLEAU_SITE_ID, kwargs),
            user=get_value(CredentialsKey.TABLEAU_USER, kwargs),
            password=get_value(CredentialsKey.TABLEAU_PASSWORD, kwargs),
            token_name=get_value(CredentialsKey.TABLEAU_TOKEN_NAME, kwargs),
            token=get_value(CredentialsKey.TABLEAU_TOKEN, kwargs),
        )
@@ -0,0 +1,3 @@
1
class TableauApiError(ValueError):
    """Error carrying what the Tableau API returned, prefixed for context"""

    def __init__(self, error: str):
        message = f"Tableau API returned the following error: {error}"
        super().__init__(message)
@@ -0,0 +1,134 @@
1
+ from typing import Dict, Tuple
2
+
3
+ from ..assets import TableauRevampAsset
4
+
5
# GraphQL query skeleton for a paginated "<resource>Connection":
# - {resource}, {page_size} and {fields} are filled with str.format
#   (doubled braces are literal braces)
# - AFTER_TOKEN_SIGNAL is a placeholder, replaced by the pagination cursor
#   right before each call
QUERY_TEMPLATE = """
{{
{resource}Connection(first: {page_size}, after: AFTER_TOKEN_SIGNAL) {{
nodes {{ {fields}
}}
pageInfo {{
hasNextPage
endCursor
}}
totalCount
}}
}}
"""
18
+
19
# Field selections, one per resource: each string is injected as {fields}
# in QUERY_TEMPLATE

# selection for the "columns" resource
_COLUMNS_QUERY = """
downstreamDashboards { id }
downstreamFields { id }
downstreamWorkbooks { id }
id
name
table { id }
"""

# selection for the "dashboards" resource
_DASHBOARDS_QUERY = """
createdAt
id
name
path
tags { name }
updatedAt
workbook { id }
"""

# selection for the "datasources" resource
# the inline fragment restricts some fields to PublishedDatasource
_DATASOURCES_QUERY = """
__typename
downstreamDashboards { id }
downstreamWorkbooks { id }
id
name
... on PublishedDatasource {
description
luid
owner { luid }
site { name }
tags { name }
uri
}
"""

# selection for the "tables" resource
# inline fragments select type-specific fields
_TABLES_QUERY = """
__typename
downstreamDashboards { id }
downstreamDatasources { id }
downstreamWorkbooks { id }
id
name
... on DatabaseTable {
connectionType
fullName
schema
}
... on CustomSQLTable {
query
}
"""


# selection for the "workbooks" resource
_WORKBOOKS_QUERY = """
createdAt
description
embeddedDatasources { id }
id
luid
name
owner { luid }
site { name }
tags { name }
updatedAt
uri
"""

# selection shared by all the field resources listed in FIELDS_QUERIES
_FIELDS_QUERY = """
__typename
datasource { id }
description
downstreamDashboards { id }
downstreamWorkbooks { id }
folderName
id
name
dataType
role
"""


# same selection, with the columns in addition
# (f-string: doubled braces are literal braces)
_FIELDS_QUERY_WITH_COLUMNS = f"""
{_FIELDS_QUERY}
columns {{
name
table {{ name }}
}}
"""

# selection for the "sheets" resource
_SHEETS_QUERY = """
containedInDashboards { id }
createdAt
id
index
name
updatedAt
upstreamFields { name }
workbook { id }
"""
118
+
119
+
120
# asset => (resource name, field selection) used to build its query
GQL_QUERIES: Dict[TableauRevampAsset, Tuple[str, str]] = {
    TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
    TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
    TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),
    TableauRevampAsset.SHEET: ("sheets", _SHEETS_QUERY),
    TableauRevampAsset.TABLE: ("tables", _TABLES_QUERY),
    TableauRevampAsset.WORKBOOK: ("workbooks", _WORKBOOKS_QUERY),
}

# (resource name, field selection) pairs: fields are queried resource by
# resource, only columnFields also select their columns
FIELDS_QUERIES = (
    ("binFields", _FIELDS_QUERY),
    ("calculatedFields", _FIELDS_QUERY),
    ("columnFields", _FIELDS_QUERY_WITH_COLUMNS),
    ("groupFields", _FIELDS_QUERY),
)
@@ -0,0 +1,34 @@
1
+ from typing import Dict, Set
2
+
3
+ from ..assets import TableauRevampAsset
4
+
5
+ # list of fields to pick in TSC response
6
# attribute names to keep, per asset, from the items returned by TSC
TSC_FIELDS: Dict[TableauRevampAsset, Set[str]] = {
    TableauRevampAsset.DATASOURCE: {
        "id",
        "project_id",
        "webpage_url",
    },
    TableauRevampAsset.PROJECT: {
        "description",
        "id",
        "name",
        "parent_id",
    },
    TableauRevampAsset.USAGE: {
        "name",
        "total_views",
        "workbook_id",
    },
    TableauRevampAsset.USER: {
        "email",
        "fullname",
        "id",
        "name",
        "site_role",
    },
    TableauRevampAsset.WORKBOOK: {
        "id",
        "project_id",
    },
}
@@ -0,0 +1,5 @@
1
# value assigned to the `version` attribute of the TSC server by the client
# NOTE(review): presumably the REST API version to target — confirm
TABLEAU_SERVER_VERSION = "3.5"

# page size of Metadata API queries, unless the asset has a custom one
DEFAULT_PAGE_SIZE = 100

# default value of the client's `timeout_sec`, in seconds
DEFAULT_TIMEOUT_SECONDS = 100
@@ -0,0 +1,53 @@
1
+ import logging
2
+ from typing import Iterable, Tuple
3
+
4
+ from ...utils import (
5
+ OUTPUT_DIR,
6
+ current_timestamp,
7
+ deep_serialize,
8
+ from_env,
9
+ get_output_filename,
10
+ write_errors_logs,
11
+ write_json,
12
+ write_summary,
13
+ )
14
+ from .assets import TableauRevampAsset
15
+ from .client import TableauRevampClient
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def iterate_all_data(
    client: TableauRevampClient,
) -> Iterable[Tuple[TableauRevampAsset, list]]:
    """
    Yield (asset, serialized rows) pairs extracted from Tableau.
    Only USER is extracted for now.
    """
    asset = TableauRevampAsset.USER

    logger.info("Extracting USER from Tableau API")
    users = client.fetch(asset)
    yield asset, deep_serialize(users)
29
+
30
+
31
def extract_all(client: TableauRevampClient, **kwargs: str) -> None:
    """
    Extract data from Tableau and store it locally, one file per asset,
    under the output directory (taken from kwargs, else from the
    environment). A summary is written alongside, as well as the errors
    collected by the client, if any.
    """
    output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
    # the same timestamp is shared by all the files of this extraction
    timestamp = current_timestamp()

    for asset, rows in iterate_all_data(client):
        write_json(
            get_output_filename(asset.value, output_directory, timestamp),
            rows,
        )

    write_summary(
        output_directory,
        timestamp,
        base_url=client.base_url(),
        client_name=client.name(),
    )

    errors = client.errors
    if errors:
        write_errors_logs(output_directory, timestamp, errors)
@@ -3,7 +3,7 @@ from datetime import date
3
3
  from functools import partial
4
4
  from typing import Any, Dict, List, Optional, Set
5
5
 
6
- from ...utils import at_midnight, date_after
6
+ from ...utils import at_midnight, date_after, mapping_from_rows
7
7
  from ...utils.client.api import APIClient
8
8
  from ...utils.pager import PagerOnToken
9
9
  from ..abstract.time_filter import TimeFilter
@@ -88,15 +88,22 @@ class DatabricksClient(APIClient):
88
88
  )
89
89
 
90
90
  @staticmethod
91
- def _match_table_with_user(table: dict, user_id_by_email: dict) -> dict:
91
+ def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
92
92
  table_owner_email = table.get("owner_email")
93
93
  if not table_owner_email:
94
94
  return table
95
- owner_external_id = user_id_by_email.get(table_owner_email)
95
+ owner_external_id = user_mapping.get(table_owner_email)
96
96
  if not owner_external_id:
97
97
  return table
98
98
  return {**table, "owner_external_id": owner_external_id}
99
99
 
100
@staticmethod
def _get_user_mapping(users: List[dict]) -> dict:
    """
    Merge two mappings built from the users, keyed by "email" and by
    "user_name", both pointing at "id"; on conflict the "user_name" entry
    wins.
    NOTE(review): presumably `mapping_from_rows` maps the first key's value
    to the second key's value for each row — confirm against utils
    """
    mapping = dict(mapping_from_rows(users, "email", "id"))
    mapping.update(mapping_from_rows(users, "user_name", "id"))
    return mapping
106
+
100
107
  def tables_and_columns(
101
108
  self, schemas: List[dict], users: List[dict]
102
109
  ) -> TablesColumns:
@@ -105,11 +112,11 @@ class DatabricksClient(APIClient):
105
112
  """
106
113
  tables: List[dict] = []
107
114
  columns: List[dict] = []
108
- user_id_by_email = {user.get("email"): user.get("id") for user in users}
115
+ user_mapping = self._get_user_mapping(users)
109
116
  for schema in schemas:
110
117
  t_to_add, c_to_add = self._tables_columns_of_schema(schema)
111
118
  t_with_owner = [
112
- self._match_table_with_user(table, user_id_by_email)
119
+ self._match_table_with_user(table, user_mapping)
113
120
  for table in t_to_add
114
121
  ]
115
122
  tables.extend(t_with_owner)