castor-extractor 0.21.7__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +8 -0
- castor_extractor/commands/__init__.py +0 -3
- castor_extractor/commands/file_check.py +1 -2
- castor_extractor/file_checker/column.py +5 -5
- castor_extractor/file_checker/file.py +7 -7
- castor_extractor/file_checker/file_test.py +2 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
- castor_extractor/knowledge/confluence/client/client.py +2 -1
- castor_extractor/knowledge/confluence/extract.py +3 -2
- castor_extractor/knowledge/notion/client/client.py +3 -2
- castor_extractor/knowledge/notion/extract.py +3 -2
- castor_extractor/quality/soda/client/client.py +2 -1
- castor_extractor/quality/soda/client/pagination.py +1 -3
- castor_extractor/types.py +3 -3
- castor_extractor/uploader/env.py +2 -2
- castor_extractor/uploader/upload.py +4 -3
- castor_extractor/uploader/utils.py +1 -1
- castor_extractor/utils/client/abstract.py +2 -1
- castor_extractor/utils/client/api/auth.py +2 -2
- castor_extractor/utils/client/api/auth_test.py +2 -2
- castor_extractor/utils/client/api/client.py +8 -3
- castor_extractor/utils/client/api/pagination.py +3 -2
- castor_extractor/utils/client/api/safe_request.py +5 -5
- castor_extractor/utils/collection.py +7 -11
- castor_extractor/utils/dbt/client.py +3 -3
- castor_extractor/utils/dbt/client_test.py +2 -2
- castor_extractor/utils/deprecate.py +1 -2
- castor_extractor/utils/files.py +5 -5
- castor_extractor/utils/formatter.py +5 -4
- castor_extractor/utils/json_stream_write.py +2 -1
- castor_extractor/utils/object.py +2 -1
- castor_extractor/utils/pager/pager.py +2 -4
- castor_extractor/utils/pager/pager_on_id.py +2 -1
- castor_extractor/utils/pager/pager_on_id_test.py +5 -5
- castor_extractor/utils/pager/pager_test.py +3 -3
- castor_extractor/utils/retry.py +4 -3
- castor_extractor/utils/retry_test.py +2 -3
- castor_extractor/utils/safe.py +3 -3
- castor_extractor/utils/salesforce/client.py +2 -1
- castor_extractor/utils/salesforce/credentials.py +1 -3
- castor_extractor/utils/store.py +2 -1
- castor_extractor/utils/string.py +2 -2
- castor_extractor/utils/string_test.py +1 -3
- castor_extractor/utils/type.py +3 -2
- castor_extractor/utils/validation.py +4 -4
- castor_extractor/utils/write.py +2 -2
- castor_extractor/visualization/domo/client/client.py +8 -7
- castor_extractor/visualization/domo/client/credentials.py +2 -2
- castor_extractor/visualization/domo/client/endpoints.py +2 -2
- castor_extractor/visualization/domo/extract.py +3 -2
- castor_extractor/visualization/looker/api/client.py +17 -16
- castor_extractor/visualization/looker/api/utils.py +2 -2
- castor_extractor/visualization/looker/assets.py +1 -3
- castor_extractor/visualization/looker/extract.py +4 -3
- castor_extractor/visualization/looker/fields.py +3 -3
- castor_extractor/visualization/looker/multithreading.py +3 -3
- castor_extractor/visualization/metabase/assets.py +1 -3
- castor_extractor/visualization/metabase/client/api/client.py +8 -7
- castor_extractor/visualization/metabase/extract.py +3 -2
- castor_extractor/visualization/metabase/types.py +1 -3
- castor_extractor/visualization/mode/client/client.py +6 -6
- castor_extractor/visualization/mode/extract.py +2 -2
- castor_extractor/visualization/powerbi/assets.py +1 -3
- castor_extractor/visualization/powerbi/client/client.py +12 -11
- castor_extractor/visualization/powerbi/client/credentials.py +3 -3
- castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
- castor_extractor/visualization/powerbi/extract.py +3 -2
- castor_extractor/visualization/qlik/assets.py +1 -3
- castor_extractor/visualization/qlik/client/constants.py +1 -3
- castor_extractor/visualization/qlik/client/engine/error.py +1 -3
- castor_extractor/visualization/qlik/client/master.py +3 -3
- castor_extractor/visualization/qlik/client/rest.py +12 -12
- castor_extractor/visualization/qlik/extract.py +4 -3
- castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
- castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
- castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
- castor_extractor/visualization/sigma/client/client.py +11 -8
- castor_extractor/visualization/sigma/client/credentials.py +1 -3
- castor_extractor/visualization/sigma/client/pagination.py +1 -1
- castor_extractor/visualization/sigma/extract.py +3 -2
- castor_extractor/visualization/tableau/assets.py +1 -2
- castor_extractor/visualization/tableau/client/client.py +1 -2
- castor_extractor/visualization/tableau/client/client_utils.py +3 -2
- castor_extractor/visualization/tableau/client/credentials.py +3 -3
- castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
- castor_extractor/visualization/tableau/extract.py +2 -2
- castor_extractor/visualization/tableau/gql_fields.py +3 -3
- castor_extractor/visualization/tableau/tsc_fields.py +1 -2
- castor_extractor/visualization/tableau/types.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client.py +6 -1
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +56 -9
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/errors.py +5 -0
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
- castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
- castor_extractor/visualization/tableau_revamp/extract.py +2 -2
- castor_extractor/visualization/thoughtspot/client/client.py +3 -2
- castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
- castor_extractor/visualization/thoughtspot/extract.py +3 -2
- castor_extractor/warehouse/abstract/asset.py +4 -5
- castor_extractor/warehouse/abstract/extract.py +4 -3
- castor_extractor/warehouse/abstract/query.py +4 -4
- castor_extractor/warehouse/bigquery/client.py +8 -8
- castor_extractor/warehouse/bigquery/extract.py +1 -1
- castor_extractor/warehouse/bigquery/query.py +2 -2
- castor_extractor/warehouse/bigquery/types.py +2 -4
- castor_extractor/warehouse/databricks/api_client.py +15 -14
- castor_extractor/warehouse/databricks/client.py +16 -16
- castor_extractor/warehouse/databricks/extract.py +4 -4
- castor_extractor/warehouse/databricks/format.py +12 -12
- castor_extractor/warehouse/databricks/lineage.py +11 -11
- castor_extractor/warehouse/databricks/pagination.py +2 -2
- castor_extractor/warehouse/databricks/types.py +4 -4
- castor_extractor/warehouse/databricks/utils.py +5 -4
- castor_extractor/warehouse/mysql/query.py +2 -2
- castor_extractor/warehouse/postgres/query.py +2 -2
- castor_extractor/warehouse/redshift/client.py +1 -1
- castor_extractor/warehouse/redshift/query.py +2 -2
- castor_extractor/warehouse/salesforce/client.py +8 -8
- castor_extractor/warehouse/salesforce/extract.py +3 -4
- castor_extractor/warehouse/salesforce/format.py +8 -7
- castor_extractor/warehouse/salesforce/format_test.py +2 -4
- castor_extractor/warehouse/snowflake/query.py +5 -5
- castor_extractor/warehouse/sqlserver/client.py +1 -1
- castor_extractor/warehouse/sqlserver/query.py +2 -2
- {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/METADATA +11 -6
- {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/RECORD +131 -131
- {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/WHEEL +0 -0
- {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/entry_points.txt +0 -0
castor_extractor/visualization/tableau/assets.py
@@ -1,5 +1,4 @@
 from enum import Enum
-from typing import Set

 from ...types import ExternalAsset, classproperty

@@ -24,7 +23,7 @@ class TableauAsset(ExternalAsset):
     WORKBOOK_TO_DATASOURCE = "workbooks_to_datasource"

     @classproperty
-    def optional(cls) ->
+    def optional(cls) -> set["TableauAsset"]:
         return {
             TableauAsset.DASHBOARD,
             TableauAsset.DASHBOARD_SHEET,
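The change above is the pattern repeated throughout this release: annotations move from the capitalized `typing` aliases (`Dict`, `List`, `Set`, `Tuple`) to PEP 585 builtin generics and `collections.abc`, which requires Python 3.9 or newer. A minimal before/after sketch of the idiom (the function and variable names are illustrative, not taken from the package):

# Before (0.21.x style): capitalized aliases imported from typing
from typing import Dict, List, Optional, Set

def pick_fields(assets: Set[str], fields: Dict[str, List[str]]) -> Optional[List[str]]:
    """Return the field list of the first asset that has one."""
    for name in sorted(assets):
        if name in fields:
            return fields[name]
    return None

# After (0.22.0 style): builtin generics (PEP 585) and collections.abc
from collections.abc import Iterable

def pick_fields_modern(assets: Iterable[str], fields: dict[str, list[str]]) -> Optional[list[str]]:
    """Same logic, annotated with builtin generics; needs Python 3.9+."""
    for name in sorted(assets):
        if name in fields:
            return fields[name]
    return None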
castor_extractor/visualization/tableau/client/client.py
@@ -1,5 +1,4 @@
 import logging
-from typing import List

 import tableauserverclient as TSC  # type: ignore

@@ -43,7 +42,7 @@ class ApiClient:
         self._page_size = PAGE_SIZE
         self._server.version = TABLEAU_SERVER_VERSION
         self._safe_mode = bool(kwargs.get("safe_mode"))
-        self.errors:
+        self.errors: list[str] = []

     @staticmethod
     def name() -> str:

castor_extractor/visualization/tableau/client/client_utils.py
@@ -1,4 +1,5 @@
-from
+from collections.abc import Iterator
+from typing import Optional

 from ....utils import SerializedAsset
 from ..assets import TableauAsset
@@ -69,6 +70,6 @@ def query_scroll(
             break


-def extract_asset(asset:
+def extract_asset(asset: dict, asset_type: TableauAsset) -> dict:
     """Agnostic function extracting dedicated attributes with define asset"""
     return {key: getattr(asset, key) for key in TSC_FIELDS[asset_type]}

castor_extractor/visualization/tableau/client/credentials.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import
+from typing import Optional

 from ....utils import from_env

@@ -20,7 +20,7 @@ class CredentialsKey(Enum):
     TABLEAU_SERVER_URL = "server_url"


-CREDENTIALS_ENV:
+CREDENTIALS_ENV: dict[CredentialsKey, str] = {
     CredentialsKey.TABLEAU_USER: "CASTOR_TABLEAU_USER",
     CredentialsKey.TABLEAU_PASSWORD: "CASTOR_TABLEAU_PASSWORD",
     CredentialsKey.TABLEAU_TOKEN_NAME: "CASTOR_TABLEAU_TOKEN_NAME",
@@ -89,7 +89,7 @@ class CredentialsApi:
             CredentialsKey.TABLEAU_TOKEN: token,
         }

-    def to_dict(self, hide: bool = False) ->
+    def to_dict(self, hide: bool = False) -> dict[str, str]:
         safe = (
             CredentialsKey.TABLEAU_USER,
             CredentialsKey.TABLEAU_SITE_ID,

castor_extractor/visualization/tableau/client/safe_mode.py
@@ -1,5 +1,4 @@
 import logging
-from typing import Dict, List

 import tableauserverclient as TSC  # type: ignore

@@ -48,7 +47,7 @@ def safe_mode_fetch_usage(client) -> SerializedAsset:
     Returns computed usages when page number is not found
     Log errors if ServerResponseError is return
     """
-    list_usages:
+    list_usages: list[dict] = []
     page_number: int = 0

     while True:

castor_extractor/visualization/tableau/extract.py
@@ -1,5 +1,5 @@
 import logging
-from
+from collections.abc import Iterable

 from ...utils import (
     OUTPUT_DIR,
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)

 def iterate_all_data(
     client: Client,
-) -> Iterable[
+) -> Iterable[tuple[TableauAsset, list]]:
     """Iterate over the extracted Data from Tableau"""

     logger.info("Extracting USER from Tableau API")

castor_extractor/visualization/tableau/gql_fields.py
@@ -1,6 +1,6 @@
 # Fields which will be use for Tableau GraphQL API
 from enum import Enum
-from typing import
+from typing import Union

 from .assets import TableauAsset, TableauGraphqlAsset

@@ -189,9 +189,9 @@ class GQLQueryFields(Enum):
     """


-QueryInfo =
+QueryInfo = list[dict[str, Union[GQLQueryFields, TableauGraphqlAsset]]]

-QUERY_FIELDS:
+QUERY_FIELDS: dict[TableauAsset, QueryInfo] = {
     TableauAsset.CUSTOM_SQL_TABLE: [
         {
             FIELDS: GQLQueryFields.CUSTOM_SQL_TABLE,

castor_extractor/visualization/tableau/tsc_fields.py
@@ -1,10 +1,9 @@
 # TSC for TableauServerClient: basic REST API to extracting core objects
-from typing import Dict, Set

 from .assets import TableauAsset

 # TSC fields extracted per assets
-TSC_FIELDS:
+TSC_FIELDS: dict[TableauAsset, set[str]] = {
     TableauAsset.PROJECT: {
         "id",
         "name",

castor_extractor/visualization/tableau/types.py
@@ -1,4 +1,4 @@
-from typing import
+from typing import Union

 from tableauserverclient import ServerResponseError  # type: ignore
 from typing_extensions import Literal
@@ -6,6 +6,6 @@ from typing_extensions import Literal
 from .errors import TableauErrorCode

 PageReturn = Union[
-
-
+    tuple[list[dict], Literal[None]],
+    tuple[Literal[None], Union[TableauErrorCode, ServerResponseError]],
 ]
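`PageReturn` keeps its result-or-error tuple shape and is only rewritten with builtin generics: a page is either `(rows, None)` or `(None, error)`. A hedged sketch of how such a value can be consumed (the alias below mirrors the shape but simplifies the error side to `Exception`; `unwrap` is a hypothetical helper, not part of the package):

from typing import Literal, Union

PageResult = Union[
    tuple[list[dict], Literal[None]],  # success: (rows, None)
    tuple[Literal[None], Exception],   # failure: (None, error)
]

def unwrap(page: PageResult) -> list[dict]:
    """Return the rows, or raise the error the page carried instead."""
    rows, error = page
    if error is not None:
        raise error
    return rows or []

print(unwrap(([{"id": 1}], None)))  # [{'id': 1}]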
castor_extractor/visualization/tableau_revamp/client/client.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 import tableauserverclient as TSC  # type: ignore

@@ -121,12 +122,16 @@ class TableauRevampClient:
         credentials: TableauRevampCredentials,
         timeout_sec: int = DEFAULT_TIMEOUT_SECONDS,
         with_pulse: bool = False,
+        override_page_size: Optional[int] = None,
     ):
         self._credentials = credentials
         self._server = _server(credentials.server_url, timeout_sec)
         self._with_pulse = with_pulse

-        self._client_metadata = TableauClientMetadataApi(
+        self._client_metadata = TableauClientMetadataApi(
+            server=self._server,
+            override_page_size=override_page_size,
+        )
         self._client_rest = TableauClientRestApi(server=self._server)
         self._client_tsc = TableauClientTSC(server=self._server)

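The new `override_page_size` argument is forwarded untouched to `TableauClientMetadataApi`; the next file shows it winning over the per-asset `_CUSTOM_PAGE_SIZE` entries and `DEFAULT_PAGE_SIZE`. A standalone sketch of that precedence, with placeholder names and values rather than the package's constants:

from typing import Optional

DEFAULT_SIZE = 100                          # stand-in for DEFAULT_PAGE_SIZE
CUSTOM_SIZE = {"column": 50, "table": 50}   # stand-in for _CUSTOM_PAGE_SIZE

def resolve_page_size(asset: str, override: Optional[int] = None) -> int:
    """An explicit override wins, then the per-asset size, then the default."""
    return override or CUSTOM_SIZE.get(asset) or DEFAULT_SIZE

assert resolve_page_size("column") == 50
assert resolve_page_size("workbook") == 100
assert resolve_page_size("column", override=20) == 20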
castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py
@@ -1,16 +1,17 @@
-from
+from collections.abc import Iterator
+from typing import Optional

 import tableauserverclient as TSC  # type: ignore

-from ....utils import SerializedAsset
+from ....utils import SerializedAsset, retry
 from ..assets import TableauRevampAsset
 from ..constants import DEFAULT_PAGE_SIZE
-from .errors import TableauApiError
+from .errors import TableauApiError, TableauApiTimeout
 from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE

 # increase the value when extraction is too slow
 # decrease the value when timeouts arise
-_CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
+_CUSTOM_PAGE_SIZE: dict[TableauRevampAsset, int] = {
     # for some clients, extraction of columns tend to hit the node limit
     # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
     # the workaround is to reduce pagination
@@ -20,21 +21,58 @@ _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
     TableauRevampAsset.TABLE: 50,
 }

+_TIMEOUT_MESSAGE = (
+    "Execution canceled because timeout of 30000 millis was reached"
+)
+
+_RETRY_BASE_MS = 10_000
+_RETRY_COUNT = 4
+
+
+def _check_errors(answer: dict) -> None:
+    """
+    handle errors in graphql response:
+    - return None when there's no errors in the answer
+    - TableauApiTimeout if any of the errors is a timeout
+    - TableauApiError (generic) otherwise
+    """
+    if "errors" not in answer:
+        return
+
+    errors = answer["errors"]
+
+    for error in errors:
+        if error.get("message") == _TIMEOUT_MESSAGE:
+            # we need specific handling for timeout issues (retry strategy)
+            raise TableauApiTimeout(errors)
+
+    raise TableauApiError(answer["errors"])
+

 def gql_query_scroll(
     server,
     query: str,
     resource: str,
 ) -> Iterator[SerializedAsset]:
-    """
+    """
+    Iterate over GQL query results, handling pagination and cursor

+    We have a retry strategy when timeout issues arise.
+    It's a known issue on Tableau side, still waiting for their fix:
+    https://issues.salesforce.com/issue/a028c00000zKahoAAC/undefined
+    """
+
+    @retry(
+        exceptions=(TableauApiTimeout,),
+        max_retries=_RETRY_COUNT,
+        base_ms=_RETRY_BASE_MS,
+    )
     def _call(cursor: Optional[str]) -> dict:
         # If cursor is defined it must be quoted else use null token
         token = "null" if cursor is None else f'"{cursor}"'
         query_ = query.replace("AFTER_TOKEN_SIGNAL", token)
         answer = server.metadata.query(query_)
-
-            raise TableauApiError(answer["errors"])
+        _check_errors(answer)
         return answer["data"][f"{resource}Connection"]

     cursor = None
@@ -58,8 +96,10 @@ class TableauClientMetadataApi:
     def __init__(
         self,
         server: TSC.Server,
+        override_page_size: Optional[int] = None,
     ):
         self._server = server
+        self._forced_page_size = override_page_size

     def _call(
         self,
@@ -75,9 +115,16 @@ class TableauClientMetadataApi:
         result_pages = gql_query_scroll(self._server, query, resource)
         return [asset for page in result_pages for asset in page]

+    def _page_size(self, asset: TableauRevampAsset) -> int:
+        return (
+            self._forced_page_size
+            or _CUSTOM_PAGE_SIZE.get(asset)
+            or DEFAULT_PAGE_SIZE
+        )
+
     def _fetch_fields(self) -> SerializedAsset:
         result: SerializedAsset = []
-        page_size =
+        page_size = self._page_size(TableauRevampAsset.FIELD)
         for resource, fields in FIELDS_QUERIES:
             current = self._call(resource, fields, page_size)
             result.extend(current)
@@ -90,6 +137,6 @@ class TableauClientMetadataApi:
         if asset == TableauRevampAsset.FIELD:
             return self._fetch_fields()

-        page_size =
+        page_size = self._page_size(asset)
         resource, fields = GQL_QUERIES[asset]
         return self._call(resource, fields, page_size)
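The new `_check_errors` / `TableauApiTimeout` pair follows a common pattern: classify GraphQL-level errors into a dedicated exception, then let a retry decorator re-run only that exception. The package's `retry` helper is called with `exceptions`, `max_retries` and `base_ms` arguments as shown above, but its internals are not part of this diff, so the self-contained sketch below rebuilds the idea with the standard library only (all names are illustrative):

import functools
import time

class ApiTimeout(Exception):
    """Raised when the server reports that the query timed out."""

def retry_on(exceptions: tuple, max_retries: int = 4, base_ms: int = 10_000):
    """Minimal retry decorator: waits between attempts, re-raises after the last one."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == max_retries:
                        raise
                    time.sleep(base_ms * (attempt + 1) / 1000)  # simple linear backoff
        return wrapper
    return decorator

def check_errors(answer: dict, timeout_message: str) -> None:
    """Raise ApiTimeout for timeout errors, a generic error for anything else."""
    errors = answer.get("errors")
    if not errors:
        return
    if any(e.get("message") == timeout_message for e in errors):
        raise ApiTimeout(errors)
    raise ValueError(errors)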
castor_extractor/visualization/tableau_revamp/client/client_rest_api.py
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import Optional

 import requests
 import tableauserverclient as TSC  # type: ignore
@@ -40,7 +40,7 @@ class TableauClientRestApi:
         return self._server.http_options["timeout"]

     @property
-    def headers(self) ->
+    def headers(self) -> dict[str, str]:
         return {"x-tableau-auth": self._server.auth_token}

     def _get_site_name(self) -> str:
@@ -52,7 +52,7 @@ class TableauClientRestApi:
         self,
         url: str,
         page_token: Optional[str] = None,
-    ) ->
+    ) -> dict:
         if page_token:
             url += f"?page_token={page_token}"

castor_extractor/visualization/tableau_revamp/client/client_tsc.py
@@ -1,4 +1,5 @@
-from
+from collections.abc import Iterable, Iterator
+from typing import Any

 import tableauserverclient as TSC  # type: ignore

@@ -30,7 +31,7 @@ class TableauClientTSC:
         self,
         data: Iterable,
         asset: TableauRevampAsset,
-    ) -> Iterator[
+    ) -> Iterator[dict]:
         keys = REST_FIELDS[asset]

         for row in data:
castor_extractor/visualization/tableau_revamp/client/errors.py
@@ -1,3 +1,8 @@
 class TableauApiError(ValueError):
     def __init__(self, error: str):
         super().__init__(f"Tableau API returned the following error: {error}")
+
+
+class TableauApiTimeout(ValueError):
+    def __init__(self, error: str):
+        super().__init__(f"Tableau API returned a timeout error: {error}")
castor_extractor/visualization/tableau_revamp/client/gql_queries.py
@@ -1,5 +1,3 @@
-from typing import Dict, Tuple
-
 from ..assets import TableauRevampAsset

 QUERY_TEMPLATE = """
@@ -130,7 +128,7 @@ workbook { id }
 """


-GQL_QUERIES:
+GQL_QUERIES: dict[TableauRevampAsset, tuple[str, str]] = {
     TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
     TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
     TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),

castor_extractor/visualization/tableau_revamp/client/rest_fields.py
@@ -1,9 +1,7 @@
-from typing import Dict, Set
-
 from ..assets import TableauRevampAsset

 # list of fields to pick in REST API or TSC responses
-REST_FIELDS:
+REST_FIELDS: dict[TableauRevampAsset, set[str]] = {
     TableauRevampAsset.DATASOURCE: {
         "id",
         "project_id",

castor_extractor/visualization/tableau_revamp/extract.py
@@ -1,5 +1,5 @@
 import logging
-from
+from collections.abc import Iterable

 from ...utils import (
     OUTPUT_DIR,
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)

 def iterate_all_data(
     client: TableauRevampClient,
-) -> Iterable[
+) -> Iterable[tuple[TableauRevampAsset, list]]:
     """Iterate over the extracted Data from Tableau"""

     for asset in TableauRevampAsset:
castor_extractor/visualization/thoughtspot/client/client.py
@@ -1,4 +1,5 @@
-from
+from collections.abc import Iterator
+from typing import Optional

 import requests

@@ -35,7 +36,7 @@ THOUGHTSPOT_SAFE_MODE = RequestSafeMode()


 class ThoughtspotBearerAuth(BearerAuth):
-    def __init__(self, host: str, token_payload:
+    def __init__(self, host: str, token_payload: dict[str, str]):
         auth_endpoint = ThoughtspotEndpointFactory.authentication()
         self.authentication_url = build_url(host, auth_endpoint)
         self.token_payload = token_payload

castor_extractor/visualization/thoughtspot/extract.py
@@ -1,5 +1,6 @@
 import logging
-from
+from collections.abc import Iterable, Iterator
+from typing import Union

 from ...utils import (
     OUTPUT_DIR,
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)

 def iterate_all_data(
     client: ThoughtspotClient,
-) -> Iterable[
+) -> Iterable[tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
     """Iterate over the extracted data from Thoughtspot"""

     for asset in ThoughtspotAsset:
castor_extractor/warehouse/abstract/asset.py
@@ -1,5 +1,4 @@
 from enum import Enum
-from typing import Dict, List, Set, Tuple

 from ...types import ExternalAsset, classproperty

@@ -26,7 +25,7 @@ class WarehouseAsset(ExternalAsset):
     VIEW_DDL = "view_ddl"

     @classproperty
-    def optional(cls) ->
+    def optional(cls) -> set["WarehouseAsset"]:
         return {
             WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
             WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
@@ -50,7 +49,7 @@ class WarehouseAssetGroup(Enum):


 # tuple of supported assets for each group (depends on the technology)
-SupportedAssets =
+SupportedAssets = dict[WarehouseAssetGroup, tuple[WarehouseAsset, ...]]

 # shared by all technologies
 CATALOG_ASSETS = (
@@ -80,13 +79,13 @@ NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}

 def extractable_asset_groups(
     supported_assets: SupportedAssets,
-) ->
+) -> list[tuple[WarehouseAsset, ...]]:
     """
     helper function to differentiate
     extractable assets vs supported (ingest-able) assets
     """
     groups = set(supported_assets).difference(NON_EXTRACTABLE_ASSETS)
-    extractable:
+    extractable: set[tuple[WarehouseAsset, ...]] = {
         supported_assets[group] for group in groups
     }
     return list(extractable)
castor_extractor/warehouse/abstract/extract.py
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterator
 from itertools import chain
-from typing import Callable,
+from typing import Callable, Optional

 from ...utils import (
     OUTPUT_DIR,
@@ -16,7 +17,7 @@ from .query import AbstractQueryBuilder, ExtractionQuery
 logger = logging.getLogger(__name__)


-def common_args(kwargs: dict) ->
+def common_args(kwargs: dict) -> tuple[str, bool]:
     """Args used by all technologies"""
     output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
     skip_existing = kwargs.get("skip_existing") or False
@@ -39,7 +40,7 @@ class SQLExtractionProcessor:
         self._safe_mode = safe_mode

     @staticmethod
-    def _unique(data: Iterator[dict]) ->
+    def _unique(data: Iterator[dict]) -> list[dict]:
         """
         Remove duplicate in the given data.
         Remark: this method implies loading all data in memory: it breaks the streaming pipeline !

castor_extractor/warehouse/abstract/query.py
@@ -1,7 +1,7 @@
 import inspect
 import os
 from abc import ABC, abstractmethod
-from typing import
+from typing import Optional

 from .asset import WarehouseAsset
 from .time_filter import TimeFilter
@@ -37,7 +37,7 @@ class AbstractQueryBuilder(ABC):
     def __init__(
         self,
         time_filter: Optional[TimeFilter],
-        duplicated: Optional[
+        duplicated: Optional[tuple[WarehouseAsset, ...]] = None,
     ):
         self._time_filter = time_filter or TimeFilter.default()
         self._duplicated = duplicated
@@ -55,7 +55,7 @@ class AbstractQueryBuilder(ABC):
         """read from a file located in queries directory"""
         root = os.path.dirname(inspect.getfile(self.__class__))
         path = os.path.join(root, QUERIES_DIR, filename)
-        with open(path
+        with open(path) as f:
             return f.read()

     def load_statement(self, asset: WarehouseAsset) -> str:
@@ -75,7 +75,7 @@ class AbstractQueryBuilder(ABC):
         return ExtractionQuery(statement, params)

     @abstractmethod
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         """
         Build the Query allowing extraction of the given asset
         - Most of the time, returns a single query
castor_extractor/warehouse/bigquery/client.py
@@ -1,6 +1,6 @@
 import itertools
 import logging
-from typing import
+from typing import Optional

 from google.api_core.exceptions import Forbidden  # type: ignore
 from google.cloud.bigquery import Client as GoogleCloudClient  # type: ignore
@@ -27,9 +27,9 @@ class BigQueryClient(SqlalchemyClient):
     def __init__(
         self,
         credentials: dict,
-        db_allowed: Optional[
-        db_blocked: Optional[
-        dataset_blocked: Optional[
+        db_allowed: Optional[set[str]] = None,
+        db_blocked: Optional[set[str]] = None,
+        dataset_blocked: Optional[set[str]] = None,
     ):
         super().__init__(credentials)
         self._db_allowed = db_allowed
@@ -37,8 +37,8 @@ class BigQueryClient(SqlalchemyClient):
         self._dataset_blocked = dataset_blocked
         self.credentials = self._credentials()
         self.client = self._client()
-        self._projects:
-        self._datasets:
+        self._projects: list[str] | None = None
+        self._datasets: list[Dataset] | None = None

     @staticmethod
     def name() -> str:
@@ -78,7 +78,7 @@ class BigQueryClient(SqlalchemyClient):
             credentials=self.credentials,
         )

-    def _list_datasets(self) ->
+    def _list_datasets(self) -> list[Dataset]:
         """
         Returns datasets available for the given GCP client
         Cache the result in self._datasets to reduce number of API calls
@@ -98,7 +98,7 @@ class BigQueryClient(SqlalchemyClient):
         base_ms=_RETRY_BASE_MS,
         log_exc_info=True,
     )
-    def get_projects(self) ->
+    def get_projects(self) -> list[str]:
         """
         Returns distinct project_id available for the given GCP client
         Cache the result in self._projects to reduce number of API calls.
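The docstrings above describe caching `_projects` and `_datasets` to limit API calls, which the `list[str] | None` initialisation supports. A generic sketch of that lazy-cache pattern (the class and the fake fetch below are illustrative, not the BigQuery client's real internals):

from typing import Optional

class ProjectRegistry:
    """Illustrative lazy cache: the expensive lookup runs at most once."""

    def __init__(self) -> None:
        self._projects: Optional[list[str]] = None

    def _fetch_projects(self) -> list[str]:
        # stands in for the real API call (e.g. listing GCP projects)
        return ["project-a", "project-b"]

    def get_projects(self) -> list[str]:
        if self._projects is None:
            self._projects = self._fetch_projects()
        return self._projects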
castor_extractor/warehouse/bigquery/extract.py
@@ -38,7 +38,7 @@ def _credentials(params: dict) -> dict:
     """extract GCP credentials"""
     path = params.get("credentials") or from_env(BIGQUERY_CREDENTIALS)
     logger.info(f"Credentials fetched from {path}")
-    with open(path
+    with open(path) as file:
         return cast(dict, json.load(file))


castor_extractor/warehouse/bigquery/query.py
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import Optional

 from ..abstract import (
     AbstractQueryBuilder,
@@ -109,7 +109,7 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
             else self._regions
         )

-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         """
         It would be easier to stitch data directly in the query statement (UNION ALL).
         Unfortunately, querying INFORMATION_SCHEMA on multiple regions