castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +4 -0
- castor_extractor/commands/__init__.py +0 -3
- castor_extractor/commands/file_check.py +1 -2
- castor_extractor/file_checker/column.py +5 -5
- castor_extractor/file_checker/file.py +7 -7
- castor_extractor/file_checker/file_test.py +2 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
- castor_extractor/knowledge/confluence/client/client.py +2 -1
- castor_extractor/knowledge/confluence/extract.py +3 -2
- castor_extractor/knowledge/notion/client/client.py +3 -2
- castor_extractor/knowledge/notion/extract.py +3 -2
- castor_extractor/quality/soda/client/client.py +2 -1
- castor_extractor/quality/soda/client/pagination.py +1 -3
- castor_extractor/types.py +3 -3
- castor_extractor/uploader/env.py +2 -2
- castor_extractor/uploader/upload.py +4 -3
- castor_extractor/uploader/utils.py +1 -1
- castor_extractor/utils/client/abstract.py +2 -1
- castor_extractor/utils/client/api/auth.py +2 -2
- castor_extractor/utils/client/api/auth_test.py +2 -2
- castor_extractor/utils/client/api/client.py +3 -3
- castor_extractor/utils/client/api/pagination.py +3 -2
- castor_extractor/utils/client/api/safe_request.py +5 -5
- castor_extractor/utils/collection.py +7 -11
- castor_extractor/utils/dbt/client.py +3 -3
- castor_extractor/utils/dbt/client_test.py +2 -2
- castor_extractor/utils/deprecate.py +1 -2
- castor_extractor/utils/files.py +5 -5
- castor_extractor/utils/formatter.py +5 -4
- castor_extractor/utils/json_stream_write.py +2 -1
- castor_extractor/utils/object.py +2 -1
- castor_extractor/utils/pager/pager.py +2 -4
- castor_extractor/utils/pager/pager_on_id.py +2 -1
- castor_extractor/utils/pager/pager_on_id_test.py +5 -5
- castor_extractor/utils/pager/pager_test.py +3 -3
- castor_extractor/utils/retry.py +4 -3
- castor_extractor/utils/retry_test.py +2 -3
- castor_extractor/utils/safe.py +3 -3
- castor_extractor/utils/salesforce/client.py +2 -1
- castor_extractor/utils/salesforce/credentials.py +1 -3
- castor_extractor/utils/store.py +2 -1
- castor_extractor/utils/string.py +2 -2
- castor_extractor/utils/string_test.py +1 -3
- castor_extractor/utils/type.py +3 -2
- castor_extractor/utils/validation.py +4 -4
- castor_extractor/utils/write.py +2 -2
- castor_extractor/visualization/domo/client/client.py +8 -7
- castor_extractor/visualization/domo/client/credentials.py +2 -2
- castor_extractor/visualization/domo/client/endpoints.py +2 -2
- castor_extractor/visualization/domo/extract.py +3 -2
- castor_extractor/visualization/looker/api/client.py +17 -16
- castor_extractor/visualization/looker/api/utils.py +2 -2
- castor_extractor/visualization/looker/assets.py +1 -3
- castor_extractor/visualization/looker/extract.py +4 -3
- castor_extractor/visualization/looker/fields.py +3 -3
- castor_extractor/visualization/looker/multithreading.py +3 -3
- castor_extractor/visualization/metabase/assets.py +1 -3
- castor_extractor/visualization/metabase/client/api/client.py +8 -7
- castor_extractor/visualization/metabase/extract.py +3 -2
- castor_extractor/visualization/metabase/types.py +1 -3
- castor_extractor/visualization/mode/client/client.py +6 -6
- castor_extractor/visualization/mode/extract.py +2 -2
- castor_extractor/visualization/powerbi/assets.py +1 -3
- castor_extractor/visualization/powerbi/client/client.py +12 -11
- castor_extractor/visualization/powerbi/client/credentials.py +3 -3
- castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
- castor_extractor/visualization/powerbi/extract.py +3 -2
- castor_extractor/visualization/qlik/assets.py +1 -3
- castor_extractor/visualization/qlik/client/constants.py +1 -3
- castor_extractor/visualization/qlik/client/engine/error.py +1 -3
- castor_extractor/visualization/qlik/client/master.py +3 -3
- castor_extractor/visualization/qlik/client/rest.py +12 -12
- castor_extractor/visualization/qlik/extract.py +4 -3
- castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
- castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
- castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
- castor_extractor/visualization/sigma/client/client.py +9 -8
- castor_extractor/visualization/sigma/client/credentials.py +1 -3
- castor_extractor/visualization/sigma/extract.py +3 -2
- castor_extractor/visualization/tableau/assets.py +1 -2
- castor_extractor/visualization/tableau/client/client.py +1 -2
- castor_extractor/visualization/tableau/client/client_utils.py +3 -2
- castor_extractor/visualization/tableau/client/credentials.py +3 -3
- castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
- castor_extractor/visualization/tableau/extract.py +2 -2
- castor_extractor/visualization/tableau/gql_fields.py +3 -3
- castor_extractor/visualization/tableau/tsc_fields.py +1 -2
- castor_extractor/visualization/tableau/types.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
- castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
- castor_extractor/visualization/tableau_revamp/extract.py +2 -2
- castor_extractor/visualization/thoughtspot/client/client.py +3 -2
- castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
- castor_extractor/visualization/thoughtspot/extract.py +3 -2
- castor_extractor/warehouse/abstract/asset.py +4 -5
- castor_extractor/warehouse/abstract/extract.py +4 -3
- castor_extractor/warehouse/abstract/query.py +4 -4
- castor_extractor/warehouse/bigquery/client.py +8 -8
- castor_extractor/warehouse/bigquery/extract.py +1 -1
- castor_extractor/warehouse/bigquery/query.py +2 -2
- castor_extractor/warehouse/bigquery/types.py +2 -4
- castor_extractor/warehouse/databricks/api_client.py +15 -14
- castor_extractor/warehouse/databricks/client.py +16 -16
- castor_extractor/warehouse/databricks/extract.py +4 -4
- castor_extractor/warehouse/databricks/format.py +12 -12
- castor_extractor/warehouse/databricks/lineage.py +11 -11
- castor_extractor/warehouse/databricks/pagination.py +2 -2
- castor_extractor/warehouse/databricks/types.py +4 -4
- castor_extractor/warehouse/databricks/utils.py +5 -4
- castor_extractor/warehouse/mysql/query.py +2 -2
- castor_extractor/warehouse/postgres/query.py +2 -2
- castor_extractor/warehouse/redshift/client.py +1 -1
- castor_extractor/warehouse/redshift/query.py +2 -2
- castor_extractor/warehouse/salesforce/client.py +8 -8
- castor_extractor/warehouse/salesforce/extract.py +3 -4
- castor_extractor/warehouse/salesforce/format.py +8 -7
- castor_extractor/warehouse/salesforce/format_test.py +2 -4
- castor_extractor/warehouse/snowflake/query.py +5 -5
- castor_extractor/warehouse/sqlserver/client.py +1 -1
- castor_extractor/warehouse/sqlserver/query.py +2 -2
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/METADATA +7 -6
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/RECORD +128 -128
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/WHEEL +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Iterator
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
4
|
from ....utils import SerializedAsset
|
|
4
5
|
from ..assets import TableauAsset
|
|
@@ -69,6 +70,6 @@ def query_scroll(
|
|
|
69
70
|
break
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
def extract_asset(asset:
|
|
73
|
+
def extract_asset(asset: dict, asset_type: TableauAsset) -> dict:
|
|
73
74
|
"""Agnostic function extracting dedicated attributes with define asset"""
|
|
74
75
|
return {key: getattr(asset, key) for key in TSC_FIELDS[asset_type]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Optional
|
|
3
3
|
|
|
4
4
|
from ....utils import from_env
|
|
5
5
|
|
|
@@ -20,7 +20,7 @@ class CredentialsKey(Enum):
|
|
|
20
20
|
TABLEAU_SERVER_URL = "server_url"
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
CREDENTIALS_ENV:
|
|
23
|
+
CREDENTIALS_ENV: dict[CredentialsKey, str] = {
|
|
24
24
|
CredentialsKey.TABLEAU_USER: "CASTOR_TABLEAU_USER",
|
|
25
25
|
CredentialsKey.TABLEAU_PASSWORD: "CASTOR_TABLEAU_PASSWORD",
|
|
26
26
|
CredentialsKey.TABLEAU_TOKEN_NAME: "CASTOR_TABLEAU_TOKEN_NAME",
|
|
@@ -89,7 +89,7 @@ class CredentialsApi:
|
|
|
89
89
|
CredentialsKey.TABLEAU_TOKEN: token,
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
-
def to_dict(self, hide: bool = False) ->
|
|
92
|
+
def to_dict(self, hide: bool = False) -> dict[str, str]:
|
|
93
93
|
safe = (
|
|
94
94
|
CredentialsKey.TABLEAU_USER,
|
|
95
95
|
CredentialsKey.TABLEAU_SITE_ID,
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Dict, List
|
|
3
2
|
|
|
4
3
|
import tableauserverclient as TSC # type: ignore
|
|
5
4
|
|
|
@@ -48,7 +47,7 @@ def safe_mode_fetch_usage(client) -> SerializedAsset:
|
|
|
48
47
|
Returns computed usages when page number is not found
|
|
49
48
|
Log errors if ServerResponseError is return
|
|
50
49
|
"""
|
|
51
|
-
list_usages:
|
|
50
|
+
list_usages: list[dict] = []
|
|
52
51
|
page_number: int = 0
|
|
53
52
|
|
|
54
53
|
while True:
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Iterable
|
|
3
3
|
|
|
4
4
|
from ...utils import (
|
|
5
5
|
OUTPUT_DIR,
|
|
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
|
|
|
19
19
|
|
|
20
20
|
def iterate_all_data(
|
|
21
21
|
client: Client,
|
|
22
|
-
) -> Iterable[
|
|
22
|
+
) -> Iterable[tuple[TableauAsset, list]]:
|
|
23
23
|
"""Iterate over the extracted Data from Tableau"""
|
|
24
24
|
|
|
25
25
|
logger.info("Extracting USER from Tableau API")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Fields which will be use for Tableau GraphQL API
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Union
|
|
4
4
|
|
|
5
5
|
from .assets import TableauAsset, TableauGraphqlAsset
|
|
6
6
|
|
|
@@ -189,9 +189,9 @@ class GQLQueryFields(Enum):
|
|
|
189
189
|
"""
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
QueryInfo =
|
|
192
|
+
QueryInfo = list[dict[str, Union[GQLQueryFields, TableauGraphqlAsset]]]
|
|
193
193
|
|
|
194
|
-
QUERY_FIELDS:
|
|
194
|
+
QUERY_FIELDS: dict[TableauAsset, QueryInfo] = {
|
|
195
195
|
TableauAsset.CUSTOM_SQL_TABLE: [
|
|
196
196
|
{
|
|
197
197
|
FIELDS: GQLQueryFields.CUSTOM_SQL_TABLE,
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
# TSC for TableauServerClient: basic REST API to extracting core objects
|
|
2
|
-
from typing import Dict, Set
|
|
3
2
|
|
|
4
3
|
from .assets import TableauAsset
|
|
5
4
|
|
|
6
5
|
# TSC fields extracted per assets
|
|
7
|
-
TSC_FIELDS:
|
|
6
|
+
TSC_FIELDS: dict[TableauAsset, set[str]] = {
|
|
8
7
|
TableauAsset.PROJECT: {
|
|
9
8
|
"id",
|
|
10
9
|
"name",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Union
|
|
2
2
|
|
|
3
3
|
from tableauserverclient import ServerResponseError # type: ignore
|
|
4
4
|
from typing_extensions import Literal
|
|
@@ -6,6 +6,6 @@ from typing_extensions import Literal
|
|
|
6
6
|
from .errors import TableauErrorCode
|
|
7
7
|
|
|
8
8
|
PageReturn = Union[
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
tuple[list[dict], Literal[None]],
|
|
10
|
+
tuple[Literal[None], Union[TableauErrorCode, ServerResponseError]],
|
|
11
11
|
]
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Iterator
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
4
|
import tableauserverclient as TSC # type: ignore
|
|
4
5
|
|
|
@@ -10,7 +11,7 @@ from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
|
|
|
10
11
|
|
|
11
12
|
# increase the value when extraction is too slow
|
|
12
13
|
# decrease the value when timeouts arise
|
|
13
|
-
_CUSTOM_PAGE_SIZE:
|
|
14
|
+
_CUSTOM_PAGE_SIZE: dict[TableauRevampAsset, int] = {
|
|
14
15
|
# for some clients, extraction of columns tend to hit the node limit
|
|
15
16
|
# https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
|
|
16
17
|
# the workaround is to reduce pagination
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Optional
|
|
3
3
|
|
|
4
4
|
import requests
|
|
5
5
|
import tableauserverclient as TSC # type: ignore
|
|
@@ -40,7 +40,7 @@ class TableauClientRestApi:
|
|
|
40
40
|
return self._server.http_options["timeout"]
|
|
41
41
|
|
|
42
42
|
@property
|
|
43
|
-
def headers(self) ->
|
|
43
|
+
def headers(self) -> dict[str, str]:
|
|
44
44
|
return {"x-tableau-auth": self._server.auth_token}
|
|
45
45
|
|
|
46
46
|
def _get_site_name(self) -> str:
|
|
@@ -52,7 +52,7 @@ class TableauClientRestApi:
|
|
|
52
52
|
self,
|
|
53
53
|
url: str,
|
|
54
54
|
page_token: Optional[str] = None,
|
|
55
|
-
) ->
|
|
55
|
+
) -> dict:
|
|
56
56
|
if page_token:
|
|
57
57
|
url += f"?page_token={page_token}"
|
|
58
58
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Iterable, Iterator
|
|
2
|
+
from typing import Any
|
|
2
3
|
|
|
3
4
|
import tableauserverclient as TSC # type: ignore
|
|
4
5
|
|
|
@@ -30,7 +31,7 @@ class TableauClientTSC:
|
|
|
30
31
|
self,
|
|
31
32
|
data: Iterable,
|
|
32
33
|
asset: TableauRevampAsset,
|
|
33
|
-
) -> Iterator[
|
|
34
|
+
) -> Iterator[dict]:
|
|
34
35
|
keys = REST_FIELDS[asset]
|
|
35
36
|
|
|
36
37
|
for row in data:
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import Dict, Tuple
|
|
2
|
-
|
|
3
1
|
from ..assets import TableauRevampAsset
|
|
4
2
|
|
|
5
3
|
QUERY_TEMPLATE = """
|
|
@@ -130,7 +128,7 @@ workbook { id }
|
|
|
130
128
|
"""
|
|
131
129
|
|
|
132
130
|
|
|
133
|
-
GQL_QUERIES:
|
|
131
|
+
GQL_QUERIES: dict[TableauRevampAsset, tuple[str, str]] = {
|
|
134
132
|
TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
|
|
135
133
|
TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
|
|
136
134
|
TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),
|
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
from typing import Dict, Set
|
|
2
|
-
|
|
3
1
|
from ..assets import TableauRevampAsset
|
|
4
2
|
|
|
5
3
|
# list of fields to pick in REST API or TSC responses
|
|
6
|
-
REST_FIELDS:
|
|
4
|
+
REST_FIELDS: dict[TableauRevampAsset, set[str]] = {
|
|
7
5
|
TableauRevampAsset.DATASOURCE: {
|
|
8
6
|
"id",
|
|
9
7
|
"project_id",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Iterable
|
|
3
3
|
|
|
4
4
|
from ...utils import (
|
|
5
5
|
OUTPUT_DIR,
|
|
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
|
|
19
19
|
def iterate_all_data(
|
|
20
20
|
client: TableauRevampClient,
|
|
21
|
-
) -> Iterable[
|
|
21
|
+
) -> Iterable[tuple[TableauRevampAsset, list]]:
|
|
22
22
|
"""Iterate over the extracted Data from Tableau"""
|
|
23
23
|
|
|
24
24
|
for asset in TableauRevampAsset:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Iterator
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
4
|
import requests
|
|
4
5
|
|
|
@@ -35,7 +36,7 @@ THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
|
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
class ThoughtspotBearerAuth(BearerAuth):
|
|
38
|
-
def __init__(self, host: str, token_payload:
|
|
39
|
+
def __init__(self, host: str, token_payload: dict[str, str]):
|
|
39
40
|
auth_endpoint = ThoughtspotEndpointFactory.authentication()
|
|
40
41
|
self.authentication_url = build_url(host, auth_endpoint)
|
|
41
42
|
self.token_payload = token_payload
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Iterable, Iterator
|
|
3
|
+
from typing import Union
|
|
3
4
|
|
|
4
5
|
from ...utils import (
|
|
5
6
|
OUTPUT_DIR,
|
|
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|
|
21
22
|
|
|
22
23
|
def iterate_all_data(
|
|
23
24
|
client: ThoughtspotClient,
|
|
24
|
-
) -> Iterable[
|
|
25
|
+
) -> Iterable[tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
|
|
25
26
|
"""Iterate over the extracted data from Thoughtspot"""
|
|
26
27
|
|
|
27
28
|
for asset in ThoughtspotAsset:
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import Dict, List, Set, Tuple
|
|
3
2
|
|
|
4
3
|
from ...types import ExternalAsset, classproperty
|
|
5
4
|
|
|
@@ -26,7 +25,7 @@ class WarehouseAsset(ExternalAsset):
|
|
|
26
25
|
VIEW_DDL = "view_ddl"
|
|
27
26
|
|
|
28
27
|
@classproperty
|
|
29
|
-
def optional(cls) ->
|
|
28
|
+
def optional(cls) -> set["WarehouseAsset"]:
|
|
30
29
|
return {
|
|
31
30
|
WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
|
|
32
31
|
WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
|
|
@@ -50,7 +49,7 @@ class WarehouseAssetGroup(Enum):
|
|
|
50
49
|
|
|
51
50
|
|
|
52
51
|
# tuple of supported assets for each group (depends on the technology)
|
|
53
|
-
SupportedAssets =
|
|
52
|
+
SupportedAssets = dict[WarehouseAssetGroup, tuple[WarehouseAsset, ...]]
|
|
54
53
|
|
|
55
54
|
# shared by all technologies
|
|
56
55
|
CATALOG_ASSETS = (
|
|
@@ -80,13 +79,13 @@ NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}
|
|
|
80
79
|
|
|
81
80
|
def extractable_asset_groups(
|
|
82
81
|
supported_assets: SupportedAssets,
|
|
83
|
-
) ->
|
|
82
|
+
) -> list[tuple[WarehouseAsset, ...]]:
|
|
84
83
|
"""
|
|
85
84
|
helper function to differentiate
|
|
86
85
|
extractable assets vs supported (ingest-able) assets
|
|
87
86
|
"""
|
|
88
87
|
groups = set(supported_assets).difference(NON_EXTRACTABLE_ASSETS)
|
|
89
|
-
extractable:
|
|
88
|
+
extractable: set[tuple[WarehouseAsset, ...]] = {
|
|
90
89
|
supported_assets[group] for group in groups
|
|
91
90
|
}
|
|
92
91
|
return list(extractable)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from collections.abc import Iterator
|
|
2
3
|
from itertools import chain
|
|
3
|
-
from typing import Callable,
|
|
4
|
+
from typing import Callable, Optional
|
|
4
5
|
|
|
5
6
|
from ...utils import (
|
|
6
7
|
OUTPUT_DIR,
|
|
@@ -16,7 +17,7 @@ from .query import AbstractQueryBuilder, ExtractionQuery
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def common_args(kwargs: dict) ->
|
|
20
|
+
def common_args(kwargs: dict) -> tuple[str, bool]:
|
|
20
21
|
"""Args used by all technologies"""
|
|
21
22
|
output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
|
|
22
23
|
skip_existing = kwargs.get("skip_existing") or False
|
|
@@ -39,7 +40,7 @@ class SQLExtractionProcessor:
|
|
|
39
40
|
self._safe_mode = safe_mode
|
|
40
41
|
|
|
41
42
|
@staticmethod
|
|
42
|
-
def _unique(data: Iterator[dict]) ->
|
|
43
|
+
def _unique(data: Iterator[dict]) -> list[dict]:
|
|
43
44
|
"""
|
|
44
45
|
Remove duplicate in the given data.
|
|
45
46
|
Remark: this method implies loading all data in memory: it breaks the streaming pipeline !
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
import os
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Optional
|
|
5
5
|
|
|
6
6
|
from .asset import WarehouseAsset
|
|
7
7
|
from .time_filter import TimeFilter
|
|
@@ -37,7 +37,7 @@ class AbstractQueryBuilder(ABC):
|
|
|
37
37
|
def __init__(
|
|
38
38
|
self,
|
|
39
39
|
time_filter: Optional[TimeFilter],
|
|
40
|
-
duplicated: Optional[
|
|
40
|
+
duplicated: Optional[tuple[WarehouseAsset, ...]] = None,
|
|
41
41
|
):
|
|
42
42
|
self._time_filter = time_filter or TimeFilter.default()
|
|
43
43
|
self._duplicated = duplicated
|
|
@@ -55,7 +55,7 @@ class AbstractQueryBuilder(ABC):
|
|
|
55
55
|
"""read from a file located in queries directory"""
|
|
56
56
|
root = os.path.dirname(inspect.getfile(self.__class__))
|
|
57
57
|
path = os.path.join(root, QUERIES_DIR, filename)
|
|
58
|
-
with open(path
|
|
58
|
+
with open(path) as f:
|
|
59
59
|
return f.read()
|
|
60
60
|
|
|
61
61
|
def load_statement(self, asset: WarehouseAsset) -> str:
|
|
@@ -75,7 +75,7 @@ class AbstractQueryBuilder(ABC):
|
|
|
75
75
|
return ExtractionQuery(statement, params)
|
|
76
76
|
|
|
77
77
|
@abstractmethod
|
|
78
|
-
def build(self, asset: WarehouseAsset) ->
|
|
78
|
+
def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
|
|
79
79
|
"""
|
|
80
80
|
Build the Query allowing extraction of the given asset
|
|
81
81
|
- Most of the time, returns a single query
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import logging
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Optional
|
|
4
4
|
|
|
5
5
|
from google.api_core.exceptions import Forbidden # type: ignore
|
|
6
6
|
from google.cloud.bigquery import Client as GoogleCloudClient # type: ignore
|
|
@@ -27,9 +27,9 @@ class BigQueryClient(SqlalchemyClient):
|
|
|
27
27
|
def __init__(
|
|
28
28
|
self,
|
|
29
29
|
credentials: dict,
|
|
30
|
-
db_allowed: Optional[
|
|
31
|
-
db_blocked: Optional[
|
|
32
|
-
dataset_blocked: Optional[
|
|
30
|
+
db_allowed: Optional[set[str]] = None,
|
|
31
|
+
db_blocked: Optional[set[str]] = None,
|
|
32
|
+
dataset_blocked: Optional[set[str]] = None,
|
|
33
33
|
):
|
|
34
34
|
super().__init__(credentials)
|
|
35
35
|
self._db_allowed = db_allowed
|
|
@@ -37,8 +37,8 @@ class BigQueryClient(SqlalchemyClient):
|
|
|
37
37
|
self._dataset_blocked = dataset_blocked
|
|
38
38
|
self.credentials = self._credentials()
|
|
39
39
|
self.client = self._client()
|
|
40
|
-
self._projects:
|
|
41
|
-
self._datasets:
|
|
40
|
+
self._projects: list[str] | None = None
|
|
41
|
+
self._datasets: list[Dataset] | None = None
|
|
42
42
|
|
|
43
43
|
@staticmethod
|
|
44
44
|
def name() -> str:
|
|
@@ -78,7 +78,7 @@ class BigQueryClient(SqlalchemyClient):
|
|
|
78
78
|
credentials=self.credentials,
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
def _list_datasets(self) ->
|
|
81
|
+
def _list_datasets(self) -> list[Dataset]:
|
|
82
82
|
"""
|
|
83
83
|
Returns datasets available for the given GCP client
|
|
84
84
|
Cache the result in self._datasets to reduce number of API calls
|
|
@@ -98,7 +98,7 @@ class BigQueryClient(SqlalchemyClient):
|
|
|
98
98
|
base_ms=_RETRY_BASE_MS,
|
|
99
99
|
log_exc_info=True,
|
|
100
100
|
)
|
|
101
|
-
def get_projects(self) ->
|
|
101
|
+
def get_projects(self) -> list[str]:
|
|
102
102
|
"""
|
|
103
103
|
Returns distinct project_id available for the given GCP client
|
|
104
104
|
Cache the result in self._projects to reduce number of API calls.
|
|
@@ -38,7 +38,7 @@ def _credentials(params: dict) -> dict:
|
|
|
38
38
|
"""extract GCP credentials"""
|
|
39
39
|
path = params.get("credentials") or from_env(BIGQUERY_CREDENTIALS)
|
|
40
40
|
logger.info(f"Credentials fetched from {path}")
|
|
41
|
-
with open(path
|
|
41
|
+
with open(path) as file:
|
|
42
42
|
return cast(dict, json.load(file))
|
|
43
43
|
|
|
44
44
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Optional
|
|
3
3
|
|
|
4
4
|
from ..abstract import (
|
|
5
5
|
AbstractQueryBuilder,
|
|
@@ -109,7 +109,7 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
|
|
|
109
109
|
else self._regions
|
|
110
110
|
)
|
|
111
111
|
|
|
112
|
-
def build(self, asset: WarehouseAsset) ->
|
|
112
|
+
def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
|
|
113
113
|
"""
|
|
114
114
|
It would be easier to stitch data directly in the query statement (UNION ALL).
|
|
115
115
|
Unfortunately, querying INFORMATION_SCHEMA on multiple regions
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from collections.abc import Iterator
|
|
2
3
|
from functools import partial
|
|
3
4
|
from http import HTTPStatus
|
|
4
|
-
from typing import
|
|
5
|
+
from typing import Optional
|
|
5
6
|
|
|
6
7
|
import requests
|
|
7
8
|
|
|
@@ -55,8 +56,8 @@ class DatabricksAPIClient(APIClient):
|
|
|
55
56
|
def __init__(
|
|
56
57
|
self,
|
|
57
58
|
credentials: DatabricksCredentials,
|
|
58
|
-
db_allowed: Optional[
|
|
59
|
-
db_blocked: Optional[
|
|
59
|
+
db_allowed: Optional[set[str]] = None,
|
|
60
|
+
db_blocked: Optional[set[str]] = None,
|
|
60
61
|
):
|
|
61
62
|
auth = DatabricksAuth(credentials)
|
|
62
63
|
super().__init__(
|
|
@@ -81,18 +82,18 @@ class DatabricksAPIClient(APIClient):
|
|
|
81
82
|
return False
|
|
82
83
|
return True
|
|
83
84
|
|
|
84
|
-
def databases(self) ->
|
|
85
|
+
def databases(self) -> list[dict]:
|
|
85
86
|
content = self._get(DatabricksEndpointFactory.databases())
|
|
86
87
|
_databases = self.formatter.format_database(content.get("catalogs", []))
|
|
87
88
|
return [d for d in _databases if self._keep_catalog(d["database_name"])]
|
|
88
89
|
|
|
89
|
-
def _schemas_of_database(self, database: dict) ->
|
|
90
|
+
def _schemas_of_database(self, database: dict) -> list[dict]:
|
|
90
91
|
payload = {"catalog_name": database["database_name"]}
|
|
91
92
|
content = self._get(DatabricksEndpointFactory.schemas(), params=payload)
|
|
92
93
|
schemas = content.get("schemas", [])
|
|
93
94
|
return self.formatter.format_schema(schemas, database)
|
|
94
95
|
|
|
95
|
-
def schemas(self, databases:
|
|
96
|
+
def schemas(self, databases: list[dict]) -> list[dict]:
|
|
96
97
|
"""
|
|
97
98
|
Get the databricks schemas (also sometimes called databases)
|
|
98
99
|
(which correspond to the schemas in Castor)
|
|
@@ -143,8 +144,8 @@ class DatabricksAPIClient(APIClient):
|
|
|
143
144
|
)
|
|
144
145
|
def get_single_column_lineage(
|
|
145
146
|
self,
|
|
146
|
-
names:
|
|
147
|
-
) ->
|
|
147
|
+
names: tuple[str, str],
|
|
148
|
+
) -> list[TimestampedLink]:
|
|
148
149
|
"""
|
|
149
150
|
Helper function used in get_lineage_links.
|
|
150
151
|
Call data lineage API and return the content of the result
|
|
@@ -172,7 +173,7 @@ class DatabricksAPIClient(APIClient):
|
|
|
172
173
|
)
|
|
173
174
|
def get_single_table_lineage(
|
|
174
175
|
self, table_path: str
|
|
175
|
-
) ->
|
|
176
|
+
) -> list[TimestampedLink]:
|
|
176
177
|
"""
|
|
177
178
|
Helper function used in get_lineage_links.
|
|
178
179
|
Call data lineage API and return the content of the result
|
|
@@ -210,7 +211,7 @@ class DatabricksAPIClient(APIClient):
|
|
|
210
211
|
queries = fetch_all_pages(request, DatabricksPagination)
|
|
211
212
|
return queries
|
|
212
213
|
|
|
213
|
-
def queries(self, time_filter: Optional[TimeFilter] = None) ->
|
|
214
|
+
def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
|
|
214
215
|
"""get all queries, hour per hour"""
|
|
215
216
|
time_range_filters = hourly_time_filters(time_filter)
|
|
216
217
|
raw_queries = []
|
|
@@ -220,14 +221,14 @@ class DatabricksAPIClient(APIClient):
|
|
|
220
221
|
raw_queries.extend(hourly)
|
|
221
222
|
return self.formatter.format_query(raw_queries)
|
|
222
223
|
|
|
223
|
-
def users(self) ->
|
|
224
|
+
def users(self) -> list[dict]:
|
|
224
225
|
"""
|
|
225
226
|
retrieve user from api
|
|
226
227
|
"""
|
|
227
228
|
content = self._get(DatabricksEndpointFactory.users())
|
|
228
229
|
return self.formatter.format_user(content.get("Resources", []))
|
|
229
230
|
|
|
230
|
-
def _view_ddl_per_schema(self, schema: dict) ->
|
|
231
|
+
def _view_ddl_per_schema(self, schema: dict) -> list[dict]:
|
|
231
232
|
payload = {
|
|
232
233
|
"catalog_name": schema["database_id"],
|
|
233
234
|
"schema_name": schema["schema_name"],
|
|
@@ -236,9 +237,9 @@ class DatabricksAPIClient(APIClient):
|
|
|
236
237
|
content = self._get(DatabricksEndpointFactory.tables(), params=payload)
|
|
237
238
|
return self.formatter.format_view_ddl(content.get("tables", []), schema)
|
|
238
239
|
|
|
239
|
-
def view_ddl(self, schemas:
|
|
240
|
+
def view_ddl(self, schemas: list[dict]) -> list[dict]:
|
|
240
241
|
"""retrieve view ddl"""
|
|
241
|
-
view_ddl:
|
|
242
|
+
view_ddl: list[dict] = []
|
|
242
243
|
for schema in schemas:
|
|
243
244
|
v_to_add = self._view_ddl_per_schema(schema)
|
|
244
245
|
view_ddl.extend(v_to_add)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from concurrent.futures import ThreadPoolExecutor
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Optional
|
|
4
4
|
|
|
5
5
|
from ...utils import (
|
|
6
6
|
mapping_from_rows,
|
|
@@ -25,8 +25,8 @@ class DatabricksClient:
|
|
|
25
25
|
def __init__(
|
|
26
26
|
self,
|
|
27
27
|
credentials: DatabricksCredentials,
|
|
28
|
-
db_allowed: Optional[
|
|
29
|
-
db_blocked: Optional[
|
|
28
|
+
db_allowed: Optional[set[str]] = None,
|
|
29
|
+
db_blocked: Optional[set[str]] = None,
|
|
30
30
|
has_table_tags: bool = False,
|
|
31
31
|
has_column_tags: bool = False,
|
|
32
32
|
):
|
|
@@ -58,26 +58,26 @@ class DatabricksClient:
|
|
|
58
58
|
return {**table, "owner_external_id": owner_external_id}
|
|
59
59
|
|
|
60
60
|
@staticmethod
|
|
61
|
-
def _get_user_mapping(users:
|
|
61
|
+
def _get_user_mapping(users: list[dict]) -> dict:
|
|
62
62
|
return {
|
|
63
63
|
**mapping_from_rows(users, "email", "id"),
|
|
64
64
|
**mapping_from_rows(users, "user_name", "id"),
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
def schemas(self, databases:
|
|
67
|
+
def schemas(self, databases: list[dict]) -> list[dict]:
|
|
68
68
|
return self.api_client.schemas(databases)
|
|
69
69
|
|
|
70
|
-
def databases(self) ->
|
|
70
|
+
def databases(self) -> list[dict]:
|
|
71
71
|
return self.api_client.databases()
|
|
72
72
|
|
|
73
73
|
def tables_and_columns(
|
|
74
|
-
self, schemas:
|
|
74
|
+
self, schemas: list[dict], users: list[dict]
|
|
75
75
|
) -> TablesColumns:
|
|
76
76
|
"""
|
|
77
77
|
Get the databricks tables & columns leveraging the unity catalog API
|
|
78
78
|
"""
|
|
79
|
-
tables:
|
|
80
|
-
columns:
|
|
79
|
+
tables: list[dict] = []
|
|
80
|
+
columns: list[dict] = []
|
|
81
81
|
user_mapping = self._get_user_mapping(users)
|
|
82
82
|
table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
|
|
83
83
|
column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
|
|
@@ -95,7 +95,7 @@ class DatabricksClient:
|
|
|
95
95
|
columns.extend(c_to_add)
|
|
96
96
|
return tables, columns
|
|
97
97
|
|
|
98
|
-
def table_lineage(self, tables:
|
|
98
|
+
def table_lineage(self, tables: list[dict]) -> list[dict]:
|
|
99
99
|
"""
|
|
100
100
|
Wrapper function that retrieves all table lineage
|
|
101
101
|
"""
|
|
@@ -113,8 +113,8 @@ class DatabricksClient:
|
|
|
113
113
|
return self.formatter.format_lineage(deduplicated)
|
|
114
114
|
|
|
115
115
|
def column_lineage(
|
|
116
|
-
self, tables:
|
|
117
|
-
) ->
|
|
116
|
+
self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
|
|
117
|
+
) -> list[dict]:
|
|
118
118
|
"""
|
|
119
119
|
Wrapper function that retrieves all column lineage
|
|
120
120
|
we only try to retrieve column lineage if we found table lineage
|
|
@@ -129,17 +129,17 @@ class DatabricksClient:
|
|
|
129
129
|
results = executor.map(
|
|
130
130
|
self.api_client.get_single_column_lineage, candidate_paths
|
|
131
131
|
)
|
|
132
|
-
lineages:
|
|
132
|
+
lineages: list[TimestampedLink] = [
|
|
133
133
|
link for links in results for link in links
|
|
134
134
|
]
|
|
135
135
|
deduplicated = deduplicate_lineage(lineages)
|
|
136
136
|
return self.formatter.format_lineage(deduplicated)
|
|
137
137
|
|
|
138
|
-
def queries(self, time_filter: Optional[TimeFilter] = None) ->
|
|
138
|
+
def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
|
|
139
139
|
return self.api_client.queries(time_filter)
|
|
140
140
|
|
|
141
|
-
def users(self) ->
|
|
141
|
+
def users(self) -> list[dict]:
|
|
142
142
|
return self.api_client.users()
|
|
143
143
|
|
|
144
|
-
def view_ddl(self, schemas:
|
|
144
|
+
def view_ddl(self, schemas: list[dict]) -> list[dict]:
|
|
145
145
|
return self.api_client.view_ddl(schemas)
|