castor-extractor 0.21.9-py3-none-any.whl → 0.22.1-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
This version of castor-extractor has been flagged as potentially problematic.
- CHANGELOG.md +8 -0
- castor_extractor/commands/__init__.py +0 -3
- castor_extractor/commands/file_check.py +1 -2
- castor_extractor/file_checker/column.py +5 -5
- castor_extractor/file_checker/file.py +7 -7
- castor_extractor/file_checker/file_test.py +2 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
- castor_extractor/knowledge/confluence/client/client.py +2 -1
- castor_extractor/knowledge/confluence/extract.py +3 -2
- castor_extractor/knowledge/notion/client/client.py +3 -2
- castor_extractor/knowledge/notion/extract.py +3 -2
- castor_extractor/quality/soda/client/client.py +2 -1
- castor_extractor/quality/soda/client/pagination.py +1 -3
- castor_extractor/types.py +3 -3
- castor_extractor/uploader/env.py +2 -2
- castor_extractor/uploader/upload.py +4 -3
- castor_extractor/uploader/utils.py +1 -1
- castor_extractor/utils/__init__.py +1 -0
- castor_extractor/utils/client/abstract.py +2 -1
- castor_extractor/utils/client/api/auth.py +2 -2
- castor_extractor/utils/client/api/auth_test.py +2 -2
- castor_extractor/utils/client/api/client.py +3 -3
- castor_extractor/utils/client/api/pagination.py +3 -2
- castor_extractor/utils/client/api/safe_request.py +5 -5
- castor_extractor/utils/collection.py +7 -11
- castor_extractor/utils/dbt/client.py +3 -3
- castor_extractor/utils/dbt/client_test.py +2 -2
- castor_extractor/utils/deprecate.py +1 -2
- castor_extractor/utils/files.py +5 -5
- castor_extractor/utils/formatter.py +5 -4
- castor_extractor/utils/json_stream_write.py +2 -1
- castor_extractor/utils/object.py +2 -1
- castor_extractor/utils/pager/pager.py +2 -4
- castor_extractor/utils/pager/pager_on_id.py +2 -1
- castor_extractor/utils/pager/pager_on_id_test.py +5 -5
- castor_extractor/utils/pager/pager_test.py +3 -3
- castor_extractor/utils/retry.py +4 -3
- castor_extractor/utils/retry_test.py +2 -3
- castor_extractor/utils/safe.py +3 -3
- castor_extractor/utils/salesforce/client.py +2 -1
- castor_extractor/utils/salesforce/credentials.py +1 -3
- castor_extractor/utils/store.py +2 -1
- castor_extractor/utils/string.py +2 -2
- castor_extractor/utils/string_test.py +1 -3
- castor_extractor/utils/time.py +4 -0
- castor_extractor/utils/time_test.py +8 -1
- castor_extractor/utils/type.py +3 -2
- castor_extractor/utils/validation.py +4 -4
- castor_extractor/utils/write.py +2 -2
- castor_extractor/visualization/domo/client/client.py +8 -7
- castor_extractor/visualization/domo/client/credentials.py +2 -2
- castor_extractor/visualization/domo/client/endpoints.py +2 -2
- castor_extractor/visualization/domo/extract.py +3 -2
- castor_extractor/visualization/looker/api/client.py +17 -16
- castor_extractor/visualization/looker/api/utils.py +2 -2
- castor_extractor/visualization/looker/assets.py +1 -3
- castor_extractor/visualization/looker/extract.py +4 -3
- castor_extractor/visualization/looker/fields.py +3 -3
- castor_extractor/visualization/looker/multithreading.py +3 -3
- castor_extractor/visualization/looker_studio/__init__.py +6 -0
- castor_extractor/visualization/looker_studio/assets.py +6 -0
- castor_extractor/visualization/looker_studio/client/__init__.py +3 -0
- castor_extractor/visualization/looker_studio/client/admin_sdk_client.py +90 -0
- castor_extractor/visualization/looker_studio/client/client.py +37 -0
- castor_extractor/visualization/looker_studio/client/credentials.py +20 -0
- castor_extractor/visualization/looker_studio/client/endpoints.py +18 -0
- castor_extractor/visualization/looker_studio/client/enums.py +8 -0
- castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py +102 -0
- castor_extractor/visualization/looker_studio/client/pagination.py +31 -0
- castor_extractor/visualization/looker_studio/client/scopes.py +6 -0
- castor_extractor/visualization/metabase/assets.py +1 -3
- castor_extractor/visualization/metabase/client/api/client.py +8 -7
- castor_extractor/visualization/metabase/extract.py +3 -2
- castor_extractor/visualization/metabase/types.py +1 -3
- castor_extractor/visualization/mode/client/client.py +6 -6
- castor_extractor/visualization/mode/extract.py +2 -2
- castor_extractor/visualization/powerbi/assets.py +1 -3
- castor_extractor/visualization/powerbi/client/client.py +12 -11
- castor_extractor/visualization/powerbi/client/credentials.py +3 -3
- castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
- castor_extractor/visualization/powerbi/extract.py +3 -2
- castor_extractor/visualization/qlik/assets.py +1 -3
- castor_extractor/visualization/qlik/client/constants.py +1 -3
- castor_extractor/visualization/qlik/client/engine/error.py +1 -3
- castor_extractor/visualization/qlik/client/master.py +3 -3
- castor_extractor/visualization/qlik/client/rest.py +12 -12
- castor_extractor/visualization/qlik/extract.py +4 -3
- castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
- castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
- castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
- castor_extractor/visualization/sigma/client/client.py +9 -8
- castor_extractor/visualization/sigma/client/credentials.py +1 -3
- castor_extractor/visualization/sigma/extract.py +3 -2
- castor_extractor/visualization/tableau/assets.py +1 -2
- castor_extractor/visualization/tableau/client/client.py +1 -2
- castor_extractor/visualization/tableau/client/client_utils.py +3 -2
- castor_extractor/visualization/tableau/client/credentials.py +3 -3
- castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
- castor_extractor/visualization/tableau/extract.py +2 -2
- castor_extractor/visualization/tableau/gql_fields.py +3 -3
- castor_extractor/visualization/tableau/tsc_fields.py +1 -2
- castor_extractor/visualization/tableau/types.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
- castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
- castor_extractor/visualization/tableau_revamp/extract.py +2 -2
- castor_extractor/visualization/thoughtspot/client/client.py +3 -2
- castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
- castor_extractor/visualization/thoughtspot/extract.py +3 -2
- castor_extractor/warehouse/abstract/asset.py +4 -5
- castor_extractor/warehouse/abstract/extract.py +4 -3
- castor_extractor/warehouse/abstract/query.py +4 -4
- castor_extractor/warehouse/bigquery/client.py +8 -8
- castor_extractor/warehouse/bigquery/extract.py +1 -1
- castor_extractor/warehouse/bigquery/query.py +2 -2
- castor_extractor/warehouse/bigquery/types.py +2 -4
- castor_extractor/warehouse/databricks/api_client.py +15 -14
- castor_extractor/warehouse/databricks/client.py +16 -16
- castor_extractor/warehouse/databricks/extract.py +4 -4
- castor_extractor/warehouse/databricks/format.py +12 -12
- castor_extractor/warehouse/databricks/lineage.py +11 -11
- castor_extractor/warehouse/databricks/pagination.py +2 -2
- castor_extractor/warehouse/databricks/types.py +4 -4
- castor_extractor/warehouse/databricks/utils.py +5 -4
- castor_extractor/warehouse/mysql/query.py +2 -2
- castor_extractor/warehouse/postgres/query.py +2 -2
- castor_extractor/warehouse/redshift/client.py +1 -1
- castor_extractor/warehouse/redshift/query.py +2 -2
- castor_extractor/warehouse/salesforce/client.py +8 -8
- castor_extractor/warehouse/salesforce/extract.py +3 -4
- castor_extractor/warehouse/salesforce/format.py +19 -11
- castor_extractor/warehouse/salesforce/format_test.py +24 -10
- castor_extractor/warehouse/snowflake/query.py +5 -5
- castor_extractor/warehouse/sqlserver/client.py +1 -1
- castor_extractor/warehouse/sqlserver/query.py +2 -2
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.1.dist-info}/METADATA +13 -6
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.1.dist-info}/RECORD +142 -131
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.1.dist-info}/LICENCE +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.1.dist-info}/WHEEL +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.1.dist-info}/entry_points.txt +0 -0
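Across these files the dominant change is mechanical: `typing.List`/`Tuple`/`Dict`/`Set` annotations are replaced by the builtin generics of PEP 585 (`list`, `tuple`, `dict`, `set`), abstract containers such as `Iterator` and `Iterable` now come from `collections.abc`, and a few spots use PEP 604 `X | None` unions. The warehouse hunks are reproduced below (the removed `-` side of many lines is truncated in the source diff). A minimal before/after sketch of the pattern, with illustrative names not taken from the package:

```python
from typing import Dict, List, Optional

# before (pre-PEP 585 style): generics imported from typing
def format_rows_old(rows: List[Dict]) -> Optional[List[Dict]]:
    return rows or None

# after: builtin generics (Python 3.9+); Optional kept, or spelled "X | None"
def format_rows_new(rows: list[dict]) -> Optional[list[dict]]:
    return rows or None
```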
castor_extractor/warehouse/abstract/extract.py

```diff
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterator
 from itertools import chain
-from typing import Callable,
+from typing import Callable, Optional

 from ...utils import (
     OUTPUT_DIR,
@@ -16,7 +17,7 @@ from .query import AbstractQueryBuilder, ExtractionQuery
 logger = logging.getLogger(__name__)


-def common_args(kwargs: dict) ->
+def common_args(kwargs: dict) -> tuple[str, bool]:
     """Args used by all technologies"""
     output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
     skip_existing = kwargs.get("skip_existing") or False
@@ -39,7 +40,7 @@ class SQLExtractionProcessor:
         self._safe_mode = safe_mode

     @staticmethod
-    def _unique(data: Iterator[dict]) ->
+    def _unique(data: Iterator[dict]) -> list[dict]:
         """
         Remove duplicate in the given data.
         Remark: this method implies loading all data in memory: it breaks the streaming pipeline !
```
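`_unique` now advertises that it collapses the incoming iterator into a `list[dict]`; its body is not part of the hunk. A minimal sketch of an in-memory dedup matching that signature (hypothetical implementation, assuming rows are JSON-serialisable):

```python
import json
from collections.abc import Iterator

def _unique(data: Iterator[dict]) -> list[dict]:
    # Keyed on a canonical serialisation; materialises everything,
    # which is exactly the streaming break the docstring warns about.
    seen: dict[str, dict] = {}
    for row in data:
        seen.setdefault(json.dumps(row, sort_keys=True), row)
    return list(seen.values())
```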
castor_extractor/warehouse/abstract/query.py

```diff
@@ -1,7 +1,7 @@
 import inspect
 import os
 from abc import ABC, abstractmethod
-from typing import
+from typing import Optional

 from .asset import WarehouseAsset
 from .time_filter import TimeFilter
@@ -37,7 +37,7 @@ class AbstractQueryBuilder(ABC):
     def __init__(
         self,
         time_filter: Optional[TimeFilter],
-        duplicated: Optional[
+        duplicated: Optional[tuple[WarehouseAsset, ...]] = None,
     ):
         self._time_filter = time_filter or TimeFilter.default()
         self._duplicated = duplicated
@@ -55,7 +55,7 @@ class AbstractQueryBuilder(ABC):
         """read from a file located in queries directory"""
         root = os.path.dirname(inspect.getfile(self.__class__))
         path = os.path.join(root, QUERIES_DIR, filename)
-        with open(path
+        with open(path) as f:
             return f.read()

     def load_statement(self, asset: WarehouseAsset) -> str:
@@ -75,7 +75,7 @@ class AbstractQueryBuilder(ABC):
         return ExtractionQuery(statement, params)

     @abstractmethod
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         """
         Build the Query allowing extraction of the given asset
         - Most of the time, returns a single query
```
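Two tuple forms appear side by side here: `common_args` (in extract.py above) returns a fixed-arity `tuple[str, bool]`, while `duplicated` takes the variadic form `tuple[WarehouseAsset, ...]`, a homogeneous tuple of any length. A self-contained illustration with a stand-in enum:

```python
from enum import Enum
from typing import Optional

class Asset(Enum):  # stand-in for WarehouseAsset
    TABLE = "table"
    COLUMN = "column"

# variadic: any length, all items the same type
dup: Optional[tuple[Asset, ...]] = (Asset.TABLE, Asset.COLUMN)

# fixed arity: one type per position (cf. common_args's tuple[str, bool])
pair: tuple[str, bool] = ("output_dir", False)
```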
castor_extractor/warehouse/bigquery/client.py

```diff
@@ -1,6 +1,6 @@
 import itertools
 import logging
-from typing import
+from typing import Optional

 from google.api_core.exceptions import Forbidden  # type: ignore
 from google.cloud.bigquery import Client as GoogleCloudClient  # type: ignore
@@ -27,9 +27,9 @@ class BigQueryClient(SqlalchemyClient):
     def __init__(
         self,
         credentials: dict,
-        db_allowed: Optional[
-        db_blocked: Optional[
-        dataset_blocked: Optional[
+        db_allowed: Optional[set[str]] = None,
+        db_blocked: Optional[set[str]] = None,
+        dataset_blocked: Optional[set[str]] = None,
     ):
         super().__init__(credentials)
         self._db_allowed = db_allowed
@@ -37,8 +37,8 @@ class BigQueryClient(SqlalchemyClient):
         self._dataset_blocked = dataset_blocked
         self.credentials = self._credentials()
         self.client = self._client()
-        self._projects:
-        self._datasets:
+        self._projects: list[str] | None = None
+        self._datasets: list[Dataset] | None = None

     @staticmethod
     def name() -> str:
@@ -78,7 +78,7 @@ class BigQueryClient(SqlalchemyClient):
             credentials=self.credentials,
         )

-    def _list_datasets(self) ->
+    def _list_datasets(self) -> list[Dataset]:
         """
         Returns datasets available for the given GCP client
         Cache the result in self._datasets to reduce number of API calls
@@ -98,7 +98,7 @@ class BigQueryClient(SqlalchemyClient):
         base_ms=_RETRY_BASE_MS,
         log_exc_info=True,
     )
-    def get_projects(self) ->
+    def get_projects(self) -> list[str]:
         """
         Returns distinct project_id available for the given GCP client
         Cache the result in self._projects to reduce number of API calls.
```
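This hunk mixes two spellings of the same optional type: `Optional[set[str]]` in the constructor and `list[str] | None` for the caches. They are equivalent; the unquoted `X | None` form needs Python 3.10+ at runtime (or `from __future__ import annotations`). For example:

```python
from typing import Optional

def cache_a(projects: Optional[list[str]] = None) -> list[str]:
    return projects or []

def cache_b(projects: "list[str] | None" = None) -> list[str]:
    # same type as above; quoting keeps it valid on Python 3.9 too
    return projects or []
```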
castor_extractor/warehouse/bigquery/extract.py

```diff
@@ -38,7 +38,7 @@ def _credentials(params: dict) -> dict:
     """extract GCP credentials"""
     path = params.get("credentials") or from_env(BIGQUERY_CREDENTIALS)
     logger.info(f"Credentials fetched from {path}")
-    with open(path
+    with open(path) as file:
         return cast(dict, json.load(file))


```
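Here, as in `AbstractQueryBuilder` above, the explicit file mode is dropped: `open(path)` already defaults to `"r"` (text read), so the behaviour is unchanged. For example, assuming a local `creds.json`:

```python
# Both open the file for text-mode reading with identical semantics:
with open("creds.json") as f:          # mode defaults to "r"
    data_a = f.read()
with open("creds.json", "r") as f:
    data_b = f.read()
assert data_a == data_b
```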
castor_extractor/warehouse/bigquery/query.py

```diff
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import Optional

 from ..abstract import (
     AbstractQueryBuilder,
@@ -109,7 +109,7 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
             else self._regions
         )

-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         """
         It would be easier to stitch data directly in the query statement (UNION ALL).
         Unfortunately, querying INFORMATION_SCHEMA on multiple regions
```
castor_extractor/warehouse/databricks/api_client.py

```diff
@@ -1,7 +1,8 @@
 import logging
+from collections.abc import Iterator
 from functools import partial
 from http import HTTPStatus
-from typing import
+from typing import Optional

 import requests

@@ -55,8 +56,8 @@ class DatabricksAPIClient(APIClient):
     def __init__(
         self,
         credentials: DatabricksCredentials,
-        db_allowed: Optional[
-        db_blocked: Optional[
+        db_allowed: Optional[set[str]] = None,
+        db_blocked: Optional[set[str]] = None,
     ):
         auth = DatabricksAuth(credentials)
         super().__init__(
@@ -81,18 +82,18 @@ class DatabricksAPIClient(APIClient):
             return False
         return True

-    def databases(self) ->
+    def databases(self) -> list[dict]:
         content = self._get(DatabricksEndpointFactory.databases())
         _databases = self.formatter.format_database(content.get("catalogs", []))
         return [d for d in _databases if self._keep_catalog(d["database_name"])]

-    def _schemas_of_database(self, database: dict) ->
+    def _schemas_of_database(self, database: dict) -> list[dict]:
         payload = {"catalog_name": database["database_name"]}
         content = self._get(DatabricksEndpointFactory.schemas(), params=payload)
         schemas = content.get("schemas", [])
         return self.formatter.format_schema(schemas, database)

-    def schemas(self, databases:
+    def schemas(self, databases: list[dict]) -> list[dict]:
         """
         Get the databricks schemas (also sometimes called databases)
         (which correspond to the schemas in Castor)
@@ -143,8 +144,8 @@ class DatabricksAPIClient(APIClient):
     )
     def get_single_column_lineage(
         self,
-        names:
-    ) ->
+        names: tuple[str, str],
+    ) -> list[TimestampedLink]:
         """
         Helper function used in get_lineage_links.
         Call data lineage API and return the content of the result
@@ -172,7 +173,7 @@ class DatabricksAPIClient(APIClient):
     )
     def get_single_table_lineage(
         self, table_path: str
-    ) ->
+    ) -> list[TimestampedLink]:
         """
         Helper function used in get_lineage_links.
         Call data lineage API and return the content of the result
@@ -210,7 +211,7 @@ class DatabricksAPIClient(APIClient):
         queries = fetch_all_pages(request, DatabricksPagination)
         return queries

-    def queries(self, time_filter: Optional[TimeFilter] = None) ->
+    def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
         """get all queries, hour per hour"""
         time_range_filters = hourly_time_filters(time_filter)
         raw_queries = []
@@ -220,14 +221,14 @@ class DatabricksAPIClient(APIClient):
             raw_queries.extend(hourly)
         return self.formatter.format_query(raw_queries)

-    def users(self) ->
+    def users(self) -> list[dict]:
         """
         retrieve user from api
         """
         content = self._get(DatabricksEndpointFactory.users())
         return self.formatter.format_user(content.get("Resources", []))

-    def _view_ddl_per_schema(self, schema: dict) ->
+    def _view_ddl_per_schema(self, schema: dict) -> list[dict]:
         payload = {
             "catalog_name": schema["database_id"],
             "schema_name": schema["schema_name"],
@@ -236,9 +237,9 @@ class DatabricksAPIClient(APIClient):
         content = self._get(DatabricksEndpointFactory.tables(), params=payload)
         return self.formatter.format_view_ddl(content.get("tables", []), schema)

-    def view_ddl(self, schemas:
+    def view_ddl(self, schemas: list[dict]) -> list[dict]:
         """retrieve view ddl"""
-        view_ddl:
+        view_ddl: list[dict] = []
         for schema in schemas:
             v_to_add = self._view_ddl_per_schema(schema)
             view_ddl.extend(v_to_add)
```
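`queries` pulls history "hour per hour": `hourly_time_filters` (not shown in the diff) apparently slices the requested window into hourly ranges so each API call stays small, and `_day_hour_to_epoch_ms` in utils.py (further down) converts a day and hour to epoch milliseconds. A rough sketch of that slicing; only the `_day_hour_to_epoch_ms` signature comes from the diff, the implementation and the UTC choice are my assumptions:

```python
from datetime import date, datetime, timedelta, timezone

def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
    # midnight of `day` plus `hour` hours, as epoch milliseconds (UTC assumed)
    start = datetime(day.year, day.month, day.day, tzinfo=timezone.utc)
    return int((start + timedelta(hours=hour)).timestamp() * 1000)

def hourly_ranges(day: date) -> list[tuple[int, int]]:
    # 24 consecutive [start_ms, end_ms) windows covering the day
    return [
        (_day_hour_to_epoch_ms(day, h), _day_hour_to_epoch_ms(day, h + 1))
        for h in range(24)
    ]
```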
castor_extractor/warehouse/databricks/client.py

```diff
@@ -1,6 +1,6 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor
-from typing import
+from typing import Optional

 from ...utils import (
     mapping_from_rows,
@@ -25,8 +25,8 @@ class DatabricksClient:
     def __init__(
         self,
         credentials: DatabricksCredentials,
-        db_allowed: Optional[
-        db_blocked: Optional[
+        db_allowed: Optional[set[str]] = None,
+        db_blocked: Optional[set[str]] = None,
         has_table_tags: bool = False,
         has_column_tags: bool = False,
     ):
@@ -58,26 +58,26 @@ class DatabricksClient:
         return {**table, "owner_external_id": owner_external_id}

     @staticmethod
-    def _get_user_mapping(users:
+    def _get_user_mapping(users: list[dict]) -> dict:
         return {
             **mapping_from_rows(users, "email", "id"),
             **mapping_from_rows(users, "user_name", "id"),
         }

-    def schemas(self, databases:
+    def schemas(self, databases: list[dict]) -> list[dict]:
         return self.api_client.schemas(databases)

-    def databases(self) ->
+    def databases(self) -> list[dict]:
         return self.api_client.databases()

     def tables_and_columns(
-        self, schemas:
+        self, schemas: list[dict], users: list[dict]
     ) -> TablesColumns:
         """
         Get the databricks tables & columns leveraging the unity catalog API
         """
-        tables:
-        columns:
+        tables: list[dict] = []
+        columns: list[dict] = []
         user_mapping = self._get_user_mapping(users)
         table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
         column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
@@ -95,7 +95,7 @@ class DatabricksClient:
             columns.extend(c_to_add)
         return tables, columns

-    def table_lineage(self, tables:
+    def table_lineage(self, tables: list[dict]) -> list[dict]:
         """
         Wrapper function that retrieves all table lineage
         """
@@ -113,8 +113,8 @@ class DatabricksClient:
         return self.formatter.format_lineage(deduplicated)

     def column_lineage(
-        self, tables:
-    ) ->
+        self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
+    ) -> list[dict]:
         """
         Wrapper function that retrieves all column lineage
         we only try to retrieve column lineage if we found table lineage
@@ -129,17 +129,17 @@ class DatabricksClient:
             results = executor.map(
                 self.api_client.get_single_column_lineage, candidate_paths
             )
-        lineages:
+        lineages: list[TimestampedLink] = [
            link for links in results for link in links
        ]
        deduplicated = deduplicate_lineage(lineages)
        return self.formatter.format_lineage(deduplicated)

-    def queries(self, time_filter: Optional[TimeFilter] = None) ->
+    def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
         return self.api_client.queries(time_filter)

-    def users(self) ->
+    def users(self) -> list[dict]:
         return self.api_client.users()

-    def view_ddl(self, schemas:
+    def view_ddl(self, schemas: list[dict]) -> list[dict]:
         return self.api_client.view_ddl(schemas)
```
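`column_lineage` fans the candidate column paths out to `get_single_column_lineage` on a thread pool, then flattens the per-path lists with a nested comprehension. A self-contained toy illustration of that fan-out/flatten shape (the fetcher is a stand-in, not the Databricks call):

```python
from concurrent.futures import ThreadPoolExecutor

def fetch(path: str) -> list[str]:
    # stand-in for an API call returning several links per path
    return [f"{path}->a", f"{path}->b"]

paths = ["t1.c1", "t1.c2"]
with ThreadPoolExecutor(max_workers=4) as executor:
    results = executor.map(fetch, paths)  # one list per path, in input order
links = [link for links in results for link in links]  # flattened
print(links)  # ['t1.c1->a', 't1.c1->b', 't1.c2->a', 't1.c2->b']
```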
castor_extractor/warehouse/databricks/extract.py

```diff
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import Optional

 from ...utils import AbstractStorage, LocalStorage, write_summary
 from ..abstract import (
@@ -29,7 +29,7 @@ DATABRICKS_ASSETS: SupportedAssets = {
 logger = logging.getLogger(__name__)

 OTimeFilter = Optional[TimeFilter]
-Paths =
+Paths = dict[str, str]


 class DatabricksExtractionProcessor:
@@ -71,7 +71,7 @@ class DatabricksExtractionProcessor:
         if self._should_not_reextract(WarehouseAssetGroup.CATALOG):
             return self._existing_group_paths(WarehouseAssetGroup.CATALOG)

-        catalog_locations:
+        catalog_locations: dict[str, str] = dict()
         databases = self._client.databases()
         location = self._storage.put(WarehouseAsset.DATABASE.value, databases)
         catalog_locations[WarehouseAsset.DATABASE.value] = location
@@ -101,7 +101,7 @@ class DatabricksExtractionProcessor:
             return self._existing_group_paths(
                 WarehouseAssetGroup.ADDITIONAL_LINEAGE
             )
-        lineage_locations:
+        lineage_locations: dict[str, str] = dict()

         # extract catalog
         databases = self._client.databases()
```
castor_extractor/warehouse/databricks/format.py

```diff
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import
+from typing import Optional

 from .types import TablesColumns
 from .utils import build_path
@@ -12,7 +12,7 @@ EXCLUDED_SCHEMAS = {"information_schema", "default"}

 TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"

-TagMapping =
+TagMapping = dict[str, list[str]]


 def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
@@ -87,7 +87,7 @@ class DatabricksFormatter:
     """

     @staticmethod
-    def format_database(raw_databases:
+    def format_database(raw_databases: list[dict]) -> list[dict]:
         databases = []
         for catalog in raw_databases:
             name = catalog["name"]
@@ -101,7 +101,7 @@ class DatabricksFormatter:
         return databases

     @staticmethod
-    def format_schema(raw_schemas:
+    def format_schema(raw_schemas: list[dict], database: dict) -> list[dict]:
         schemas = []
         for schema in raw_schemas:
             if schema["name"] in EXCLUDED_SCHEMAS:
@@ -118,7 +118,7 @@ class DatabricksFormatter:

     @staticmethod
     def format_table_column(
-        raw_tables:
+        raw_tables: list[dict],
         schema: dict,
         host: str,
         workspace_id: str,
@@ -141,8 +141,8 @@ class DatabricksFormatter:
         return tables, columns

     @staticmethod
-    def format_lineage(timestamps: dict) ->
-        lineage:
+    def format_lineage(timestamps: dict) -> list[dict]:
+        lineage: list[dict] = []
         for link, timestamp in timestamps.items():
             parent_path, child_path = link
             link_ = {
@@ -154,7 +154,7 @@ class DatabricksFormatter:
         return lineage

     @staticmethod
-    def format_query(raw_queries:
+    def format_query(raw_queries: list[dict]) -> list[dict]:
         queries = []
         for q in raw_queries:
             if not q["query_text"]:
@@ -176,7 +176,7 @@ class DatabricksFormatter:
         return queries

     @staticmethod
-    def _primary(emails:
+    def _primary(emails: list[dict]) -> Optional[str]:
         """helper function to select a unique email"""
         if not emails:
             return None
@@ -189,7 +189,7 @@ class DatabricksFormatter:
         emails = user.get("emails")
         return self._primary(emails) if emails else None

-    def format_user(self, raw_users:
+    def format_user(self, raw_users: list[dict]) -> list[dict]:
         users = []
         for user in raw_users:
             users.append(
@@ -204,8 +204,8 @@ class DatabricksFormatter:
         return users

     @staticmethod
-    def format_view_ddl(tables:
-        view_ddl:
+    def format_view_ddl(tables: list[dict], schema: dict) -> list[dict]:
+        view_ddl: list[dict] = []
         if not tables:
             return view_ddl
         for table in tables:
```
castor_extractor/warehouse/databricks/lineage.py

```diff
@@ -1,4 +1,4 @@
-from typing import
+from typing import cast

 from .types import Link, Ostr, OTimestampedLink, TimestampedLink

@@ -9,7 +9,7 @@ class LineageLinks:
     """

     def __init__(self):
-        self.lineage:
+        self.lineage: dict[Link, Ostr] = dict()

     def add(self, timestamped_link: TimestampedLink) -> None:
         """
@@ -52,7 +52,7 @@ def _link(path_from: Ostr, path_to: Ostr, timestamp: Ostr) -> OTimestampedLink:

 def single_table_lineage_links(
     table_path: str, single_table_lineage: dict
-) ->
+) -> list[TimestampedLink]:
     """
     process databricks lineage API response for a given table
     returns a list of (parent, child, timestamp)
@@ -60,7 +60,7 @@ def single_table_lineage_links(
     Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
     we could also have `notebookInfos` or `fileInfo`
     """
-    links:
+    links: list[OTimestampedLink] = []
     # add parent:
     for link in single_table_lineage.get("upstreams", []):
         parent = link.get("tableInfo", {})
@@ -80,7 +80,7 @@ def single_table_lineage_links(

 def single_column_lineage_links(
     column_path: str, single_column_lineage: dict
-) ->
+) -> list[TimestampedLink]:
     """
     process databricks lineage API response for a given table
     returns a list of (parent, child, timestamp)
@@ -88,7 +88,7 @@ def single_column_lineage_links(
     Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
     we could also have `notebookInfos` or `fileInfo`
     """
-    links:
+    links: list[OTimestampedLink] = []
     # add parent:
     for link in single_column_lineage.get("upstream_cols", []):
         parent_path = _to_column_path(link)
@@ -105,8 +105,8 @@ def single_column_lineage_links(


 def paths_for_column_lineage(
-    tables:
-) ->
+    tables: list[dict], columns: list[dict], table_lineage: list[dict]
+) -> list[tuple[str, str]]:
     """
     helper providing a list of candidate columns to look lineage for:
     we only look for column lineage where there is table lineage
@@ -118,12 +118,12 @@ def paths_for_column_lineage(
         for table in tables
     }

-    tables_with_lineage:
+    tables_with_lineage: set[str] = set()
     for t in table_lineage:
         tables_with_lineage.add(t["parent_path"])
         tables_with_lineage.add(t["child_path"])

-    paths_to_return:
+    paths_to_return: list[tuple[str, str]] = []
     for column in columns:
         table_path = mapping[column["table_id"]]
         if table_path not in tables_with_lineage:
@@ -134,7 +134,7 @@ def paths_for_column_lineage(
     return paths_to_return


-def deduplicate_lineage(lineages:
+def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
     deduplicated_lineage = LineageLinks()
     for timestamped_link in lineages:
         deduplicated_lineage.add(timestamped_link)
```
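`deduplicate_lineage` feeds every `(parent, child, timestamp)` triple into `LineageLinks`, whose `lineage` attribute is a `dict[Link, Ostr]`, so each parent/child pair survives at most once; `format_lineage` in format.py then iterates over that dict. The body of `add` is not in the hunk; a minimal sketch consistent with the declared types (the keep-latest policy is my assumption):

```python
from typing import Optional

Link = tuple[str, str]
Ostr = Optional[str]
TimestampedLink = tuple[str, str, Ostr]

class LineageLinks:
    def __init__(self):
        self.lineage: dict[Link, Ostr] = dict()

    def add(self, timestamped_link: TimestampedLink) -> None:
        parent, child, timestamp = timestamped_link
        current = self.lineage.get((parent, child))
        # keep one timestamp per link; preferring the later one is assumed
        if current is None or (timestamp is not None and timestamp > current):
            self.lineage[(parent, child)] = timestamp
```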
castor_extractor/warehouse/databricks/pagination.py

```diff
@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional

 from pydantic import Field

@@ -10,7 +10,7 @@ DATABRICKS_PAGE_SIZE = 100
 class DatabricksPagination(PaginationModel):
     next_page_token: Optional[str] = None
     has_next_page: bool = False
-    res:
+    res: list[dict] = Field(default_factory=list)

     def is_last(self) -> bool:
         return not (self.has_next_page and self.next_page_token)
```
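`Field(default_factory=list)` is the standard pydantic idiom for a mutable default: each instance gets its own fresh list rather than sharing one object. A self-contained sketch using plain `BaseModel` (the package's `PaginationModel` base is not shown in the diff):

```python
from pydantic import BaseModel, Field

class Page(BaseModel):
    has_next_page: bool = False
    res: list[dict] = Field(default_factory=list)

a, b = Page(), Page()
a.res.append({"id": 1})
assert b.res == []  # each instance got its own fresh list
```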
castor_extractor/warehouse/databricks/types.py

```diff
@@ -1,8 +1,8 @@
-from typing import
+from typing import Optional

-Link =
-TablesColumns =
+Link = tuple[str, str]
+TablesColumns = tuple[list[dict], list[dict]]
 Ostr = Optional[str]
-TimestampedLink =
+TimestampedLink = tuple[str, str, Ostr]

 OTimestampedLink = Optional[TimestampedLink]
```
castor_extractor/warehouse/databricks/utils.py

```diff
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from datetime import date
-from typing import
+from typing import Optional

 from ...utils import at_midnight
 from ..abstract import TimeFilter
@@ -14,8 +15,8 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:


 def build_path(
-    row:
-    keys:
+    row: dict,
+    keys: list[str],
 ) -> str:
     """
     format an asset's path:
@@ -26,7 +27,7 @@ def build_path(
     return ".".join(key_values)


-def tag_label(row:
+def tag_label(row: dict) -> str:
     """
     format the tag's label:
     - {key:value} when the value is not empty
```
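`build_path` dots together the values of the given keys, and `tag_label` renders a tag as `{key:value}` or just the key when the value is empty. A sketch of `build_path` consistent with the hunk's `return ".".join(key_values)`; the extraction of `key_values` and the row shape are my assumptions:

```python
def build_path(row: dict, keys: list[str]) -> str:
    # join the selected key values with dots (assumed body)
    key_values = [str(row[key]) for key in keys]
    return ".".join(key_values)

row = {"database_name": "main", "schema_name": "sales", "table_name": "orders"}
assert build_path(row, ["database_name", "schema_name", "table_name"]) == "main.sales.orders"
```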
castor_extractor/warehouse/mysql/query.py

```diff
@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional

 from ..abstract import (
     AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class MySQLQueryBuilder(AbstractQueryBuilder):
     ):
         super().__init__(time_filter=time_filter)

-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
         return [query]
```
castor_extractor/warehouse/postgres/query.py

```diff
@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional

 from ..abstract import (
     AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class PostgresQueryBuilder(AbstractQueryBuilder):
     ):
         super().__init__(time_filter=time_filter)

-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
         return [query]
```
castor_extractor/warehouse/redshift/query.py

```diff
@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional

 from ..abstract import (
     AbstractQueryBuilder,
@@ -27,7 +27,7 @@ class RedshiftQueryBuilder(AbstractQueryBuilder):
         params = self._time_filter.to_dict()
         return ExtractionQuery(statement, params)

-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         if asset == WarehouseAsset.QUERY and self.is_serverless:
             query = self.build_query_serverless()
         else:
```
|