castor-extractor 0.19.0__py3-none-any.whl → 0.19.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +29 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +1 -1
- castor_extractor/knowledge/notion/client/client.py +44 -80
- castor_extractor/knowledge/notion/client/client_test.py +9 -4
- castor_extractor/knowledge/notion/client/constants.py +1 -0
- castor_extractor/knowledge/notion/client/endpoints.py +1 -1
- castor_extractor/knowledge/notion/client/pagination.py +9 -5
- castor_extractor/quality/soda/assets.py +1 -1
- castor_extractor/quality/soda/client/client.py +30 -83
- castor_extractor/quality/soda/client/credentials.py +0 -11
- castor_extractor/quality/soda/client/endpoints.py +3 -6
- castor_extractor/quality/soda/client/pagination.py +25 -0
- castor_extractor/utils/__init__.py +13 -2
- castor_extractor/utils/client/__init__.py +14 -0
- castor_extractor/utils/client/api/__init__.py +5 -0
- castor_extractor/utils/client/api/auth.py +76 -0
- castor_extractor/utils/client/api/auth_test.py +49 -0
- castor_extractor/utils/client/api/client.py +153 -0
- castor_extractor/utils/client/api/client_test.py +47 -0
- castor_extractor/utils/client/api/pagination.py +83 -0
- castor_extractor/utils/client/api/pagination_test.py +51 -0
- castor_extractor/utils/{safe_request_test.py → client/api/safe_request_test.py} +4 -1
- castor_extractor/utils/client/api/utils.py +9 -0
- castor_extractor/utils/client/api/utils_test.py +16 -0
- castor_extractor/utils/collection.py +34 -2
- castor_extractor/utils/collection_test.py +17 -3
- castor_extractor/utils/pager/__init__.py +0 -1
- castor_extractor/utils/retry.py +44 -0
- castor_extractor/utils/retry_test.py +26 -1
- castor_extractor/utils/salesforce/client.py +44 -49
- castor_extractor/utils/salesforce/client_test.py +2 -2
- castor_extractor/utils/salesforce/pagination.py +33 -0
- castor_extractor/visualization/domo/client/client.py +10 -5
- castor_extractor/visualization/domo/client/credentials.py +1 -1
- castor_extractor/visualization/domo/client/endpoints.py +19 -7
- castor_extractor/visualization/looker/api/credentials.py +1 -1
- castor_extractor/visualization/metabase/client/api/client.py +26 -11
- castor_extractor/visualization/metabase/client/api/credentials.py +1 -1
- castor_extractor/visualization/metabase/client/db/credentials.py +1 -1
- castor_extractor/visualization/mode/client/credentials.py +1 -1
- castor_extractor/visualization/qlik/client/engine/credentials.py +1 -1
- castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
- castor_extractor/visualization/sigma/client/client.py +106 -111
- castor_extractor/visualization/sigma/client/credentials.py +11 -1
- castor_extractor/visualization/sigma/client/endpoints.py +1 -1
- castor_extractor/visualization/sigma/client/pagination.py +22 -18
- castor_extractor/visualization/tableau/tests/unit/rest_api/auth_test.py +0 -1
- castor_extractor/visualization/tableau/tests/unit/rest_api/credentials_test.py +0 -3
- castor_extractor/visualization/tableau_revamp/assets.py +11 -0
- castor_extractor/visualization/tableau_revamp/client/client.py +71 -151
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +95 -0
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +128 -0
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +66 -0
- castor_extractor/visualization/tableau_revamp/client/{tsc_fields.py → rest_fields.py} +15 -2
- castor_extractor/visualization/tableau_revamp/constants.py +0 -2
- castor_extractor/visualization/tableau_revamp/extract.py +5 -11
- castor_extractor/warehouse/databricks/api_client.py +239 -0
- castor_extractor/warehouse/databricks/api_client_test.py +15 -0
- castor_extractor/warehouse/databricks/client.py +37 -490
- castor_extractor/warehouse/databricks/client_test.py +1 -99
- castor_extractor/warehouse/databricks/endpoints.py +28 -0
- castor_extractor/warehouse/databricks/lineage.py +141 -0
- castor_extractor/warehouse/databricks/lineage_test.py +34 -0
- castor_extractor/warehouse/databricks/pagination.py +22 -0
- castor_extractor/warehouse/databricks/sql_client.py +90 -0
- castor_extractor/warehouse/databricks/utils.py +44 -1
- castor_extractor/warehouse/databricks/utils_test.py +58 -1
- castor_extractor/warehouse/mysql/client.py +0 -2
- castor_extractor/warehouse/salesforce/client.py +12 -59
- castor_extractor/warehouse/salesforce/pagination.py +34 -0
- castor_extractor/warehouse/sqlserver/client.py +0 -1
- castor_extractor-0.19.6.dist-info/METADATA +903 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/RECORD +77 -60
- castor_extractor/utils/client/api.py +0 -87
- castor_extractor/utils/client/api_test.py +0 -24
- castor_extractor/utils/pager/pager_on_token.py +0 -52
- castor_extractor/utils/pager/pager_on_token_test.py +0 -73
- castor_extractor/visualization/sigma/client/client_test.py +0 -54
- castor_extractor-0.19.0.dist-info/METADATA +0 -207
- /castor_extractor/utils/{safe_request.py → client/api/safe_request.py} +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/LICENCE +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/WHEEL +0 -0
- {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/entry_points.txt +0 -0
|
@@ -1,93 +1,24 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from collections import defaultdict
|
|
3
2
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
|
-
from datetime import date
|
|
5
|
-
from enum import Enum
|
|
6
|
-
from functools import partial
|
|
7
|
-
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, cast
|
|
8
|
-
|
|
9
|
-
import requests
|
|
10
|
-
from databricks import sql # type: ignore
|
|
11
|
-
from requests import Response
|
|
3
|
+
from typing import List, Optional, Set
|
|
12
4
|
|
|
13
5
|
from ...utils import (
|
|
14
|
-
SafeMode,
|
|
15
|
-
at_midnight,
|
|
16
|
-
date_after,
|
|
17
6
|
mapping_from_rows,
|
|
18
|
-
retry,
|
|
19
|
-
safe_mode,
|
|
20
7
|
)
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from ..abstract.time_filter import TimeFilter
|
|
8
|
+
from ..abstract import TimeFilter
|
|
9
|
+
from .api_client import DatabricksAPIClient
|
|
24
10
|
from .credentials import DatabricksCredentials
|
|
25
|
-
from .format import DatabricksFormatter
|
|
26
|
-
from .
|
|
27
|
-
from .
|
|
11
|
+
from .format import DatabricksFormatter
|
|
12
|
+
from .lineage import deduplicate_lineage, paths_for_column_lineage
|
|
13
|
+
from .sql_client import DatabricksSQLClient, TagEntity
|
|
14
|
+
from .types import TablesColumns, TimestampedLink
|
|
28
15
|
|
|
29
16
|
logger = logging.getLogger(__name__)
|
|
30
17
|
|
|
31
|
-
_DATABRICKS_CLIENT_TIMEOUT = 90
|
|
32
|
-
_DEFAULT_HOUR_MIN = 0
|
|
33
|
-
_DEFAULT_HOUR_MAX = 23
|
|
34
|
-
_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
|
|
35
|
-
_MAX_NUMBER_OF_QUERY_ERRORS = 1000
|
|
36
18
|
_MAX_THREADS = 10
|
|
37
|
-
_NUM_HOURS_IN_A_DAY = 24
|
|
38
|
-
_RETRY_ATTEMPTS = 3
|
|
39
|
-
_RETRY_BASE_MS = 1000
|
|
40
|
-
_RETRY_EXCEPTIONS = [
|
|
41
|
-
requests.exceptions.ConnectTimeout,
|
|
42
|
-
]
|
|
43
|
-
_WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
|
|
44
|
-
|
|
45
|
-
_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
|
|
46
|
-
|
|
47
|
-
safe_lineage_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
|
|
48
|
-
safe_query_params = SafeMode((BaseException,), _MAX_NUMBER_OF_QUERY_ERRORS)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class TagEntity(Enum):
|
|
52
|
-
"""Entities that can be tagged in Databricks"""
|
|
53
|
-
|
|
54
|
-
COLUMN = "COLUMN"
|
|
55
|
-
TABLE = "TABLE"
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def _day_to_epoch_ms(day: date) -> int:
|
|
59
|
-
return int(at_midnight(day).timestamp() * 1000)
|
|
60
19
|
|
|
61
20
|
|
|
62
|
-
def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
|
|
63
|
-
return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
class LineageLinks:
|
|
67
|
-
"""
|
|
68
|
-
helper class that handles lineage deduplication and filtering
|
|
69
|
-
"""
|
|
70
|
-
|
|
71
|
-
def __init__(self):
|
|
72
|
-
self.lineage: Dict[Link, Ostr] = dict()
|
|
73
|
-
|
|
74
|
-
def add(self, timestamped_link: TimestampedLink) -> None:
|
|
75
|
-
"""
|
|
76
|
-
keep the most recent lineage link, adding to `self.lineage`
|
|
77
|
-
"""
|
|
78
|
-
parent, child, timestamp = timestamped_link
|
|
79
|
-
link = (parent, child)
|
|
80
|
-
if not self.lineage.get(link):
|
|
81
|
-
self.lineage[link] = timestamp
|
|
82
|
-
else:
|
|
83
|
-
if not timestamp:
|
|
84
|
-
return
|
|
85
|
-
# keep most recent link; cast for mypy
|
|
86
|
-
recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
|
|
87
|
-
self.lineage[link] = recent
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
class DatabricksClient(APIClient):
|
|
21
|
+
class DatabricksClient:
|
|
91
22
|
"""Databricks Client"""
|
|
92
23
|
|
|
93
24
|
def __init__(
|
|
@@ -98,111 +29,23 @@ class DatabricksClient(APIClient):
|
|
|
98
29
|
has_table_tags: bool = False,
|
|
99
30
|
has_column_tags: bool = False,
|
|
100
31
|
):
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
self.
|
|
32
|
+
self.api_client = DatabricksAPIClient(
|
|
33
|
+
credentials=credentials,
|
|
34
|
+
db_allowed=db_allowed,
|
|
35
|
+
db_blocked=db_blocked,
|
|
36
|
+
)
|
|
37
|
+
self.sql_client = DatabricksSQLClient(
|
|
38
|
+
credentials=credentials,
|
|
39
|
+
has_table_tags=has_table_tags,
|
|
40
|
+
has_column_tags=has_column_tags,
|
|
41
|
+
)
|
|
107
42
|
|
|
108
|
-
self._timeout = _DATABRICKS_CLIENT_TIMEOUT
|
|
109
43
|
self.formatter = DatabricksFormatter()
|
|
110
44
|
|
|
111
|
-
def execute_sql(
|
|
112
|
-
self,
|
|
113
|
-
query: str,
|
|
114
|
-
params: Optional[dict] = None,
|
|
115
|
-
):
|
|
116
|
-
"""
|
|
117
|
-
Execute a SQL query on Databricks system tables and return the results.
|
|
118
|
-
https://docs.databricks.com/en/dev-tools/python-sql-connector.html
|
|
119
|
-
|
|
120
|
-
//!\\ credentials.http_path is required in order to run SQL queries
|
|
121
|
-
"""
|
|
122
|
-
assert self._http_path, "HTTP_PATH is required to run SQL queries"
|
|
123
|
-
with sql.connect(
|
|
124
|
-
server_hostname=self._host,
|
|
125
|
-
http_path=self._http_path,
|
|
126
|
-
access_token=self._token,
|
|
127
|
-
) as connection:
|
|
128
|
-
with connection.cursor() as cursor:
|
|
129
|
-
cursor.execute(query, params)
|
|
130
|
-
return cursor.fetchall()
|
|
131
|
-
|
|
132
45
|
@staticmethod
|
|
133
46
|
def name() -> str:
|
|
134
47
|
return "Databricks"
|
|
135
48
|
|
|
136
|
-
def _keep_catalog(self, catalog: str) -> bool:
|
|
137
|
-
"""
|
|
138
|
-
Helper function to determine if we should keep the Databricks catalog
|
|
139
|
-
which is a CastorDoc database
|
|
140
|
-
"""
|
|
141
|
-
if self._db_allowed and catalog not in self._db_allowed:
|
|
142
|
-
return False
|
|
143
|
-
if self._db_blocked and catalog in self._db_blocked:
|
|
144
|
-
return False
|
|
145
|
-
return True
|
|
146
|
-
|
|
147
|
-
def databases(self) -> List[dict]:
|
|
148
|
-
path = "api/2.1/unity-catalog/catalogs"
|
|
149
|
-
content = self.get(path=path)
|
|
150
|
-
_databases = self.formatter.format_database(content.get("catalogs", []))
|
|
151
|
-
return [d for d in _databases if self._keep_catalog(d["database_name"])]
|
|
152
|
-
|
|
153
|
-
def _schemas_of_database(self, database: dict) -> List[dict]:
|
|
154
|
-
path = "api/2.1/unity-catalog/schemas"
|
|
155
|
-
payload = {"catalog_name": database["database_name"]}
|
|
156
|
-
content = self.get(path=path, payload=payload)
|
|
157
|
-
schemas = content.get("schemas", [])
|
|
158
|
-
return self.formatter.format_schema(schemas, database)
|
|
159
|
-
|
|
160
|
-
def schemas(self, databases: List[dict]) -> List[dict]:
|
|
161
|
-
"""
|
|
162
|
-
Get the databricks schemas (also sometimes called databases)
|
|
163
|
-
(which correspond to the schemas in Castor)
|
|
164
|
-
leveraging the unity catalog API
|
|
165
|
-
"""
|
|
166
|
-
return [
|
|
167
|
-
schema
|
|
168
|
-
for database in databases
|
|
169
|
-
for schema in self._schemas_of_database(database)
|
|
170
|
-
]
|
|
171
|
-
|
|
172
|
-
@staticmethod
|
|
173
|
-
def _process_table_response(response: Response) -> Tuple[dict, str]:
|
|
174
|
-
"""
|
|
175
|
-
Returns both the JSON content and the Workspace ID, which is found
|
|
176
|
-
in the response's headers.
|
|
177
|
-
"""
|
|
178
|
-
return response.json(), response.headers[_WORKSPACE_ID_HEADER]
|
|
179
|
-
|
|
180
|
-
def _tables_columns_of_schema(
|
|
181
|
-
self,
|
|
182
|
-
schema: dict,
|
|
183
|
-
table_tags: TagMapping,
|
|
184
|
-
column_tags: TagMapping,
|
|
185
|
-
) -> TablesColumns:
|
|
186
|
-
path = "api/2.1/unity-catalog/tables"
|
|
187
|
-
payload = {
|
|
188
|
-
"catalog_name": schema["database_id"],
|
|
189
|
-
"schema_name": schema["schema_name"],
|
|
190
|
-
}
|
|
191
|
-
content, workspace_id = self.get(
|
|
192
|
-
path=path,
|
|
193
|
-
payload=payload,
|
|
194
|
-
processor=self._process_table_response,
|
|
195
|
-
)
|
|
196
|
-
host = self.build_url(self._host, path="")
|
|
197
|
-
return self.formatter.format_table_column(
|
|
198
|
-
raw_tables=content.get("tables", []),
|
|
199
|
-
schema=schema,
|
|
200
|
-
host=host,
|
|
201
|
-
workspace_id=workspace_id,
|
|
202
|
-
table_tags=table_tags,
|
|
203
|
-
column_tags=column_tags,
|
|
204
|
-
)
|
|
205
|
-
|
|
206
49
|
@staticmethod
|
|
207
50
|
def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
|
|
208
51
|
table_owner_email = table.get("owner_email")
|
|
@@ -213,40 +56,6 @@ class DatabricksClient(APIClient):
|
|
|
213
56
|
return table
|
|
214
57
|
return {**table, "owner_external_id": owner_external_id}
|
|
215
58
|
|
|
216
|
-
def _needs_extraction(self, entity: TagEntity) -> bool:
|
|
217
|
-
if entity == TagEntity.TABLE:
|
|
218
|
-
return self._has_table_tags
|
|
219
|
-
if entity == TagEntity.COLUMN:
|
|
220
|
-
return self._has_column_tags
|
|
221
|
-
raise AssertionError(f"Entity not supported: {entity}")
|
|
222
|
-
|
|
223
|
-
def _get_tags_mapping(self, entity: TagEntity) -> TagMapping:
|
|
224
|
-
"""
|
|
225
|
-
Fetch tags of the given entity and build a mapping:
|
|
226
|
-
{ path: list[tags] }
|
|
227
|
-
|
|
228
|
-
https://docs.databricks.com/en/sql/language-manual/information-schema/table_tags.html
|
|
229
|
-
https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
|
|
230
|
-
"""
|
|
231
|
-
if not self._needs_extraction(entity):
|
|
232
|
-
# extracting tags require additional credentials (http_path)
|
|
233
|
-
return dict()
|
|
234
|
-
|
|
235
|
-
table = f"{entity.value.lower()}_tags"
|
|
236
|
-
query = f"{_INFORMATION_SCHEMA_SQL}.{table}"
|
|
237
|
-
result = self.execute_sql(query)
|
|
238
|
-
mapping = defaultdict(list)
|
|
239
|
-
for row in result:
|
|
240
|
-
dict_row = row.asDict()
|
|
241
|
-
keys = ["catalog_name", "schema_name", "table_name"]
|
|
242
|
-
if entity == TagEntity.COLUMN:
|
|
243
|
-
keys.append("column_name")
|
|
244
|
-
path = build_path(dict_row, keys)
|
|
245
|
-
label = tag_label(dict_row)
|
|
246
|
-
mapping[path].append(label)
|
|
247
|
-
|
|
248
|
-
return mapping
|
|
249
|
-
|
|
250
59
|
@staticmethod
|
|
251
60
|
def _get_user_mapping(users: List[dict]) -> dict:
|
|
252
61
|
return {
|
|
@@ -254,6 +63,12 @@ class DatabricksClient(APIClient):
|
|
|
254
63
|
**mapping_from_rows(users, "user_name", "id"),
|
|
255
64
|
}
|
|
256
65
|
|
|
66
|
+
def schemas(self, databases: List[dict]) -> List[dict]:
|
|
67
|
+
return self.api_client.schemas(databases)
|
|
68
|
+
|
|
69
|
+
def databases(self) -> List[dict]:
|
|
70
|
+
return self.api_client.databases()
|
|
71
|
+
|
|
257
72
|
def tables_and_columns(
|
|
258
73
|
self, schemas: List[dict], users: List[dict]
|
|
259
74
|
) -> TablesColumns:
|
|
@@ -263,10 +78,10 @@ class DatabricksClient(APIClient):
|
|
|
263
78
|
tables: List[dict] = []
|
|
264
79
|
columns: List[dict] = []
|
|
265
80
|
user_mapping = self._get_user_mapping(users)
|
|
266
|
-
table_tags = self._get_tags_mapping(TagEntity.TABLE)
|
|
267
|
-
column_tags = self._get_tags_mapping(TagEntity.COLUMN)
|
|
81
|
+
table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
|
|
82
|
+
column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
|
|
268
83
|
for schema in schemas:
|
|
269
|
-
t_to_add, c_to_add = self._tables_columns_of_schema(
|
|
84
|
+
t_to_add, c_to_add = self.api_client.tables_columns_of_schema(
|
|
270
85
|
schema=schema,
|
|
271
86
|
table_tags=table_tags,
|
|
272
87
|
column_tags=column_tags,
|
|
@@ -279,82 +94,6 @@ class DatabricksClient(APIClient):
|
|
|
279
94
|
columns.extend(c_to_add)
|
|
280
95
|
return tables, columns
|
|
281
96
|
|
|
282
|
-
@staticmethod
|
|
283
|
-
def _to_table_path(table: dict) -> Ostr:
|
|
284
|
-
if table.get("name"):
|
|
285
|
-
return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
|
|
286
|
-
return None
|
|
287
|
-
|
|
288
|
-
@staticmethod
|
|
289
|
-
def _to_column_path(column: dict) -> Ostr:
|
|
290
|
-
if column.get("name"):
|
|
291
|
-
return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
|
|
292
|
-
return None
|
|
293
|
-
|
|
294
|
-
def _link(
|
|
295
|
-
self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
|
|
296
|
-
) -> OTimestampedLink:
|
|
297
|
-
"""exclude missing path and self-lineage"""
|
|
298
|
-
if (not path_from) or (not path_to):
|
|
299
|
-
return None
|
|
300
|
-
is_self_lineage = path_from.lower() == path_to.lower()
|
|
301
|
-
if is_self_lineage:
|
|
302
|
-
return None
|
|
303
|
-
return (path_from, path_to, timestamp)
|
|
304
|
-
|
|
305
|
-
def _single_table_lineage_links(
|
|
306
|
-
self, table_path: str, single_table_lineage: dict
|
|
307
|
-
) -> List[TimestampedLink]:
|
|
308
|
-
"""
|
|
309
|
-
process databricks lineage API response for a given table
|
|
310
|
-
returns a list of (parent, child, timestamp)
|
|
311
|
-
|
|
312
|
-
Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
|
|
313
|
-
we could also have `notebookInfos` or `fileInfo`
|
|
314
|
-
"""
|
|
315
|
-
links: List[OTimestampedLink] = []
|
|
316
|
-
# add parent:
|
|
317
|
-
for link in single_table_lineage.get("upstreams", []):
|
|
318
|
-
parent = link.get("tableInfo", {})
|
|
319
|
-
parent_path = self._to_table_path(parent)
|
|
320
|
-
timestamp: Ostr = parent.get("lineage_timestamp")
|
|
321
|
-
links.append(self._link(parent_path, table_path, timestamp))
|
|
322
|
-
|
|
323
|
-
# add children:
|
|
324
|
-
for link in single_table_lineage.get("downstreams", []):
|
|
325
|
-
child = link.get("tableInfo", {})
|
|
326
|
-
child_path = self._to_table_path(child)
|
|
327
|
-
timestamp = child.get("lineage_timestamp")
|
|
328
|
-
links.append(self._link(table_path, child_path, timestamp))
|
|
329
|
-
|
|
330
|
-
return list(filter(None, links))
|
|
331
|
-
|
|
332
|
-
@safe_mode(safe_lineage_params, lambda: [])
|
|
333
|
-
@retry(
|
|
334
|
-
exceptions=_RETRY_EXCEPTIONS,
|
|
335
|
-
max_retries=_RETRY_ATTEMPTS,
|
|
336
|
-
base_ms=_RETRY_BASE_MS,
|
|
337
|
-
)
|
|
338
|
-
def get_single_table_lineage(
|
|
339
|
-
self, table_path: str
|
|
340
|
-
) -> List[TimestampedLink]:
|
|
341
|
-
"""
|
|
342
|
-
Helper function used in get_lineage_links.
|
|
343
|
-
Call data lineage API and return the content of the result
|
|
344
|
-
eg table_path: broward_prd.bronze.account_adjustments
|
|
345
|
-
FYI: Maximum rate of 50 requests per SECOND
|
|
346
|
-
"""
|
|
347
|
-
path = "api/2.0/lineage-tracking/table-lineage"
|
|
348
|
-
payload = {"table_name": table_path, "include_entity_lineage": True}
|
|
349
|
-
content = self.get(path=path, payload=payload)
|
|
350
|
-
return self._single_table_lineage_links(table_path, content)
|
|
351
|
-
|
|
352
|
-
def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
|
|
353
|
-
deduplicated_lineage = LineageLinks()
|
|
354
|
-
for timestamped_link in lineages:
|
|
355
|
-
deduplicated_lineage.add(timestamped_link)
|
|
356
|
-
return deduplicated_lineage.lineage
|
|
357
|
-
|
|
358
97
|
def table_lineage(self, tables: List[dict]) -> List[dict]:
|
|
359
98
|
"""
|
|
360
99
|
Wrapper function that retrieves all table lineage
|
|
@@ -365,94 +104,13 @@ class DatabricksClient(APIClient):
|
|
|
365
104
|
".".join([table["schema_id"], table["table_name"]])
|
|
366
105
|
for table in tables
|
|
367
106
|
]
|
|
368
|
-
results = executor.map(self.get_single_table_lineage, table_paths)
|
|
107
|
+
results = executor.map(
|
|
108
|
+
self.api_client.get_single_table_lineage, table_paths
|
|
109
|
+
)
|
|
369
110
|
lineages = [link for links in results for link in links]
|
|
370
|
-
deduplicated = self._deduplicate_lineage(lineages)
|
|
111
|
+
deduplicated = deduplicate_lineage(lineages)
|
|
371
112
|
return self.formatter.format_lineage(deduplicated)
|
|
372
113
|
|
|
373
|
-
@staticmethod
|
|
374
|
-
def _paths_for_column_lineage(
|
|
375
|
-
tables: List[dict], columns: List[dict], table_lineage: List[dict]
|
|
376
|
-
) -> List[Tuple[str, str]]:
|
|
377
|
-
"""
|
|
378
|
-
helper providing a list of candidate columns to look lineage for:
|
|
379
|
-
we only look for column lineage where there is table lineage
|
|
380
|
-
"""
|
|
381
|
-
# mapping between table id and its path db.schema.table
|
|
382
|
-
# table["schema_id"] follows the pattern `db.schema`
|
|
383
|
-
mapping = {
|
|
384
|
-
table["id"]: ".".join([table["schema_id"], table["table_name"]])
|
|
385
|
-
for table in tables
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
tables_with_lineage: Set[str] = set()
|
|
389
|
-
for t in table_lineage:
|
|
390
|
-
tables_with_lineage.add(t["parent_path"])
|
|
391
|
-
tables_with_lineage.add(t["child_path"])
|
|
392
|
-
|
|
393
|
-
paths_to_return: List[Tuple[str, str]] = []
|
|
394
|
-
for column in columns:
|
|
395
|
-
table_path = mapping[column["table_id"]]
|
|
396
|
-
if table_path not in tables_with_lineage:
|
|
397
|
-
continue
|
|
398
|
-
column_ = (table_path, column["column_name"])
|
|
399
|
-
paths_to_return.append(column_)
|
|
400
|
-
|
|
401
|
-
return paths_to_return
|
|
402
|
-
|
|
403
|
-
def _single_column_lineage_links(
|
|
404
|
-
self, column_path: str, single_column_lineage: dict
|
|
405
|
-
) -> List[TimestampedLink]:
|
|
406
|
-
"""
|
|
407
|
-
process databricks lineage API response for a given table
|
|
408
|
-
returns a list of (parent, child, timestamp)
|
|
409
|
-
|
|
410
|
-
Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
|
|
411
|
-
we could also have `notebookInfos` or `fileInfo`
|
|
412
|
-
"""
|
|
413
|
-
links: List[OTimestampedLink] = []
|
|
414
|
-
# add parent:
|
|
415
|
-
for link in single_column_lineage.get("upstream_cols", []):
|
|
416
|
-
parent_path = self._to_column_path(link)
|
|
417
|
-
timestamp: Ostr = link.get("lineage_timestamp")
|
|
418
|
-
links.append(self._link(parent_path, column_path, timestamp))
|
|
419
|
-
|
|
420
|
-
# add children:
|
|
421
|
-
for link in single_column_lineage.get("downstream_cols", []):
|
|
422
|
-
child_path = self._to_column_path(link)
|
|
423
|
-
timestamp = link.get("lineage_timestamp")
|
|
424
|
-
links.append(self._link(column_path, child_path, timestamp))
|
|
425
|
-
|
|
426
|
-
return list(filter(None, links))
|
|
427
|
-
|
|
428
|
-
@safe_mode(safe_lineage_params, lambda: [])
|
|
429
|
-
@retry(
|
|
430
|
-
exceptions=_RETRY_EXCEPTIONS,
|
|
431
|
-
max_retries=_RETRY_ATTEMPTS,
|
|
432
|
-
base_ms=_RETRY_BASE_MS,
|
|
433
|
-
)
|
|
434
|
-
def get_single_column_lineage(
|
|
435
|
-
self,
|
|
436
|
-
names: Tuple[str, str],
|
|
437
|
-
) -> List[TimestampedLink]:
|
|
438
|
-
"""
|
|
439
|
-
Helper function used in get_lineage_links.
|
|
440
|
-
Call data lineage API and return the content of the result
|
|
441
|
-
|
|
442
|
-
eg table_path: broward_prd.bronze.account_adjustments
|
|
443
|
-
FYI: Maximum rate of 10 requests per SECOND
|
|
444
|
-
"""
|
|
445
|
-
table_path, column_name = names
|
|
446
|
-
api_path = "api/2.0/lineage-tracking/column-lineage"
|
|
447
|
-
payload = {
|
|
448
|
-
"table_name": table_path,
|
|
449
|
-
"column_name": column_name,
|
|
450
|
-
"include_entity_lineage": True,
|
|
451
|
-
}
|
|
452
|
-
content = self.get(path=api_path, payload=payload)
|
|
453
|
-
column_path = f"{table_path}.{column_name}"
|
|
454
|
-
return self._single_column_lineage_links(column_path, content)
|
|
455
|
-
|
|
456
114
|
def column_lineage(
|
|
457
115
|
self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
|
|
458
116
|
) -> List[dict]:
|
|
@@ -460,133 +118,22 @@ class DatabricksClient(APIClient):
|
|
|
460
118
|
Wrapper function that retrieves all column lineage
|
|
461
119
|
we only try to retrieve column lineage if we found table lineage
|
|
462
120
|
"""
|
|
463
|
-
candidate_paths = self._paths_for_column_lineage(
|
|
121
|
+
candidate_paths = paths_for_column_lineage(
|
|
464
122
|
tables, columns, table_lineage
|
|
465
123
|
)
|
|
466
124
|
lineages: List[TimestampedLink] = [
|
|
467
125
|
link
|
|
468
126
|
for paths in candidate_paths
|
|
469
|
-
for link in self.get_single_column_lineage(paths)
|
|
127
|
+
for link in self.api_client.get_single_column_lineage(paths)
|
|
470
128
|
]
|
|
471
|
-
deduplicated = self._deduplicate_lineage(lineages)
|
|
129
|
+
deduplicated = deduplicate_lineage(lineages)
|
|
472
130
|
return self.formatter.format_lineage(deduplicated)
|
|
473
131
|
|
|
474
|
-
@staticmethod
|
|
475
|
-
def _time_filter_payload(start_time_ms: int, end_time_ms: int) -> dict:
|
|
476
|
-
return {
|
|
477
|
-
"filter_by": {
|
|
478
|
-
"query_start_time_range": {
|
|
479
|
-
"end_time_ms": end_time_ms,
|
|
480
|
-
"start_time_ms": start_time_ms,
|
|
481
|
-
}
|
|
482
|
-
}
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
def _hourly_time_filters(
|
|
486
|
-
self, time_filter: Optional[TimeFilter]
|
|
487
|
-
) -> Iterable[dict]:
|
|
488
|
-
"""time filters to retrieve Databricks' queries: 1h duration each"""
|
|
489
|
-
# define an explicit time window
|
|
490
|
-
if not time_filter:
|
|
491
|
-
time_filter = TimeFilter.default()
|
|
492
|
-
|
|
493
|
-
assert time_filter # for mypy
|
|
494
|
-
|
|
495
|
-
hour_min = time_filter.hour_min
|
|
496
|
-
hour_max = time_filter.hour_max
|
|
497
|
-
day = time_filter.day
|
|
498
|
-
if hour_min is None or hour_max is None: # fallback to an entire day
|
|
499
|
-
hour_min, hour_max = _DEFAULT_HOUR_MIN, _DEFAULT_HOUR_MAX
|
|
500
|
-
|
|
501
|
-
for index in range(hour_min, min(hour_max + 1, _NUM_HOURS_IN_A_DAY)):
|
|
502
|
-
start_time_ms = _day_hour_to_epoch_ms(day, index)
|
|
503
|
-
end_time_ms = _day_hour_to_epoch_ms(day, index + 1)
|
|
504
|
-
yield self._time_filter_payload(start_time_ms, end_time_ms)
|
|
505
|
-
|
|
506
|
-
def query_payload(
|
|
507
|
-
self,
|
|
508
|
-
page_token: Optional[str] = None,
|
|
509
|
-
max_results: Optional[int] = None,
|
|
510
|
-
time_range_filter: Optional[dict] = None,
|
|
511
|
-
) -> dict:
|
|
512
|
-
"""helper method to build the payload used to retrieve queries"""
|
|
513
|
-
# in payload: You can provide only one of 'page_token' or 'filter_by'
|
|
514
|
-
if page_token:
|
|
515
|
-
payload: Dict[str, Any] = {"page_token": page_token}
|
|
516
|
-
else:
|
|
517
|
-
if not time_range_filter:
|
|
518
|
-
# should never happen.
|
|
519
|
-
# `time_range_filter` optional to leverage functiontools.partial
|
|
520
|
-
raise ValueError("Time range not specified")
|
|
521
|
-
payload = {**time_range_filter}
|
|
522
|
-
if max_results:
|
|
523
|
-
payload["max_results"] = max_results
|
|
524
|
-
return payload
|
|
525
|
-
|
|
526
|
-
def _scroll_queries(
|
|
527
|
-
self,
|
|
528
|
-
page_token: Optional[str] = None,
|
|
529
|
-
max_results: Optional[int] = None,
|
|
530
|
-
time_range_filter: Optional[dict] = None,
|
|
531
|
-
) -> dict:
|
|
532
|
-
"""
|
|
533
|
-
Callback to scroll the queries api
|
|
534
|
-
https://docs.databricks.com/api/workspace/queryhistory/list
|
|
535
|
-
max_results: Limit the number of results returned in one page.
|
|
536
|
-
The default is 100. (both on our side and Databricks')
|
|
537
|
-
"""
|
|
538
|
-
path = "api/2.0/sql/history/queries"
|
|
539
|
-
payload = self.query_payload(page_token, max_results, time_range_filter)
|
|
540
|
-
content = self.get(path=path, payload=payload)
|
|
541
|
-
return content if content else {}
|
|
542
|
-
|
|
543
|
-
@safe_mode(safe_query_params, lambda: [])
|
|
544
|
-
@retry(
|
|
545
|
-
exceptions=_RETRY_EXCEPTIONS,
|
|
546
|
-
max_retries=_RETRY_ATTEMPTS,
|
|
547
|
-
base_ms=_RETRY_BASE_MS,
|
|
548
|
-
)
|
|
549
|
-
def _queries(self, filter_: dict) -> List[dict]:
|
|
550
|
-
"""helper to retrieve queries using a given time filter"""
|
|
551
|
-
_time_filtered_scroll_queries = partial(
|
|
552
|
-
self._scroll_queries,
|
|
553
|
-
time_range_filter=filter_,
|
|
554
|
-
)
|
|
555
|
-
# retrieve all queries using pagination
|
|
556
|
-
return PagerOnToken(_time_filtered_scroll_queries).all()
|
|
557
|
-
|
|
558
132
|
def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
|
|
559
|
-
|
|
560
|
-
time_range_filters = self._hourly_time_filters(time_filter)
|
|
561
|
-
|
|
562
|
-
raw_queries = []
|
|
563
|
-
for _filter in time_range_filters:
|
|
564
|
-
hourly = self._queries(_filter)
|
|
565
|
-
raw_queries.extend(hourly)
|
|
566
|
-
return self.formatter.format_query(raw_queries)
|
|
133
|
+
return self.api_client.queries(time_filter)
|
|
567
134
|
|
|
568
135
|
def users(self) -> List[dict]:
|
|
569
|
-
|
|
570
|
-
retrieve user from api
|
|
571
|
-
"""
|
|
572
|
-
path = "api/2.0/preview/scim/v2/Users"
|
|
573
|
-
content = self.get(path=path)
|
|
574
|
-
return self.formatter.format_user(content.get("Resources", []))
|
|
575
|
-
|
|
576
|
-
def _view_ddl(self, schema: dict) -> List[dict]:
|
|
577
|
-
path = "api/2.1/unity-catalog/tables"
|
|
578
|
-
payload = {
|
|
579
|
-
"catalog_name": schema["database_id"],
|
|
580
|
-
"schema_name": schema["schema_name"],
|
|
581
|
-
"omit_columns": True,
|
|
582
|
-
}
|
|
583
|
-
content = self.get(path=path, payload=payload)
|
|
584
|
-
return self.formatter.format_view_ddl(content.get("tables", []), schema)
|
|
136
|
+
return self.api_client.users()
|
|
585
137
|
|
|
586
138
|
def view_ddl(self, schemas: List[dict]) -> List[dict]:
|
|
587
|
-
|
|
588
|
-
view_ddl: List[dict] = []
|
|
589
|
-
for schema in schemas:
|
|
590
|
-
v_to_add = self._view_ddl(schema)
|
|
591
|
-
view_ddl.extend(v_to_add)
|
|
592
|
-
return view_ddl
|
|
139
|
+
return self.api_client.view_ddl(schemas)
|