castor-extractor 0.19.4__py3-none-any.whl → 0.19.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic.
- CHANGELOG.md +13 -0
- castor_extractor/quality/soda/client/pagination.py +1 -1
- castor_extractor/utils/__init__.py +1 -0
- castor_extractor/utils/client/__init__.py +1 -1
- castor_extractor/utils/client/api/__init__.py +1 -1
- castor_extractor/utils/client/api/client.py +33 -7
- castor_extractor/utils/client/api/pagination.py +23 -6
- castor_extractor/utils/pager/__init__.py +0 -1
- castor_extractor/utils/salesforce/client.py +45 -50
- castor_extractor/utils/salesforce/client_test.py +2 -2
- castor_extractor/utils/salesforce/pagination.py +33 -0
- castor_extractor/visualization/metabase/client/api/client.py +30 -11
- castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
- castor_extractor/visualization/sigma/client/client.py +2 -1
- castor_extractor/visualization/tableau_revamp/assets.py +8 -0
- castor_extractor/visualization/tableau_revamp/client/client.py +6 -1
- castor_extractor/warehouse/databricks/api_client.py +239 -0
- castor_extractor/warehouse/databricks/api_client_test.py +15 -0
- castor_extractor/warehouse/databricks/client.py +37 -489
- castor_extractor/warehouse/databricks/client_test.py +1 -99
- castor_extractor/warehouse/databricks/endpoints.py +28 -0
- castor_extractor/warehouse/databricks/lineage.py +141 -0
- castor_extractor/warehouse/databricks/lineage_test.py +34 -0
- castor_extractor/warehouse/databricks/pagination.py +22 -0
- castor_extractor/warehouse/databricks/sql_client.py +90 -0
- castor_extractor/warehouse/databricks/utils.py +44 -1
- castor_extractor/warehouse/databricks/utils_test.py +58 -1
- castor_extractor/warehouse/mysql/client.py +0 -3
- castor_extractor/warehouse/salesforce/client.py +12 -59
- castor_extractor/warehouse/salesforce/pagination.py +34 -0
- castor_extractor/warehouse/sqlserver/client.py +0 -2
- {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/METADATA +14 -1
- {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/RECORD +36 -31
- castor_extractor/utils/client/api_deprecated.py +0 -89
- castor_extractor/utils/client/api_deprecated_test.py +0 -18
- castor_extractor/utils/pager/pager_on_token.py +0 -52
- castor_extractor/utils/pager/pager_on_token_test.py +0 -73
- {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/LICENCE +0 -0
- {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/WHEEL +0 -0
- {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/entry_points.txt +0 -0
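
The largest change in this range is a refactor of the Databricks warehouse extractor: the monolithic `client.py` (+37 -489) is split into dedicated `api_client.py`, `sql_client.py`, `lineage.py`, `endpoints.py`, and `pagination.py` modules with matching tests, while the deprecated `utils/client/api_deprecated.py` base class and the `utils/pager/pager_on_token.py` helper are deleted in favor of the shared `utils/client/api` client and per-connector pagination modules. The diff of `castor_extractor/warehouse/databricks/client.py` below shows the client reduced to a thin facade; note that several deleted lines were truncated by the diff viewer and appear as bare fragments such as `from .` or `self.`. A sketch of the lineage deduplication that moved into `lineage.py` follows the diff.

For context on the removed pager: the old `_queries` helper scrolled the Databricks query-history endpoint with `PagerOnToken(callback).all()`. Below is a minimal sketch of that token-scrolling pattern, assuming a hypothetical `fetch_page` callback; the `res` and `next_page_token` keys match the Databricks query-history response shape, but the package's new `pagination.py` helpers may be structured differently.

```python
from typing import Callable, List, Optional

def scroll_all(fetch_page: Callable[[Optional[str]], dict]) -> List[dict]:
    """Collect every page from a token-paginated endpoint.

    Illustrative only: mirrors what the removed PagerOnToken did with the
    callback built from `_scroll_queries` (call, harvest, follow the token).
    """
    results: List[dict] = []
    token: Optional[str] = None
    while True:
        content = fetch_page(token) or {}
        results.extend(content.get("res", []))
        token = content.get("next_page_token")
        if not token:  # no token means the last page was reached
            return results
```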
```diff
--- a/castor_extractor/warehouse/databricks/client.py
+++ b/castor_extractor/warehouse/databricks/client.py
@@ -1,92 +1,24 @@
 import logging
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
-from
-from enum import Enum
-from functools import partial
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, cast
-
-import requests
-from databricks import sql  # type: ignore
-from requests import Response
+from typing import List, Optional, Set
 
 from ...utils import (
-    SafeMode,
-    at_midnight,
     mapping_from_rows,
-    retry,
-    safe_mode,
 )
-from
-from
-from ..abstract.time_filter import TimeFilter
+from ..abstract import TimeFilter
+from .api_client import DatabricksAPIClient
 from .credentials import DatabricksCredentials
-from .format import DatabricksFormatter
-from .
-from .
+from .format import DatabricksFormatter
+from .lineage import deduplicate_lineage, paths_for_column_lineage
+from .sql_client import DatabricksSQLClient, TagEntity
+from .types import TablesColumns, TimestampedLink
 
 logger = logging.getLogger(__name__)
 
-_DATABRICKS_CLIENT_TIMEOUT = 90
-_DEFAULT_HOUR_MIN = 0
-_DEFAULT_HOUR_MAX = 23
-_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
-_MAX_NUMBER_OF_QUERY_ERRORS = 1000
 _MAX_THREADS = 10
-_NUM_HOURS_IN_A_DAY = 24
-_RETRY_ATTEMPTS = 3
-_RETRY_BASE_MS = 1000
-_RETRY_EXCEPTIONS = [
-    requests.exceptions.ConnectTimeout,
-]
-_WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
-
-_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
-
-safe_lineage_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
-safe_query_params = SafeMode((BaseException,), _MAX_NUMBER_OF_QUERY_ERRORS)
-
-
-class TagEntity(Enum):
-    """Entities that can be tagged in Databricks"""
-
-    COLUMN = "COLUMN"
-    TABLE = "TABLE"
-
-
-def _day_to_epoch_ms(day: date) -> int:
-    return int(at_midnight(day).timestamp() * 1000)
 
 
-def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
-    return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
-
-
-class LineageLinks:
-    """
-    helper class that handles lineage deduplication and filtering
-    """
-
-    def __init__(self):
-        self.lineage: Dict[Link, Ostr] = dict()
-
-    def add(self, timestamped_link: TimestampedLink) -> None:
-        """
-        keep the most recent lineage link, adding to `self.lineage`
-        """
-        parent, child, timestamp = timestamped_link
-        link = (parent, child)
-        if not self.lineage.get(link):
-            self.lineage[link] = timestamp
-        else:
-            if not timestamp:
-                return
-            # keep most recent link; cast for mypy
-            recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
-            self.lineage[link] = recent
-
-
-class DatabricksClient(APIClientDeprecated):
+class DatabricksClient:
     """Databricks Client"""
 
     def __init__(
@@ -97,111 +29,23 @@ class DatabricksClient(APIClientDeprecated):
         has_table_tags: bool = False,
         has_column_tags: bool = False,
     ):
-
-
-
-
-
-        self.
+        self.api_client = DatabricksAPIClient(
+            credentials=credentials,
+            db_allowed=db_allowed,
+            db_blocked=db_blocked,
+        )
+        self.sql_client = DatabricksSQLClient(
+            credentials=credentials,
+            has_table_tags=has_table_tags,
+            has_column_tags=has_column_tags,
+        )
 
-        self._timeout = _DATABRICKS_CLIENT_TIMEOUT
         self.formatter = DatabricksFormatter()
 
-    def execute_sql(
-        self,
-        query: str,
-        params: Optional[dict] = None,
-    ):
-        """
-        Execute a SQL query on Databricks system tables and return the results.
-        https://docs.databricks.com/en/dev-tools/python-sql-connector.html
-
-        //!\\ credentials.http_path is required in order to run SQL queries
-        """
-        assert self._http_path, "HTTP_PATH is required to run SQL queries"
-        with sql.connect(
-            server_hostname=self._host,
-            http_path=self._http_path,
-            access_token=self._token,
-        ) as connection:
-            with connection.cursor() as cursor:
-                cursor.execute(query, params)
-                return cursor.fetchall()
-
     @staticmethod
     def name() -> str:
         return "Databricks"
 
-    def _keep_catalog(self, catalog: str) -> bool:
-        """
-        Helper function to determine if we should keep the Databricks catalog
-        which is a CastorDoc database
-        """
-        if self._db_allowed and catalog not in self._db_allowed:
-            return False
-        if self._db_blocked and catalog in self._db_blocked:
-            return False
-        return True
-
-    def databases(self) -> List[dict]:
-        path = "api/2.1/unity-catalog/catalogs"
-        content = self.get(path=path)
-        _databases = self.formatter.format_database(content.get("catalogs", []))
-        return [d for d in _databases if self._keep_catalog(d["database_name"])]
-
-    def _schemas_of_database(self, database: dict) -> List[dict]:
-        path = "api/2.1/unity-catalog/schemas"
-        payload = {"catalog_name": database["database_name"]}
-        content = self.get(path=path, payload=payload)
-        schemas = content.get("schemas", [])
-        return self.formatter.format_schema(schemas, database)
-
-    def schemas(self, databases: List[dict]) -> List[dict]:
-        """
-        Get the databricks schemas (also sometimes called databases)
-        (which correspond to the schemas in Castor)
-        leveraging the unity catalog API
-        """
-        return [
-            schema
-            for database in databases
-            for schema in self._schemas_of_database(database)
-        ]
-
-    @staticmethod
-    def _process_table_response(response: Response) -> Tuple[dict, str]:
-        """
-        Returns both the JSON content and the Workspace ID, which is found
-        in the response's headers.
-        """
-        return response.json(), response.headers[_WORKSPACE_ID_HEADER]
-
-    def _tables_columns_of_schema(
-        self,
-        schema: dict,
-        table_tags: TagMapping,
-        column_tags: TagMapping,
-    ) -> TablesColumns:
-        path = "api/2.1/unity-catalog/tables"
-        payload = {
-            "catalog_name": schema["database_id"],
-            "schema_name": schema["schema_name"],
-        }
-        content, workspace_id = self.get(
-            path=path,
-            payload=payload,
-            processor=self._process_table_response,
-        )
-        host = self.build_url(self._host, path="")
-        return self.formatter.format_table_column(
-            raw_tables=content.get("tables", []),
-            schema=schema,
-            host=host,
-            workspace_id=workspace_id,
-            table_tags=table_tags,
-            column_tags=column_tags,
-        )
-
     @staticmethod
     def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
         table_owner_email = table.get("owner_email")
@@ -212,40 +56,6 @@ class DatabricksClient(APIClientDeprecated):
             return table
         return {**table, "owner_external_id": owner_external_id}
 
-    def _needs_extraction(self, entity: TagEntity) -> bool:
-        if entity == TagEntity.TABLE:
-            return self._has_table_tags
-        if entity == TagEntity.COLUMN:
-            return self._has_column_tags
-        raise AssertionError(f"Entity not supported: {entity}")
-
-    def _get_tags_mapping(self, entity: TagEntity) -> TagMapping:
-        """
-        Fetch tags of the given entity and build a mapping:
-        { path: list[tags] }
-
-        https://docs.databricks.com/en/sql/language-manual/information-schema/table_tags.html
-        https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
-        """
-        if not self._needs_extraction(entity):
-            # extracting tags require additional credentials (http_path)
-            return dict()
-
-        table = f"{entity.value.lower()}_tags"
-        query = f"{_INFORMATION_SCHEMA_SQL}.{table}"
-        result = self.execute_sql(query)
-        mapping = defaultdict(list)
-        for row in result:
-            dict_row = row.asDict()
-            keys = ["catalog_name", "schema_name", "table_name"]
-            if entity == TagEntity.COLUMN:
-                keys.append("column_name")
-            path = build_path(dict_row, keys)
-            label = tag_label(dict_row)
-            mapping[path].append(label)
-
-        return mapping
-
     @staticmethod
     def _get_user_mapping(users: List[dict]) -> dict:
         return {
@@ -253,6 +63,12 @@ class DatabricksClient(APIClientDeprecated):
             **mapping_from_rows(users, "user_name", "id"),
         }
 
+    def schemas(self, databases: List[dict]) -> List[dict]:
+        return self.api_client.schemas(databases)
+
+    def databases(self) -> List[dict]:
+        return self.api_client.databases()
+
     def tables_and_columns(
         self, schemas: List[dict], users: List[dict]
     ) -> TablesColumns:
@@ -262,10 +78,10 @@ class DatabricksClient(APIClientDeprecated):
         tables: List[dict] = []
         columns: List[dict] = []
         user_mapping = self._get_user_mapping(users)
-        table_tags = self.
-        column_tags = self.
+        table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
+        column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
         for schema in schemas:
-            t_to_add, c_to_add = self.
+            t_to_add, c_to_add = self.api_client.tables_columns_of_schema(
                 schema=schema,
                 table_tags=table_tags,
                 column_tags=column_tags,
@@ -278,82 +94,6 @@ class DatabricksClient(APIClientDeprecated):
             columns.extend(c_to_add)
         return tables, columns
 
-    @staticmethod
-    def _to_table_path(table: dict) -> Ostr:
-        if table.get("name"):
-            return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
-        return None
-
-    @staticmethod
-    def _to_column_path(column: dict) -> Ostr:
-        if column.get("name"):
-            return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
-        return None
-
-    def _link(
-        self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
-    ) -> OTimestampedLink:
-        """exclude missing path and self-lineage"""
-        if (not path_from) or (not path_to):
-            return None
-        is_self_lineage = path_from.lower() == path_to.lower()
-        if is_self_lineage:
-            return None
-        return (path_from, path_to, timestamp)
-
-    def _single_table_lineage_links(
-        self, table_path: str, single_table_lineage: dict
-    ) -> List[TimestampedLink]:
-        """
-        process databricks lineage API response for a given table
-        returns a list of (parent, child, timestamp)
-
-        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-        we could also have `notebookInfos` or `fileInfo`
-        """
-        links: List[OTimestampedLink] = []
-        # add parent:
-        for link in single_table_lineage.get("upstreams", []):
-            parent = link.get("tableInfo", {})
-            parent_path = self._to_table_path(parent)
-            timestamp: Ostr = parent.get("lineage_timestamp")
-            links.append(self._link(parent_path, table_path, timestamp))
-
-        # add children:
-        for link in single_table_lineage.get("downstreams", []):
-            child = link.get("tableInfo", {})
-            child_path = self._to_table_path(child)
-            timestamp = child.get("lineage_timestamp")
-            links.append(self._link(table_path, child_path, timestamp))
-
-        return list(filter(None, links))
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_table_lineage(
-        self, table_path: str
-    ) -> List[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 50 requests per SECOND
-        """
-        path = "api/2.0/lineage-tracking/table-lineage"
-        payload = {"table_name": table_path, "include_entity_lineage": True}
-        content = self.get(path=path, payload=payload)
-        return self._single_table_lineage_links(table_path, content)
-
-    def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
-        deduplicated_lineage = LineageLinks()
-        for timestamped_link in lineages:
-            deduplicated_lineage.add(timestamped_link)
-        return deduplicated_lineage.lineage
-
     def table_lineage(self, tables: List[dict]) -> List[dict]:
         """
         Wrapper function that retrieves all table lineage
@@ -364,94 +104,13 @@ class DatabricksClient(APIClientDeprecated):
             ".".join([table["schema_id"], table["table_name"]])
             for table in tables
         ]
-            results = executor.map(
+            results = executor.map(
+                self.api_client.get_single_table_lineage, table_paths
+            )
         lineages = [link for links in results for link in links]
-        deduplicated =
+        deduplicated = deduplicate_lineage(lineages)
         return self.formatter.format_lineage(deduplicated)
 
-    @staticmethod
-    def _paths_for_column_lineage(
-        tables: List[dict], columns: List[dict], table_lineage: List[dict]
-    ) -> List[Tuple[str, str]]:
-        """
-        helper providing a list of candidate columns to look lineage for:
-        we only look for column lineage where there is table lineage
-        """
-        # mapping between table id and its path db.schema.table
-        # table["schema_id"] follows the pattern `db.schema`
-        mapping = {
-            table["id"]: ".".join([table["schema_id"], table["table_name"]])
-            for table in tables
-        }
-
-        tables_with_lineage: Set[str] = set()
-        for t in table_lineage:
-            tables_with_lineage.add(t["parent_path"])
-            tables_with_lineage.add(t["child_path"])
-
-        paths_to_return: List[Tuple[str, str]] = []
-        for column in columns:
-            table_path = mapping[column["table_id"]]
-            if table_path not in tables_with_lineage:
-                continue
-            column_ = (table_path, column["column_name"])
-            paths_to_return.append(column_)
-
-        return paths_to_return
-
-    def _single_column_lineage_links(
-        self, column_path: str, single_column_lineage: dict
-    ) -> List[TimestampedLink]:
-        """
-        process databricks lineage API response for a given table
-        returns a list of (parent, child, timestamp)
-
-        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-        we could also have `notebookInfos` or `fileInfo`
-        """
-        links: List[OTimestampedLink] = []
-        # add parent:
-        for link in single_column_lineage.get("upstream_cols", []):
-            parent_path = self._to_column_path(link)
-            timestamp: Ostr = link.get("lineage_timestamp")
-            links.append(self._link(parent_path, column_path, timestamp))
-
-        # add children:
-        for link in single_column_lineage.get("downstream_cols", []):
-            child_path = self._to_column_path(link)
-            timestamp = link.get("lineage_timestamp")
-            links.append(self._link(column_path, child_path, timestamp))
-
-        return list(filter(None, links))
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_column_lineage(
-        self,
-        names: Tuple[str, str],
-    ) -> List[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 10 requests per SECOND
-        """
-        table_path, column_name = names
-        api_path = "api/2.0/lineage-tracking/column-lineage"
-        payload = {
-            "table_name": table_path,
-            "column_name": column_name,
-            "include_entity_lineage": True,
-        }
-        content = self.get(path=api_path, payload=payload)
-        column_path = f"{table_path}.{column_name}"
-        return self._single_column_lineage_links(column_path, content)
-
     def column_lineage(
         self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
     ) -> List[dict]:
@@ -459,133 +118,22 @@ class DatabricksClient(APIClientDeprecated):
         Wrapper function that retrieves all column lineage
         we only try to retrieve column lineage if we found table lineage
         """
-        candidate_paths =
+        candidate_paths = paths_for_column_lineage(
            tables, columns, table_lineage
         )
         lineages: List[TimestampedLink] = [
             link
             for paths in candidate_paths
-            for link in self.get_single_column_lineage(paths)
+            for link in self.api_client.get_single_column_lineage(paths)
         ]
-        deduplicated =
+        deduplicated = deduplicate_lineage(lineages)
         return self.formatter.format_lineage(deduplicated)
 
-    @staticmethod
-    def _time_filter_payload(start_time_ms: int, end_time_ms: int) -> dict:
-        return {
-            "filter_by": {
-                "query_start_time_range": {
-                    "end_time_ms": end_time_ms,
-                    "start_time_ms": start_time_ms,
-                }
-            }
-        }
-
-    def _hourly_time_filters(
-        self, time_filter: Optional[TimeFilter]
-    ) -> Iterable[dict]:
-        """time filters to retrieve Databricks' queries: 1h duration each"""
-        # define an explicit time window
-        if not time_filter:
-            time_filter = TimeFilter.default()
-
-        assert time_filter  # for mypy
-
-        hour_min = time_filter.hour_min
-        hour_max = time_filter.hour_max
-        day = time_filter.day
-        if hour_min is None or hour_max is None:  # fallback to an entire day
-            hour_min, hour_max = _DEFAULT_HOUR_MIN, _DEFAULT_HOUR_MAX
-
-        for index in range(hour_min, min(hour_max + 1, _NUM_HOURS_IN_A_DAY)):
-            start_time_ms = _day_hour_to_epoch_ms(day, index)
-            end_time_ms = _day_hour_to_epoch_ms(day, index + 1)
-            yield self._time_filter_payload(start_time_ms, end_time_ms)
-
-    def query_payload(
-        self,
-        page_token: Optional[str] = None,
-        max_results: Optional[int] = None,
-        time_range_filter: Optional[dict] = None,
-    ) -> dict:
-        """helper method to build the payload used to retrieve queries"""
-        # in payload: You can provide only one of 'page_token' or 'filter_by'
-        if page_token:
-            payload: Dict[str, Any] = {"page_token": page_token}
-        else:
-            if not time_range_filter:
-                # should never happen.
-                # `time_range_filter` optional to leverage functiontools.partial
-                raise ValueError("Time range not specified")
-            payload = {**time_range_filter}
-        if max_results:
-            payload["max_results"] = max_results
-        return payload
-
-    def _scroll_queries(
-        self,
-        page_token: Optional[str] = None,
-        max_results: Optional[int] = None,
-        time_range_filter: Optional[dict] = None,
-    ) -> dict:
-        """
-        Callback to scroll the queries api
-        https://docs.databricks.com/api/workspace/queryhistory/list
-        max_results: Limit the number of results returned in one page.
-        The default is 100. (both on our side and Databricks')
-        """
-        path = "api/2.0/sql/history/queries"
-        payload = self.query_payload(page_token, max_results, time_range_filter)
-        content = self.get(path=path, payload=payload)
-        return content if content else {}
-
-    @safe_mode(safe_query_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def _queries(self, filter_: dict) -> List[dict]:
-        """helper to retrieve queries using a given time filter"""
-        _time_filtered_scroll_queries = partial(
-            self._scroll_queries,
-            time_range_filter=filter_,
-        )
-        # retrieve all queries using pagination
-        return PagerOnToken(_time_filtered_scroll_queries).all()
-
     def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
-
-        time_range_filters = self._hourly_time_filters(time_filter)
-
-        raw_queries = []
-        for _filter in time_range_filters:
-            hourly = self._queries(_filter)
-            raw_queries.extend(hourly)
-        return self.formatter.format_query(raw_queries)
+        return self.api_client.queries(time_filter)
 
     def users(self) -> List[dict]:
-        """
-        retrieve user from api
-        """
-        path = "api/2.0/preview/scim/v2/Users"
-        content = self.get(path=path)
-        return self.formatter.format_user(content.get("Resources", []))
-
-    def _view_ddl(self, schema: dict) -> List[dict]:
-        path = "api/2.1/unity-catalog/tables"
-        payload = {
-            "catalog_name": schema["database_id"],
-            "schema_name": schema["schema_name"],
-            "omit_columns": True,
-        }
-        content = self.get(path=path, payload=payload)
-        return self.formatter.format_view_ddl(content.get("tables", []), schema)
+        return self.api_client.users()
 
     def view_ddl(self, schemas: List[dict]) -> List[dict]:
-
-        view_ddl: List[dict] = []
-        for schema in schemas:
-            v_to_add = self._view_ddl(schema)
-            view_ddl.extend(v_to_add)
-        return view_ddl
+        return self.api_client.view_ddl(schemas)
```