castor-extractor 0.19.0__py3-none-any.whl → 0.19.6__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

Files changed (83)
  1. CHANGELOG.md +29 -2
  2. castor_extractor/file_checker/templates/generic_warehouse.py +1 -1
  3. castor_extractor/knowledge/notion/client/client.py +44 -80
  4. castor_extractor/knowledge/notion/client/client_test.py +9 -4
  5. castor_extractor/knowledge/notion/client/constants.py +1 -0
  6. castor_extractor/knowledge/notion/client/endpoints.py +1 -1
  7. castor_extractor/knowledge/notion/client/pagination.py +9 -5
  8. castor_extractor/quality/soda/assets.py +1 -1
  9. castor_extractor/quality/soda/client/client.py +30 -83
  10. castor_extractor/quality/soda/client/credentials.py +0 -11
  11. castor_extractor/quality/soda/client/endpoints.py +3 -6
  12. castor_extractor/quality/soda/client/pagination.py +25 -0
  13. castor_extractor/utils/__init__.py +13 -2
  14. castor_extractor/utils/client/__init__.py +14 -0
  15. castor_extractor/utils/client/api/__init__.py +5 -0
  16. castor_extractor/utils/client/api/auth.py +76 -0
  17. castor_extractor/utils/client/api/auth_test.py +49 -0
  18. castor_extractor/utils/client/api/client.py +153 -0
  19. castor_extractor/utils/client/api/client_test.py +47 -0
  20. castor_extractor/utils/client/api/pagination.py +83 -0
  21. castor_extractor/utils/client/api/pagination_test.py +51 -0
  22. castor_extractor/utils/{safe_request_test.py → client/api/safe_request_test.py} +4 -1
  23. castor_extractor/utils/client/api/utils.py +9 -0
  24. castor_extractor/utils/client/api/utils_test.py +16 -0
  25. castor_extractor/utils/collection.py +34 -2
  26. castor_extractor/utils/collection_test.py +17 -3
  27. castor_extractor/utils/pager/__init__.py +0 -1
  28. castor_extractor/utils/retry.py +44 -0
  29. castor_extractor/utils/retry_test.py +26 -1
  30. castor_extractor/utils/salesforce/client.py +44 -49
  31. castor_extractor/utils/salesforce/client_test.py +2 -2
  32. castor_extractor/utils/salesforce/pagination.py +33 -0
  33. castor_extractor/visualization/domo/client/client.py +10 -5
  34. castor_extractor/visualization/domo/client/credentials.py +1 -1
  35. castor_extractor/visualization/domo/client/endpoints.py +19 -7
  36. castor_extractor/visualization/looker/api/credentials.py +1 -1
  37. castor_extractor/visualization/metabase/client/api/client.py +26 -11
  38. castor_extractor/visualization/metabase/client/api/credentials.py +1 -1
  39. castor_extractor/visualization/metabase/client/db/credentials.py +1 -1
  40. castor_extractor/visualization/mode/client/credentials.py +1 -1
  41. castor_extractor/visualization/qlik/client/engine/credentials.py +1 -1
  42. castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
  43. castor_extractor/visualization/sigma/client/client.py +106 -111
  44. castor_extractor/visualization/sigma/client/credentials.py +11 -1
  45. castor_extractor/visualization/sigma/client/endpoints.py +1 -1
  46. castor_extractor/visualization/sigma/client/pagination.py +22 -18
  47. castor_extractor/visualization/tableau/tests/unit/rest_api/auth_test.py +0 -1
  48. castor_extractor/visualization/tableau/tests/unit/rest_api/credentials_test.py +0 -3
  49. castor_extractor/visualization/tableau_revamp/assets.py +11 -0
  50. castor_extractor/visualization/tableau_revamp/client/client.py +71 -151
  51. castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +95 -0
  52. castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +128 -0
  53. castor_extractor/visualization/tableau_revamp/client/client_tsc.py +66 -0
  54. castor_extractor/visualization/tableau_revamp/client/{tsc_fields.py → rest_fields.py} +15 -2
  55. castor_extractor/visualization/tableau_revamp/constants.py +0 -2
  56. castor_extractor/visualization/tableau_revamp/extract.py +5 -11
  57. castor_extractor/warehouse/databricks/api_client.py +239 -0
  58. castor_extractor/warehouse/databricks/api_client_test.py +15 -0
  59. castor_extractor/warehouse/databricks/client.py +37 -490
  60. castor_extractor/warehouse/databricks/client_test.py +1 -99
  61. castor_extractor/warehouse/databricks/endpoints.py +28 -0
  62. castor_extractor/warehouse/databricks/lineage.py +141 -0
  63. castor_extractor/warehouse/databricks/lineage_test.py +34 -0
  64. castor_extractor/warehouse/databricks/pagination.py +22 -0
  65. castor_extractor/warehouse/databricks/sql_client.py +90 -0
  66. castor_extractor/warehouse/databricks/utils.py +44 -1
  67. castor_extractor/warehouse/databricks/utils_test.py +58 -1
  68. castor_extractor/warehouse/mysql/client.py +0 -2
  69. castor_extractor/warehouse/salesforce/client.py +12 -59
  70. castor_extractor/warehouse/salesforce/pagination.py +34 -0
  71. castor_extractor/warehouse/sqlserver/client.py +0 -1
  72. castor_extractor-0.19.6.dist-info/METADATA +903 -0
  73. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/RECORD +77 -60
  74. castor_extractor/utils/client/api.py +0 -87
  75. castor_extractor/utils/client/api_test.py +0 -24
  76. castor_extractor/utils/pager/pager_on_token.py +0 -52
  77. castor_extractor/utils/pager/pager_on_token_test.py +0 -73
  78. castor_extractor/visualization/sigma/client/client_test.py +0 -54
  79. castor_extractor-0.19.0.dist-info/METADATA +0 -207
  80. /castor_extractor/utils/{safe_request.py → client/api/safe_request.py} +0 -0
  81. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/LICENCE +0 -0
  82. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/WHEEL +0 -0
  83. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/entry_points.txt +0 -0
@@ -1,93 +1,24 @@
1
1
  import logging
2
- from collections import defaultdict
3
2
  from concurrent.futures import ThreadPoolExecutor
4
- from datetime import date
5
- from enum import Enum
6
- from functools import partial
7
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, cast
8
-
9
- import requests
10
- from databricks import sql # type: ignore
11
- from requests import Response
3
+ from typing import List, Optional, Set
12
4
 
13
5
  from ...utils import (
14
- SafeMode,
15
- at_midnight,
16
- date_after,
17
6
  mapping_from_rows,
18
- retry,
19
- safe_mode,
20
7
  )
21
- from ...utils.client.api import APIClient
22
- from ...utils.pager import PagerOnToken
23
- from ..abstract.time_filter import TimeFilter
8
+ from ..abstract import TimeFilter
9
+ from .api_client import DatabricksAPIClient
24
10
  from .credentials import DatabricksCredentials
25
- from .format import DatabricksFormatter, TagMapping
26
- from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink
27
- from .utils import build_path, tag_label
11
+ from .format import DatabricksFormatter
12
+ from .lineage import deduplicate_lineage, paths_for_column_lineage
13
+ from .sql_client import DatabricksSQLClient, TagEntity
14
+ from .types import TablesColumns, TimestampedLink
28
15
 
29
16
  logger = logging.getLogger(__name__)
30
17
 
31
- _DATABRICKS_CLIENT_TIMEOUT = 90
32
- _DEFAULT_HOUR_MIN = 0
33
- _DEFAULT_HOUR_MAX = 23
34
- _MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
35
- _MAX_NUMBER_OF_QUERY_ERRORS = 1000
36
18
  _MAX_THREADS = 10
37
- _NUM_HOURS_IN_A_DAY = 24
38
- _RETRY_ATTEMPTS = 3
39
- _RETRY_BASE_MS = 1000
40
- _RETRY_EXCEPTIONS = [
41
- requests.exceptions.ConnectTimeout,
42
- ]
43
- _WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
44
-
45
- _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
46
-
47
- safe_lineage_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
48
- safe_query_params = SafeMode((BaseException,), _MAX_NUMBER_OF_QUERY_ERRORS)
49
-
50
-
51
- class TagEntity(Enum):
52
- """Entities that can be tagged in Databricks"""
53
-
54
- COLUMN = "COLUMN"
55
- TABLE = "TABLE"
56
-
57
-
58
- def _day_to_epoch_ms(day: date) -> int:
59
- return int(at_midnight(day).timestamp() * 1000)
60
19
 
61
20
 
62
- def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
63
- return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
64
-
65
-
66
- class LineageLinks:
67
- """
68
- helper class that handles lineage deduplication and filtering
69
- """
70
-
71
- def __init__(self):
72
- self.lineage: Dict[Link, Ostr] = dict()
73
-
74
- def add(self, timestamped_link: TimestampedLink) -> None:
75
- """
76
- keep the most recent lineage link, adding to `self.lineage`
77
- """
78
- parent, child, timestamp = timestamped_link
79
- link = (parent, child)
80
- if not self.lineage.get(link):
81
- self.lineage[link] = timestamp
82
- else:
83
- if not timestamp:
84
- return
85
- # keep most recent link; cast for mypy
86
- recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
87
- self.lineage[link] = recent
88
-
89
-
90
- class DatabricksClient(APIClient):
21
+ class DatabricksClient:
91
22
  """Databricks Client"""
92
23
 
93
24
  def __init__(
@@ -98,111 +29,23 @@ class DatabricksClient(APIClient):
98
29
  has_table_tags: bool = False,
99
30
  has_column_tags: bool = False,
100
31
  ):
101
- super().__init__(host=credentials.host, token=credentials.token)
102
- self._http_path = credentials.http_path
103
- self._db_allowed = db_allowed
104
- self._db_blocked = db_blocked
105
- self._has_table_tags = has_table_tags
106
- self._has_column_tags = has_column_tags
32
+ self.api_client = DatabricksAPIClient(
33
+ credentials=credentials,
34
+ db_allowed=db_allowed,
35
+ db_blocked=db_blocked,
36
+ )
37
+ self.sql_client = DatabricksSQLClient(
38
+ credentials=credentials,
39
+ has_table_tags=has_table_tags,
40
+ has_column_tags=has_column_tags,
41
+ )
107
42
 
108
- self._timeout = _DATABRICKS_CLIENT_TIMEOUT
109
43
  self.formatter = DatabricksFormatter()
110
44
 
111
- def execute_sql(
112
- self,
113
- query: str,
114
- params: Optional[dict] = None,
115
- ):
116
- """
117
- Execute a SQL query on Databricks system tables and return the results.
118
- https://docs.databricks.com/en/dev-tools/python-sql-connector.html
119
-
120
- //!\\ credentials.http_path is required in order to run SQL queries
121
- """
122
- assert self._http_path, "HTTP_PATH is required to run SQL queries"
123
- with sql.connect(
124
- server_hostname=self._host,
125
- http_path=self._http_path,
126
- access_token=self._token,
127
- ) as connection:
128
- with connection.cursor() as cursor:
129
- cursor.execute(query, params)
130
- return cursor.fetchall()
131
-
132
45
  @staticmethod
133
46
  def name() -> str:
134
47
  return "Databricks"
135
48
 
136
- def _keep_catalog(self, catalog: str) -> bool:
137
- """
138
- Helper function to determine if we should keep the Databricks catalog
139
- which is a CastorDoc database
140
- """
141
- if self._db_allowed and catalog not in self._db_allowed:
142
- return False
143
- if self._db_blocked and catalog in self._db_blocked:
144
- return False
145
- return True
146
-
147
- def databases(self) -> List[dict]:
148
- path = "api/2.1/unity-catalog/catalogs"
149
- content = self.get(path=path)
150
- _databases = self.formatter.format_database(content.get("catalogs", []))
151
- return [d for d in _databases if self._keep_catalog(d["database_name"])]
152
-
153
- def _schemas_of_database(self, database: dict) -> List[dict]:
154
- path = "api/2.1/unity-catalog/schemas"
155
- payload = {"catalog_name": database["database_name"]}
156
- content = self.get(path=path, payload=payload)
157
- schemas = content.get("schemas", [])
158
- return self.formatter.format_schema(schemas, database)
159
-
160
- def schemas(self, databases: List[dict]) -> List[dict]:
161
- """
162
- Get the databricks schemas (also sometimes called databases)
163
- (which correspond to the schemas in Castor)
164
- leveraging the unity catalog API
165
- """
166
- return [
167
- schema
168
- for database in databases
169
- for schema in self._schemas_of_database(database)
170
- ]
171
-
172
- @staticmethod
173
- def _process_table_response(response: Response) -> Tuple[dict, str]:
174
- """
175
- Returns both the JSON content and the Workspace ID, which is found
176
- in the response's headers.
177
- """
178
- return response.json(), response.headers[_WORKSPACE_ID_HEADER]
179
-
180
- def _tables_columns_of_schema(
181
- self,
182
- schema: dict,
183
- table_tags: TagMapping,
184
- column_tags: TagMapping,
185
- ) -> TablesColumns:
186
- path = "api/2.1/unity-catalog/tables"
187
- payload = {
188
- "catalog_name": schema["database_id"],
189
- "schema_name": schema["schema_name"],
190
- }
191
- content, workspace_id = self.get(
192
- path=path,
193
- payload=payload,
194
- processor=self._process_table_response,
195
- )
196
- host = self.build_url(self._host, path="")
197
- return self.formatter.format_table_column(
198
- raw_tables=content.get("tables", []),
199
- schema=schema,
200
- host=host,
201
- workspace_id=workspace_id,
202
- table_tags=table_tags,
203
- column_tags=column_tags,
204
- )
205
-
206
49
  @staticmethod
207
50
  def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
208
51
  table_owner_email = table.get("owner_email")
@@ -213,40 +56,6 @@ class DatabricksClient(APIClient):
213
56
  return table
214
57
  return {**table, "owner_external_id": owner_external_id}
215
58
 
216
- def _needs_extraction(self, entity: TagEntity) -> bool:
217
- if entity == TagEntity.TABLE:
218
- return self._has_table_tags
219
- if entity == TagEntity.COLUMN:
220
- return self._has_column_tags
221
- raise AssertionError(f"Entity not supported: {entity}")
222
-
223
- def _get_tags_mapping(self, entity: TagEntity) -> TagMapping:
224
- """
225
- Fetch tags of the given entity and build a mapping:
226
- { path: list[tags] }
227
-
228
- https://docs.databricks.com/en/sql/language-manual/information-schema/table_tags.html
229
- https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
230
- """
231
- if not self._needs_extraction(entity):
232
- # extracting tags require additional credentials (http_path)
233
- return dict()
234
-
235
- table = f"{entity.value.lower()}_tags"
236
- query = f"{_INFORMATION_SCHEMA_SQL}.{table}"
237
- result = self.execute_sql(query)
238
- mapping = defaultdict(list)
239
- for row in result:
240
- dict_row = row.asDict()
241
- keys = ["catalog_name", "schema_name", "table_name"]
242
- if entity == TagEntity.COLUMN:
243
- keys.append("column_name")
244
- path = build_path(dict_row, keys)
245
- label = tag_label(dict_row)
246
- mapping[path].append(label)
247
-
248
- return mapping
249
-
250
59
  @staticmethod
251
60
  def _get_user_mapping(users: List[dict]) -> dict:
252
61
  return {
@@ -254,6 +63,12 @@ class DatabricksClient(APIClient):
254
63
  **mapping_from_rows(users, "user_name", "id"),
255
64
  }
256
65
 
66
+ def schemas(self, databases: List[dict]) -> List[dict]:
67
+ return self.api_client.schemas(databases)
68
+
69
+ def databases(self) -> List[dict]:
70
+ return self.api_client.databases()
71
+
257
72
  def tables_and_columns(
258
73
  self, schemas: List[dict], users: List[dict]
259
74
  ) -> TablesColumns:
@@ -263,10 +78,10 @@ class DatabricksClient(APIClient):
263
78
  tables: List[dict] = []
264
79
  columns: List[dict] = []
265
80
  user_mapping = self._get_user_mapping(users)
266
- table_tags = self._get_tags_mapping(TagEntity.TABLE)
267
- column_tags = self._get_tags_mapping(TagEntity.COLUMN)
81
+ table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
82
+ column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
268
83
  for schema in schemas:
269
- t_to_add, c_to_add = self._tables_columns_of_schema(
84
+ t_to_add, c_to_add = self.api_client.tables_columns_of_schema(
270
85
  schema=schema,
271
86
  table_tags=table_tags,
272
87
  column_tags=column_tags,
@@ -279,82 +94,6 @@ class DatabricksClient(APIClient):
279
94
  columns.extend(c_to_add)
280
95
  return tables, columns
281
96
 
282
- @staticmethod
283
- def _to_table_path(table: dict) -> Ostr:
284
- if table.get("name"):
285
- return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
286
- return None
287
-
288
- @staticmethod
289
- def _to_column_path(column: dict) -> Ostr:
290
- if column.get("name"):
291
- return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
292
- return None
293
-
294
- def _link(
295
- self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
296
- ) -> OTimestampedLink:
297
- """exclude missing path and self-lineage"""
298
- if (not path_from) or (not path_to):
299
- return None
300
- is_self_lineage = path_from.lower() == path_to.lower()
301
- if is_self_lineage:
302
- return None
303
- return (path_from, path_to, timestamp)
304
-
305
- def _single_table_lineage_links(
306
- self, table_path: str, single_table_lineage: dict
307
- ) -> List[TimestampedLink]:
308
- """
309
- process databricks lineage API response for a given table
310
- returns a list of (parent, child, timestamp)
311
-
312
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
313
- we could also have `notebookInfos` or `fileInfo`
314
- """
315
- links: List[OTimestampedLink] = []
316
- # add parent:
317
- for link in single_table_lineage.get("upstreams", []):
318
- parent = link.get("tableInfo", {})
319
- parent_path = self._to_table_path(parent)
320
- timestamp: Ostr = parent.get("lineage_timestamp")
321
- links.append(self._link(parent_path, table_path, timestamp))
322
-
323
- # add children:
324
- for link in single_table_lineage.get("downstreams", []):
325
- child = link.get("tableInfo", {})
326
- child_path = self._to_table_path(child)
327
- timestamp = child.get("lineage_timestamp")
328
- links.append(self._link(table_path, child_path, timestamp))
329
-
330
- return list(filter(None, links))
331
-
332
- @safe_mode(safe_lineage_params, lambda: [])
333
- @retry(
334
- exceptions=_RETRY_EXCEPTIONS,
335
- max_retries=_RETRY_ATTEMPTS,
336
- base_ms=_RETRY_BASE_MS,
337
- )
338
- def get_single_table_lineage(
339
- self, table_path: str
340
- ) -> List[TimestampedLink]:
341
- """
342
- Helper function used in get_lineage_links.
343
- Call data lineage API and return the content of the result
344
- eg table_path: broward_prd.bronze.account_adjustments
345
- FYI: Maximum rate of 50 requests per SECOND
346
- """
347
- path = "api/2.0/lineage-tracking/table-lineage"
348
- payload = {"table_name": table_path, "include_entity_lineage": True}
349
- content = self.get(path=path, payload=payload)
350
- return self._single_table_lineage_links(table_path, content)
351
-
352
- def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
353
- deduplicated_lineage = LineageLinks()
354
- for timestamped_link in lineages:
355
- deduplicated_lineage.add(timestamped_link)
356
- return deduplicated_lineage.lineage
357
-
358
97
  def table_lineage(self, tables: List[dict]) -> List[dict]:
359
98
  """
360
99
  Wrapper function that retrieves all table lineage
@@ -365,94 +104,13 @@ class DatabricksClient(APIClient):
365
104
  ".".join([table["schema_id"], table["table_name"]])
366
105
  for table in tables
367
106
  ]
368
- results = executor.map(self.get_single_table_lineage, table_paths)
107
+ results = executor.map(
108
+ self.api_client.get_single_table_lineage, table_paths
109
+ )
369
110
  lineages = [link for links in results for link in links]
370
- deduplicated = self._deduplicate_lineage(lineages)
111
+ deduplicated = deduplicate_lineage(lineages)
371
112
  return self.formatter.format_lineage(deduplicated)
372
113
 
373
- @staticmethod
374
- def _paths_for_column_lineage(
375
- tables: List[dict], columns: List[dict], table_lineage: List[dict]
376
- ) -> List[Tuple[str, str]]:
377
- """
378
- helper providing a list of candidate columns to look lineage for:
379
- we only look for column lineage where there is table lineage
380
- """
381
- # mapping between table id and its path db.schema.table
382
- # table["schema_id"] follows the pattern `db.schema`
383
- mapping = {
384
- table["id"]: ".".join([table["schema_id"], table["table_name"]])
385
- for table in tables
386
- }
387
-
388
- tables_with_lineage: Set[str] = set()
389
- for t in table_lineage:
390
- tables_with_lineage.add(t["parent_path"])
391
- tables_with_lineage.add(t["child_path"])
392
-
393
- paths_to_return: List[Tuple[str, str]] = []
394
- for column in columns:
395
- table_path = mapping[column["table_id"]]
396
- if table_path not in tables_with_lineage:
397
- continue
398
- column_ = (table_path, column["column_name"])
399
- paths_to_return.append(column_)
400
-
401
- return paths_to_return
402
-
403
- def _single_column_lineage_links(
404
- self, column_path: str, single_column_lineage: dict
405
- ) -> List[TimestampedLink]:
406
- """
407
- process databricks lineage API response for a given table
408
- returns a list of (parent, child, timestamp)
409
-
410
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
411
- we could also have `notebookInfos` or `fileInfo`
412
- """
413
- links: List[OTimestampedLink] = []
414
- # add parent:
415
- for link in single_column_lineage.get("upstream_cols", []):
416
- parent_path = self._to_column_path(link)
417
- timestamp: Ostr = link.get("lineage_timestamp")
418
- links.append(self._link(parent_path, column_path, timestamp))
419
-
420
- # add children:
421
- for link in single_column_lineage.get("downstream_cols", []):
422
- child_path = self._to_column_path(link)
423
- timestamp = link.get("lineage_timestamp")
424
- links.append(self._link(column_path, child_path, timestamp))
425
-
426
- return list(filter(None, links))
427
-
428
- @safe_mode(safe_lineage_params, lambda: [])
429
- @retry(
430
- exceptions=_RETRY_EXCEPTIONS,
431
- max_retries=_RETRY_ATTEMPTS,
432
- base_ms=_RETRY_BASE_MS,
433
- )
434
- def get_single_column_lineage(
435
- self,
436
- names: Tuple[str, str],
437
- ) -> List[TimestampedLink]:
438
- """
439
- Helper function used in get_lineage_links.
440
- Call data lineage API and return the content of the result
441
-
442
- eg table_path: broward_prd.bronze.account_adjustments
443
- FYI: Maximum rate of 10 requests per SECOND
444
- """
445
- table_path, column_name = names
446
- api_path = "api/2.0/lineage-tracking/column-lineage"
447
- payload = {
448
- "table_name": table_path,
449
- "column_name": column_name,
450
- "include_entity_lineage": True,
451
- }
452
- content = self.get(path=api_path, payload=payload)
453
- column_path = f"{table_path}.{column_name}"
454
- return self._single_column_lineage_links(column_path, content)
455
-
456
114
  def column_lineage(
457
115
  self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
458
116
  ) -> List[dict]:
@@ -460,133 +118,22 @@ class DatabricksClient(APIClient):
460
118
  Wrapper function that retrieves all column lineage
461
119
  we only try to retrieve column lineage if we found table lineage
462
120
  """
463
- candidate_paths = self._paths_for_column_lineage(
121
+ candidate_paths = paths_for_column_lineage(
464
122
  tables, columns, table_lineage
465
123
  )
466
124
  lineages: List[TimestampedLink] = [
467
125
  link
468
126
  for paths in candidate_paths
469
- for link in self.get_single_column_lineage(paths)
127
+ for link in self.api_client.get_single_column_lineage(paths)
470
128
  ]
471
- deduplicated = self._deduplicate_lineage(lineages)
129
+ deduplicated = deduplicate_lineage(lineages)
472
130
  return self.formatter.format_lineage(deduplicated)
473
131
 
474
- @staticmethod
475
- def _time_filter_payload(start_time_ms: int, end_time_ms: int) -> dict:
476
- return {
477
- "filter_by": {
478
- "query_start_time_range": {
479
- "end_time_ms": end_time_ms,
480
- "start_time_ms": start_time_ms,
481
- }
482
- }
483
- }
484
-
485
- def _hourly_time_filters(
486
- self, time_filter: Optional[TimeFilter]
487
- ) -> Iterable[dict]:
488
- """time filters to retrieve Databricks' queries: 1h duration each"""
489
- # define an explicit time window
490
- if not time_filter:
491
- time_filter = TimeFilter.default()
492
-
493
- assert time_filter # for mypy
494
-
495
- hour_min = time_filter.hour_min
496
- hour_max = time_filter.hour_max
497
- day = time_filter.day
498
- if hour_min is None or hour_max is None: # fallback to an entire day
499
- hour_min, hour_max = _DEFAULT_HOUR_MIN, _DEFAULT_HOUR_MAX
500
-
501
- for index in range(hour_min, min(hour_max + 1, _NUM_HOURS_IN_A_DAY)):
502
- start_time_ms = _day_hour_to_epoch_ms(day, index)
503
- end_time_ms = _day_hour_to_epoch_ms(day, index + 1)
504
- yield self._time_filter_payload(start_time_ms, end_time_ms)
505
-
506
- def query_payload(
507
- self,
508
- page_token: Optional[str] = None,
509
- max_results: Optional[int] = None,
510
- time_range_filter: Optional[dict] = None,
511
- ) -> dict:
512
- """helper method to build the payload used to retrieve queries"""
513
- # in payload: You can provide only one of 'page_token' or 'filter_by'
514
- if page_token:
515
- payload: Dict[str, Any] = {"page_token": page_token}
516
- else:
517
- if not time_range_filter:
518
- # should never happen.
519
- # `time_range_filter` optional to leverage functiontools.partial
520
- raise ValueError("Time range not specified")
521
- payload = {**time_range_filter}
522
- if max_results:
523
- payload["max_results"] = max_results
524
- return payload
525
-
526
- def _scroll_queries(
527
- self,
528
- page_token: Optional[str] = None,
529
- max_results: Optional[int] = None,
530
- time_range_filter: Optional[dict] = None,
531
- ) -> dict:
532
- """
533
- Callback to scroll the queries api
534
- https://docs.databricks.com/api/workspace/queryhistory/list
535
- max_results: Limit the number of results returned in one page.
536
- The default is 100. (both on our side and Databricks')
537
- """
538
- path = "api/2.0/sql/history/queries"
539
- payload = self.query_payload(page_token, max_results, time_range_filter)
540
- content = self.get(path=path, payload=payload)
541
- return content if content else {}
542
-
543
- @safe_mode(safe_query_params, lambda: [])
544
- @retry(
545
- exceptions=_RETRY_EXCEPTIONS,
546
- max_retries=_RETRY_ATTEMPTS,
547
- base_ms=_RETRY_BASE_MS,
548
- )
549
- def _queries(self, filter_: dict) -> List[dict]:
550
- """helper to retrieve queries using a given time filter"""
551
- _time_filtered_scroll_queries = partial(
552
- self._scroll_queries,
553
- time_range_filter=filter_,
554
- )
555
- # retrieve all queries using pagination
556
- return PagerOnToken(_time_filtered_scroll_queries).all()
557
-
558
132
  def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
559
- """get all queries, hour per hour"""
560
- time_range_filters = self._hourly_time_filters(time_filter)
561
-
562
- raw_queries = []
563
- for _filter in time_range_filters:
564
- hourly = self._queries(_filter)
565
- raw_queries.extend(hourly)
566
- return self.formatter.format_query(raw_queries)
133
+ return self.api_client.queries(time_filter)
567
134
 
568
135
  def users(self) -> List[dict]:
569
- """
570
- retrieve user from api
571
- """
572
- path = "api/2.0/preview/scim/v2/Users"
573
- content = self.get(path=path)
574
- return self.formatter.format_user(content.get("Resources", []))
575
-
576
- def _view_ddl(self, schema: dict) -> List[dict]:
577
- path = "api/2.1/unity-catalog/tables"
578
- payload = {
579
- "catalog_name": schema["database_id"],
580
- "schema_name": schema["schema_name"],
581
- "omit_columns": True,
582
- }
583
- content = self.get(path=path, payload=payload)
584
- return self.formatter.format_view_ddl(content.get("tables", []), schema)
136
+ return self.api_client.users()
585
137
 
586
138
  def view_ddl(self, schemas: List[dict]) -> List[dict]:
587
- """retrieve view ddl"""
588
- view_ddl: List[dict] = []
589
- for schema in schemas:
590
- v_to_add = self._view_ddl(schema)
591
- view_ddl.extend(v_to_add)
592
- return view_ddl
139
+ return self.api_client.view_ddl(schemas)