castor-extractor 0.19.4__py3-none-any.whl → 0.19.7__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.

Potentially problematic release: this version of castor-extractor might be problematic.

Files changed (40)
  1. CHANGELOG.md +13 -0
  2. castor_extractor/quality/soda/client/pagination.py +1 -1
  3. castor_extractor/utils/__init__.py +1 -0
  4. castor_extractor/utils/client/__init__.py +1 -1
  5. castor_extractor/utils/client/api/__init__.py +1 -1
  6. castor_extractor/utils/client/api/client.py +33 -7
  7. castor_extractor/utils/client/api/pagination.py +23 -6
  8. castor_extractor/utils/pager/__init__.py +0 -1
  9. castor_extractor/utils/salesforce/client.py +45 -50
  10. castor_extractor/utils/salesforce/client_test.py +2 -2
  11. castor_extractor/utils/salesforce/pagination.py +33 -0
  12. castor_extractor/visualization/metabase/client/api/client.py +30 -11
  13. castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
  14. castor_extractor/visualization/sigma/client/client.py +2 -1
  15. castor_extractor/visualization/tableau_revamp/assets.py +8 -0
  16. castor_extractor/visualization/tableau_revamp/client/client.py +6 -1
  17. castor_extractor/warehouse/databricks/api_client.py +239 -0
  18. castor_extractor/warehouse/databricks/api_client_test.py +15 -0
  19. castor_extractor/warehouse/databricks/client.py +37 -489
  20. castor_extractor/warehouse/databricks/client_test.py +1 -99
  21. castor_extractor/warehouse/databricks/endpoints.py +28 -0
  22. castor_extractor/warehouse/databricks/lineage.py +141 -0
  23. castor_extractor/warehouse/databricks/lineage_test.py +34 -0
  24. castor_extractor/warehouse/databricks/pagination.py +22 -0
  25. castor_extractor/warehouse/databricks/sql_client.py +90 -0
  26. castor_extractor/warehouse/databricks/utils.py +44 -1
  27. castor_extractor/warehouse/databricks/utils_test.py +58 -1
  28. castor_extractor/warehouse/mysql/client.py +0 -3
  29. castor_extractor/warehouse/salesforce/client.py +12 -59
  30. castor_extractor/warehouse/salesforce/pagination.py +34 -0
  31. castor_extractor/warehouse/sqlserver/client.py +0 -2
  32. {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/METADATA +14 -1
  33. {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/RECORD +36 -31
  34. castor_extractor/utils/client/api_deprecated.py +0 -89
  35. castor_extractor/utils/client/api_deprecated_test.py +0 -18
  36. castor_extractor/utils/pager/pager_on_token.py +0 -52
  37. castor_extractor/utils/pager/pager_on_token_test.py +0 -73
  38. {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/LICENCE +0 -0
  39. {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/WHEEL +0 -0
  40. {castor_extractor-0.19.4.dist-info → castor_extractor-0.19.7.dist-info}/entry_points.txt +0 -0
@@ -1,92 +1,24 @@
 import logging
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
-from datetime import date
-from enum import Enum
-from functools import partial
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, cast
-
-import requests
-from databricks import sql # type: ignore
-from requests import Response
+from typing import List, Optional, Set
 
 from ...utils import (
-    SafeMode,
-    at_midnight,
     mapping_from_rows,
-    retry,
-    safe_mode,
 )
-from ...utils.client import APIClientDeprecated
-from ...utils.pager import PagerOnToken
-from ..abstract.time_filter import TimeFilter
+from ..abstract import TimeFilter
+from .api_client import DatabricksAPIClient
 from .credentials import DatabricksCredentials
-from .format import DatabricksFormatter, TagMapping
-from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink
-from .utils import build_path, tag_label
+from .format import DatabricksFormatter
+from .lineage import deduplicate_lineage, paths_for_column_lineage
+from .sql_client import DatabricksSQLClient, TagEntity
+from .types import TablesColumns, TimestampedLink
 
 logger = logging.getLogger(__name__)
 
-_DATABRICKS_CLIENT_TIMEOUT = 90
-_DEFAULT_HOUR_MIN = 0
-_DEFAULT_HOUR_MAX = 23
-_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
-_MAX_NUMBER_OF_QUERY_ERRORS = 1000
 _MAX_THREADS = 10
-_NUM_HOURS_IN_A_DAY = 24
-_RETRY_ATTEMPTS = 3
-_RETRY_BASE_MS = 1000
-_RETRY_EXCEPTIONS = [
-    requests.exceptions.ConnectTimeout,
-]
-_WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
-
-_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
-
-safe_lineage_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
-safe_query_params = SafeMode((BaseException,), _MAX_NUMBER_OF_QUERY_ERRORS)
-
-
-class TagEntity(Enum):
-    """Entities that can be tagged in Databricks"""
-
-    COLUMN = "COLUMN"
-    TABLE = "TABLE"
-
-
-def _day_to_epoch_ms(day: date) -> int:
-    return int(at_midnight(day).timestamp() * 1000)
 
 
-def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
-    return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
-
-
-class LineageLinks:
-    """
-    helper class that handles lineage deduplication and filtering
-    """
-
-    def __init__(self):
-        self.lineage: Dict[Link, Ostr] = dict()
-
-    def add(self, timestamped_link: TimestampedLink) -> None:
-        """
-        keep the most recent lineage link, adding to `self.lineage`
-        """
-        parent, child, timestamp = timestamped_link
-        link = (parent, child)
-        if not self.lineage.get(link):
-            self.lineage[link] = timestamp
-        else:
-            if not timestamp:
-                return
-            # keep most recent link; cast for mypy
-            recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
-            self.lineage[link] = recent
-
-
-class DatabricksClient(APIClientDeprecated):
+class DatabricksClient:
     """Databricks Client"""
 
     def __init__(
@@ -97,111 +29,23 @@ class DatabricksClient(APIClientDeprecated):
         has_table_tags: bool = False,
         has_column_tags: bool = False,
     ):
-        super().__init__(host=credentials.host, token=credentials.token)
-        self._http_path = credentials.http_path
-        self._db_allowed = db_allowed
-        self._db_blocked = db_blocked
-        self._has_table_tags = has_table_tags
-        self._has_column_tags = has_column_tags
+        self.api_client = DatabricksAPIClient(
+            credentials=credentials,
+            db_allowed=db_allowed,
+            db_blocked=db_blocked,
+        )
+        self.sql_client = DatabricksSQLClient(
+            credentials=credentials,
+            has_table_tags=has_table_tags,
+            has_column_tags=has_column_tags,
+        )
 
-        self._timeout = _DATABRICKS_CLIENT_TIMEOUT
         self.formatter = DatabricksFormatter()
 
-    def execute_sql(
-        self,
-        query: str,
-        params: Optional[dict] = None,
-    ):
-        """
-        Execute a SQL query on Databricks system tables and return the results.
-        https://docs.databricks.com/en/dev-tools/python-sql-connector.html
-
-        //!\\ credentials.http_path is required in order to run SQL queries
-        """
-        assert self._http_path, "HTTP_PATH is required to run SQL queries"
-        with sql.connect(
-            server_hostname=self._host,
-            http_path=self._http_path,
-            access_token=self._token,
-        ) as connection:
-            with connection.cursor() as cursor:
-                cursor.execute(query, params)
-                return cursor.fetchall()
-
     @staticmethod
     def name() -> str:
         return "Databricks"
 
-    def _keep_catalog(self, catalog: str) -> bool:
-        """
-        Helper function to determine if we should keep the Databricks catalog
-        which is a CastorDoc database
-        """
-        if self._db_allowed and catalog not in self._db_allowed:
-            return False
-        if self._db_blocked and catalog in self._db_blocked:
-            return False
-        return True
-
-    def databases(self) -> List[dict]:
-        path = "api/2.1/unity-catalog/catalogs"
-        content = self.get(path=path)
-        _databases = self.formatter.format_database(content.get("catalogs", []))
-        return [d for d in _databases if self._keep_catalog(d["database_name"])]
-
-    def _schemas_of_database(self, database: dict) -> List[dict]:
-        path = "api/2.1/unity-catalog/schemas"
-        payload = {"catalog_name": database["database_name"]}
-        content = self.get(path=path, payload=payload)
-        schemas = content.get("schemas", [])
-        return self.formatter.format_schema(schemas, database)
-
-    def schemas(self, databases: List[dict]) -> List[dict]:
-        """
-        Get the databricks schemas (also sometimes called databases)
-        (which correspond to the schemas in Castor)
-        leveraging the unity catalog API
-        """
-        return [
-            schema
-            for database in databases
-            for schema in self._schemas_of_database(database)
-        ]
-
-    @staticmethod
-    def _process_table_response(response: Response) -> Tuple[dict, str]:
-        """
-        Returns both the JSON content and the Workspace ID, which is found
-        in the response's headers.
-        """
-        return response.json(), response.headers[_WORKSPACE_ID_HEADER]
-
-    def _tables_columns_of_schema(
-        self,
-        schema: dict,
-        table_tags: TagMapping,
-        column_tags: TagMapping,
-    ) -> TablesColumns:
-        path = "api/2.1/unity-catalog/tables"
-        payload = {
-            "catalog_name": schema["database_id"],
-            "schema_name": schema["schema_name"],
-        }
-        content, workspace_id = self.get(
-            path=path,
-            payload=payload,
-            processor=self._process_table_response,
-        )
-        host = self.build_url(self._host, path="")
-        return self.formatter.format_table_column(
-            raw_tables=content.get("tables", []),
-            schema=schema,
-            host=host,
-            workspace_id=workspace_id,
-            table_tags=table_tags,
-            column_tags=column_tags,
-        )
-
     @staticmethod
     def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
         table_owner_email = table.get("owner_email")
@@ -212,40 +56,6 @@ class DatabricksClient(APIClientDeprecated):
             return table
         return {**table, "owner_external_id": owner_external_id}
 
-    def _needs_extraction(self, entity: TagEntity) -> bool:
-        if entity == TagEntity.TABLE:
-            return self._has_table_tags
-        if entity == TagEntity.COLUMN:
-            return self._has_column_tags
-        raise AssertionError(f"Entity not supported: {entity}")
-
-    def _get_tags_mapping(self, entity: TagEntity) -> TagMapping:
-        """
-        Fetch tags of the given entity and build a mapping:
-        { path: list[tags] }
-
-        https://docs.databricks.com/en/sql/language-manual/information-schema/table_tags.html
-        https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
-        """
-        if not self._needs_extraction(entity):
-            # extracting tags require additional credentials (http_path)
-            return dict()
-
-        table = f"{entity.value.lower()}_tags"
-        query = f"{_INFORMATION_SCHEMA_SQL}.{table}"
-        result = self.execute_sql(query)
-        mapping = defaultdict(list)
-        for row in result:
-            dict_row = row.asDict()
-            keys = ["catalog_name", "schema_name", "table_name"]
-            if entity == TagEntity.COLUMN:
-                keys.append("column_name")
-            path = build_path(dict_row, keys)
-            label = tag_label(dict_row)
-            mapping[path].append(label)
-
-        return mapping
-
     @staticmethod
     def _get_user_mapping(users: List[dict]) -> dict:
         return {
@@ -253,6 +63,12 @@ class DatabricksClient(APIClientDeprecated):
             **mapping_from_rows(users, "user_name", "id"),
         }
 
+    def schemas(self, databases: List[dict]) -> List[dict]:
+        return self.api_client.schemas(databases)
+
+    def databases(self) -> List[dict]:
+        return self.api_client.databases()
+
     def tables_and_columns(
         self, schemas: List[dict], users: List[dict]
     ) -> TablesColumns:
@@ -262,10 +78,10 @@ class DatabricksClient(APIClientDeprecated):
         tables: List[dict] = []
         columns: List[dict] = []
         user_mapping = self._get_user_mapping(users)
-        table_tags = self._get_tags_mapping(TagEntity.TABLE)
-        column_tags = self._get_tags_mapping(TagEntity.COLUMN)
+        table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
+        column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
         for schema in schemas:
-            t_to_add, c_to_add = self._tables_columns_of_schema(
+            t_to_add, c_to_add = self.api_client.tables_columns_of_schema(
                 schema=schema,
                 table_tags=table_tags,
                 column_tags=column_tags,
@@ -278,82 +94,6 @@ class DatabricksClient(APIClientDeprecated):
             columns.extend(c_to_add)
         return tables, columns
 
-    @staticmethod
-    def _to_table_path(table: dict) -> Ostr:
-        if table.get("name"):
-            return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
-        return None
-
-    @staticmethod
-    def _to_column_path(column: dict) -> Ostr:
-        if column.get("name"):
-            return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
-        return None
-
-    def _link(
-        self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
-    ) -> OTimestampedLink:
-        """exclude missing path and self-lineage"""
-        if (not path_from) or (not path_to):
-            return None
-        is_self_lineage = path_from.lower() == path_to.lower()
-        if is_self_lineage:
-            return None
-        return (path_from, path_to, timestamp)
-
-    def _single_table_lineage_links(
-        self, table_path: str, single_table_lineage: dict
-    ) -> List[TimestampedLink]:
-        """
-        process databricks lineage API response for a given table
-        returns a list of (parent, child, timestamp)
-
-        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-        we could also have `notebookInfos` or `fileInfo`
-        """
-        links: List[OTimestampedLink] = []
-        # add parent:
-        for link in single_table_lineage.get("upstreams", []):
-            parent = link.get("tableInfo", {})
-            parent_path = self._to_table_path(parent)
-            timestamp: Ostr = parent.get("lineage_timestamp")
-            links.append(self._link(parent_path, table_path, timestamp))
-
-        # add children:
-        for link in single_table_lineage.get("downstreams", []):
-            child = link.get("tableInfo", {})
-            child_path = self._to_table_path(child)
-            timestamp = child.get("lineage_timestamp")
-            links.append(self._link(table_path, child_path, timestamp))
-
-        return list(filter(None, links))
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_table_lineage(
-        self, table_path: str
-    ) -> List[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 50 requests per SECOND
-        """
-        path = "api/2.0/lineage-tracking/table-lineage"
-        payload = {"table_name": table_path, "include_entity_lineage": True}
-        content = self.get(path=path, payload=payload)
-        return self._single_table_lineage_links(table_path, content)
-
-    def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
-        deduplicated_lineage = LineageLinks()
-        for timestamped_link in lineages:
-            deduplicated_lineage.add(timestamped_link)
-        return deduplicated_lineage.lineage
-
     def table_lineage(self, tables: List[dict]) -> List[dict]:
         """
         Wrapper function that retrieves all table lineage
@@ -364,94 +104,13 @@ class DatabricksClient(APIClientDeprecated):
             ".".join([table["schema_id"], table["table_name"]])
             for table in tables
         ]
-        results = executor.map(self.get_single_table_lineage, table_paths)
+        results = executor.map(
+            self.api_client.get_single_table_lineage, table_paths
+        )
         lineages = [link for links in results for link in links]
-        deduplicated = self._deduplicate_lineage(lineages)
+        deduplicated = deduplicate_lineage(lineages)
         return self.formatter.format_lineage(deduplicated)
 
-    @staticmethod
-    def _paths_for_column_lineage(
-        tables: List[dict], columns: List[dict], table_lineage: List[dict]
-    ) -> List[Tuple[str, str]]:
-        """
-        helper providing a list of candidate columns to look lineage for:
-        we only look for column lineage where there is table lineage
-        """
-        # mapping between table id and its path db.schema.table
-        # table["schema_id"] follows the pattern `db.schema`
-        mapping = {
-            table["id"]: ".".join([table["schema_id"], table["table_name"]])
-            for table in tables
-        }
-
-        tables_with_lineage: Set[str] = set()
-        for t in table_lineage:
-            tables_with_lineage.add(t["parent_path"])
-            tables_with_lineage.add(t["child_path"])
-
-        paths_to_return: List[Tuple[str, str]] = []
-        for column in columns:
-            table_path = mapping[column["table_id"]]
-            if table_path not in tables_with_lineage:
-                continue
-            column_ = (table_path, column["column_name"])
-            paths_to_return.append(column_)
-
-        return paths_to_return
-
-    def _single_column_lineage_links(
-        self, column_path: str, single_column_lineage: dict
-    ) -> List[TimestampedLink]:
-        """
-        process databricks lineage API response for a given table
-        returns a list of (parent, child, timestamp)
-
-        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-        we could also have `notebookInfos` or `fileInfo`
-        """
-        links: List[OTimestampedLink] = []
-        # add parent:
-        for link in single_column_lineage.get("upstream_cols", []):
-            parent_path = self._to_column_path(link)
-            timestamp: Ostr = link.get("lineage_timestamp")
-            links.append(self._link(parent_path, column_path, timestamp))
-
-        # add children:
-        for link in single_column_lineage.get("downstream_cols", []):
-            child_path = self._to_column_path(link)
-            timestamp = link.get("lineage_timestamp")
-            links.append(self._link(column_path, child_path, timestamp))
-
-        return list(filter(None, links))
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_column_lineage(
-        self,
-        names: Tuple[str, str],
-    ) -> List[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 10 requests per SECOND
-        """
-        table_path, column_name = names
-        api_path = "api/2.0/lineage-tracking/column-lineage"
-        payload = {
-            "table_name": table_path,
-            "column_name": column_name,
-            "include_entity_lineage": True,
-        }
-        content = self.get(path=api_path, payload=payload)
-        column_path = f"{table_path}.{column_name}"
-        return self._single_column_lineage_links(column_path, content)
-
     def column_lineage(
         self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
     ) -> List[dict]:
@@ -459,133 +118,22 @@ class DatabricksClient(APIClientDeprecated):
         Wrapper function that retrieves all column lineage
        we only try to retrieve column lineage if we found table lineage
         """
-        candidate_paths = self._paths_for_column_lineage(
+        candidate_paths = paths_for_column_lineage(
            tables, columns, table_lineage
        )
        lineages: List[TimestampedLink] = [
            link
            for paths in candidate_paths
-            for link in self.get_single_column_lineage(paths)
+            for link in self.api_client.get_single_column_lineage(paths)
        ]
-        deduplicated = self._deduplicate_lineage(lineages)
+        deduplicated = deduplicate_lineage(lineages)
        return self.formatter.format_lineage(deduplicated)
 
-    @staticmethod
-    def _time_filter_payload(start_time_ms: int, end_time_ms: int) -> dict:
-        return {
-            "filter_by": {
-                "query_start_time_range": {
-                    "end_time_ms": end_time_ms,
-                    "start_time_ms": start_time_ms,
-                }
-            }
-        }
-
-    def _hourly_time_filters(
-        self, time_filter: Optional[TimeFilter]
-    ) -> Iterable[dict]:
-        """time filters to retrieve Databricks' queries: 1h duration each"""
-        # define an explicit time window
-        if not time_filter:
-            time_filter = TimeFilter.default()
-
-        assert time_filter # for mypy
-
-        hour_min = time_filter.hour_min
-        hour_max = time_filter.hour_max
-        day = time_filter.day
-        if hour_min is None or hour_max is None: # fallback to an entire day
-            hour_min, hour_max = _DEFAULT_HOUR_MIN, _DEFAULT_HOUR_MAX
-
-        for index in range(hour_min, min(hour_max + 1, _NUM_HOURS_IN_A_DAY)):
-            start_time_ms = _day_hour_to_epoch_ms(day, index)
-            end_time_ms = _day_hour_to_epoch_ms(day, index + 1)
-            yield self._time_filter_payload(start_time_ms, end_time_ms)
-
-    def query_payload(
-        self,
-        page_token: Optional[str] = None,
-        max_results: Optional[int] = None,
-        time_range_filter: Optional[dict] = None,
-    ) -> dict:
-        """helper method to build the payload used to retrieve queries"""
-        # in payload: You can provide only one of 'page_token' or 'filter_by'
-        if page_token:
-            payload: Dict[str, Any] = {"page_token": page_token}
-        else:
-            if not time_range_filter:
-                # should never happen.
-                # `time_range_filter` optional to leverage functiontools.partial
-                raise ValueError("Time range not specified")
-            payload = {**time_range_filter}
-        if max_results:
-            payload["max_results"] = max_results
-        return payload
-
-    def _scroll_queries(
-        self,
-        page_token: Optional[str] = None,
-        max_results: Optional[int] = None,
-        time_range_filter: Optional[dict] = None,
-    ) -> dict:
-        """
-        Callback to scroll the queries api
-        https://docs.databricks.com/api/workspace/queryhistory/list
-        max_results: Limit the number of results returned in one page.
-        The default is 100. (both on our side and Databricks')
-        """
-        path = "api/2.0/sql/history/queries"
-        payload = self.query_payload(page_token, max_results, time_range_filter)
-        content = self.get(path=path, payload=payload)
-        return content if content else {}
-
-    @safe_mode(safe_query_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def _queries(self, filter_: dict) -> List[dict]:
-        """helper to retrieve queries using a given time filter"""
-        _time_filtered_scroll_queries = partial(
-            self._scroll_queries,
-            time_range_filter=filter_,
-        )
-        # retrieve all queries using pagination
-        return PagerOnToken(_time_filtered_scroll_queries).all()
-
     def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
-        """get all queries, hour per hour"""
-        time_range_filters = self._hourly_time_filters(time_filter)
-
-        raw_queries = []
-        for _filter in time_range_filters:
-            hourly = self._queries(_filter)
-            raw_queries.extend(hourly)
-        return self.formatter.format_query(raw_queries)
+        return self.api_client.queries(time_filter)
 
     def users(self) -> List[dict]:
-        """
-        retrieve user from api
-        """
-        path = "api/2.0/preview/scim/v2/Users"
-        content = self.get(path=path)
-        return self.formatter.format_user(content.get("Resources", []))
-
-    def _view_ddl(self, schema: dict) -> List[dict]:
-        path = "api/2.1/unity-catalog/tables"
-        payload = {
-            "catalog_name": schema["database_id"],
-            "schema_name": schema["schema_name"],
-            "omit_columns": True,
-        }
-        content = self.get(path=path, payload=payload)
-        return self.formatter.format_view_ddl(content.get("tables", []), schema)
+        return self.api_client.users()
 
     def view_ddl(self, schemas: List[dict]) -> List[dict]:
-        """retrieve view ddl"""
-        view_ddl: List[dict] = []
-        for schema in schemas:
-            v_to_add = self._view_ddl(schema)
-            view_ddl.extend(v_to_add)
-        return view_ddl
+        return self.api_client.view_ddl(schemas)