castor-extractor 0.19.0__py3-none-any.whl → 0.19.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

Files changed (83)
  1. CHANGELOG.md +29 -2
  2. castor_extractor/file_checker/templates/generic_warehouse.py +1 -1
  3. castor_extractor/knowledge/notion/client/client.py +44 -80
  4. castor_extractor/knowledge/notion/client/client_test.py +9 -4
  5. castor_extractor/knowledge/notion/client/constants.py +1 -0
  6. castor_extractor/knowledge/notion/client/endpoints.py +1 -1
  7. castor_extractor/knowledge/notion/client/pagination.py +9 -5
  8. castor_extractor/quality/soda/assets.py +1 -1
  9. castor_extractor/quality/soda/client/client.py +30 -83
  10. castor_extractor/quality/soda/client/credentials.py +0 -11
  11. castor_extractor/quality/soda/client/endpoints.py +3 -6
  12. castor_extractor/quality/soda/client/pagination.py +25 -0
  13. castor_extractor/utils/__init__.py +13 -2
  14. castor_extractor/utils/client/__init__.py +14 -0
  15. castor_extractor/utils/client/api/__init__.py +5 -0
  16. castor_extractor/utils/client/api/auth.py +76 -0
  17. castor_extractor/utils/client/api/auth_test.py +49 -0
  18. castor_extractor/utils/client/api/client.py +153 -0
  19. castor_extractor/utils/client/api/client_test.py +47 -0
  20. castor_extractor/utils/client/api/pagination.py +83 -0
  21. castor_extractor/utils/client/api/pagination_test.py +51 -0
  22. castor_extractor/utils/{safe_request_test.py → client/api/safe_request_test.py} +4 -1
  23. castor_extractor/utils/client/api/utils.py +9 -0
  24. castor_extractor/utils/client/api/utils_test.py +16 -0
  25. castor_extractor/utils/collection.py +34 -2
  26. castor_extractor/utils/collection_test.py +17 -3
  27. castor_extractor/utils/pager/__init__.py +0 -1
  28. castor_extractor/utils/retry.py +44 -0
  29. castor_extractor/utils/retry_test.py +26 -1
  30. castor_extractor/utils/salesforce/client.py +44 -49
  31. castor_extractor/utils/salesforce/client_test.py +2 -2
  32. castor_extractor/utils/salesforce/pagination.py +33 -0
  33. castor_extractor/visualization/domo/client/client.py +10 -5
  34. castor_extractor/visualization/domo/client/credentials.py +1 -1
  35. castor_extractor/visualization/domo/client/endpoints.py +19 -7
  36. castor_extractor/visualization/looker/api/credentials.py +1 -1
  37. castor_extractor/visualization/metabase/client/api/client.py +26 -11
  38. castor_extractor/visualization/metabase/client/api/credentials.py +1 -1
  39. castor_extractor/visualization/metabase/client/db/credentials.py +1 -1
  40. castor_extractor/visualization/mode/client/credentials.py +1 -1
  41. castor_extractor/visualization/qlik/client/engine/credentials.py +1 -1
  42. castor_extractor/visualization/salesforce_reporting/client/rest.py +4 -3
  43. castor_extractor/visualization/sigma/client/client.py +106 -111
  44. castor_extractor/visualization/sigma/client/credentials.py +11 -1
  45. castor_extractor/visualization/sigma/client/endpoints.py +1 -1
  46. castor_extractor/visualization/sigma/client/pagination.py +22 -18
  47. castor_extractor/visualization/tableau/tests/unit/rest_api/auth_test.py +0 -1
  48. castor_extractor/visualization/tableau/tests/unit/rest_api/credentials_test.py +0 -3
  49. castor_extractor/visualization/tableau_revamp/assets.py +11 -0
  50. castor_extractor/visualization/tableau_revamp/client/client.py +71 -151
  51. castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +95 -0
  52. castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +128 -0
  53. castor_extractor/visualization/tableau_revamp/client/client_tsc.py +66 -0
  54. castor_extractor/visualization/tableau_revamp/client/{tsc_fields.py → rest_fields.py} +15 -2
  55. castor_extractor/visualization/tableau_revamp/constants.py +0 -2
  56. castor_extractor/visualization/tableau_revamp/extract.py +5 -11
  57. castor_extractor/warehouse/databricks/api_client.py +239 -0
  58. castor_extractor/warehouse/databricks/api_client_test.py +15 -0
  59. castor_extractor/warehouse/databricks/client.py +37 -490
  60. castor_extractor/warehouse/databricks/client_test.py +1 -99
  61. castor_extractor/warehouse/databricks/endpoints.py +28 -0
  62. castor_extractor/warehouse/databricks/lineage.py +141 -0
  63. castor_extractor/warehouse/databricks/lineage_test.py +34 -0
  64. castor_extractor/warehouse/databricks/pagination.py +22 -0
  65. castor_extractor/warehouse/databricks/sql_client.py +90 -0
  66. castor_extractor/warehouse/databricks/utils.py +44 -1
  67. castor_extractor/warehouse/databricks/utils_test.py +58 -1
  68. castor_extractor/warehouse/mysql/client.py +0 -2
  69. castor_extractor/warehouse/salesforce/client.py +12 -59
  70. castor_extractor/warehouse/salesforce/pagination.py +34 -0
  71. castor_extractor/warehouse/sqlserver/client.py +0 -1
  72. castor_extractor-0.19.6.dist-info/METADATA +903 -0
  73. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/RECORD +77 -60
  74. castor_extractor/utils/client/api.py +0 -87
  75. castor_extractor/utils/client/api_test.py +0 -24
  76. castor_extractor/utils/pager/pager_on_token.py +0 -52
  77. castor_extractor/utils/pager/pager_on_token_test.py +0 -73
  78. castor_extractor/visualization/sigma/client/client_test.py +0 -54
  79. castor_extractor-0.19.0.dist-info/METADATA +0 -207
  80. /castor_extractor/utils/{safe_request.py → client/api/safe_request.py} +0 -0
  81. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/LICENCE +0 -0
  82. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/WHEEL +0 -0
  83. {castor_extractor-0.19.0.dist-info → castor_extractor-0.19.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,66 @@
1
+ from typing import Any, Dict, Iterable, Iterator
2
+
3
+ import tableauserverclient as TSC # type: ignore
4
+
5
+ from ....utils import JsonType, SerializedAsset
6
+ from ..assets import TableauRevampAsset
7
+ from .rest_fields import REST_FIELDS
8
+
9
+
10
+ def _pick(element: Any, key: str) -> JsonType:
11
+ if isinstance(element, dict):
12
+ return element[key]
13
+ else:
14
+ return getattr(element, key)
15
+
16
+
17
class TableauClientTSC:
    """
    Extract Tableau Assets using TableauServerClient (TSC)
    https://tableau.github.io/server-client-python/docs/api-ref
    """

    def __init__(
        self,
        server: TSC.Server,
    ):
        self._server = server

    def _pick_fields(
        self,
        data: Iterable,
        asset: TableauRevampAsset,
    ) -> Iterator[Dict]:
        """
        Yield one dict per row of `data`, restricted to the fields listed
        in REST_FIELDS for `asset`.

        For USER rows, a `group_ids` entry is added with the ids of the
        groups found on the row after `populate_groups` was called on it.
        """
        keys = REST_FIELDS[asset]
        # the asset does not change while iterating: evaluate the check once
        with_groups = asset == TableauRevampAsset.USER

        for row in data:
            fields = {key: _pick(row, key) for key in keys}
            if with_groups:
                # row.groups is read right after this call, which is made
                # once per user row
                self._server.users.populate_groups(row)
                fields["group_ids"] = [group.id for group in row.groups]

            yield fields

    def fetch(
        self,
        asset: TableauRevampAsset,
    ) -> SerializedAsset:
        """
        Fetch every row of `asset` through a TSC.Pager and return the
        picked fields as a list.

        Raises AssertionError when `asset` cannot be fetched from TSC.
        """
        # asset -> (attribute of the TSC server to page over, Pager options)
        endpoints = {
            TableauRevampAsset.DATASOURCE: ("datasources", {}),
            TableauRevampAsset.PROJECT: ("projects", {}),
            TableauRevampAsset.USAGE: ("views", {"usage": True}),
            TableauRevampAsset.USER: ("users", {}),
            TableauRevampAsset.WORKBOOK: ("workbooks", {}),
        }
        if asset not in endpoints:
            raise AssertionError(f"Fetching from TSC not supported for {asset}")

        attribute, options = endpoints[asset]
        data = TSC.Pager(getattr(self._server, attribute), **options)
        return list(self._pick_fields(data, asset))
@@ -2,19 +2,32 @@ from typing import Dict, Set
2
2
 
3
3
  from ..assets import TableauRevampAsset
4
4
 
5
- # list of fields to pick in TSC response
6
- TSC_FIELDS: Dict[TableauRevampAsset, Set[str]] = {
5
+ # list of fields to pick in REST API or TSC responses
6
+ REST_FIELDS: Dict[TableauRevampAsset, Set[str]] = {
7
7
  TableauRevampAsset.DATASOURCE: {
8
8
  "id",
9
9
  "project_id",
10
10
  "webpage_url",
11
11
  },
12
+ TableauRevampAsset.METRIC: {
13
+ "id",
14
+ "definition_id",
15
+ },
16
+ TableauRevampAsset.METRIC_DEFINITION: {
17
+ "metadata",
18
+ "specification",
19
+ },
12
20
  TableauRevampAsset.PROJECT: {
13
21
  "description",
14
22
  "id",
15
23
  "name",
16
24
  "parent_id",
17
25
  },
26
+ TableauRevampAsset.SUBSCRIPTION: {
27
+ "follower",
28
+ "id",
29
+ "metric_id",
30
+ },
18
31
  TableauRevampAsset.USAGE: {
19
32
  "name",
20
33
  "total_views",
@@ -1,5 +1,3 @@
1
- TABLEAU_SERVER_VERSION = "3.5"
2
-
3
1
  DEFAULT_PAGE_SIZE = 100
4
2
 
5
3
  DEFAULT_TIMEOUT_SECONDS = 100
@@ -7,7 +7,6 @@ from ...utils import (
7
7
  deep_serialize,
8
8
  from_env,
9
9
  get_output_filename,
10
- write_errors_logs,
11
10
  write_json,
12
11
  write_summary,
13
12
  )
@@ -22,11 +21,9 @@ def iterate_all_data(
22
21
  ) -> Iterable[Tuple[TableauRevampAsset, list]]:
23
22
  """Iterate over the extracted Data from Tableau"""
24
23
 
25
- logger.info("Extracting USER from Tableau API")
26
- yield (
27
- TableauRevampAsset.USER,
28
- deep_serialize(client.fetch(TableauRevampAsset.USER)),
29
- )
24
+ for asset in TableauRevampAsset:
25
+ data = client.fetch(asset)
26
+ yield asset, deep_serialize(data)
30
27
 
31
28
 
32
29
  def extract_all(client: TableauRevampClient, **kwargs: str) -> None:
@@ -46,9 +43,6 @@ def extract_all(client: TableauRevampClient, **kwargs: str) -> None:
46
43
  write_summary(
47
44
  output_directory,
48
45
  timestamp,
49
- base_url=client.base_url(),
50
- client_name=client.name(),
46
+ base_url=client.base_url,
47
+ client_name=client.name,
51
48
  )
52
-
53
- if client.errors:
54
- write_errors_logs(output_directory, timestamp, client.errors)
@@ -0,0 +1,239 @@
1
+ import logging
2
+ from functools import partial
3
+ from typing import Iterator, List, Optional, Set, Tuple
4
+
5
+ import requests
6
+
7
+ from ...utils import (
8
+ APIClient,
9
+ BearerAuth,
10
+ SafeMode,
11
+ build_url,
12
+ fetch_all_pages,
13
+ handle_response,
14
+ retry,
15
+ safe_mode,
16
+ )
17
+ from ..abstract import TimeFilter
18
+ from .credentials import DatabricksCredentials
19
+ from .endpoints import DatabricksEndpointFactory
20
+ from .format import DatabricksFormatter, TagMapping
21
+ from .lineage import single_column_lineage_links, single_table_lineage_links
22
+ from .pagination import DATABRICKS_PAGE_SIZE, DatabricksPagination
23
+ from .types import TablesColumns, TimestampedLink
24
+ from .utils import hourly_time_filters
25
+
26
logger = logging.getLogger(__name__)

# timeout passed to the APIClient constructor by DatabricksAPIClient
_DATABRICKS_CLIENT_TIMEOUT_S = 90

# limits handed to SafeMode for lineage calls and for query calls
_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
_MAX_NUMBER_OF_QUERY_ERRORS = 1000

# settings of the @retry decorator applied to lineage and query calls
_RETRY_ATTEMPTS = 3
_RETRY_BASE_MS = 1000
_RETRY_EXCEPTIONS = [
    requests.exceptions.ConnectTimeout,
]

# response header read in tables_columns_of_schema to get the workspace id
_WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"

# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit;
# confirm that catching those in safe mode is intended.
safe_lineage_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
safe_query_params = SafeMode((BaseException,), _MAX_NUMBER_OF_QUERY_ERRORS)
40
+
41
+
42
class DatabricksAuth(BearerAuth):
    """Bearer authentication that serves the token held in the credentials."""

    def __init__(self, credentials: DatabricksCredentials):
        # NOTE(review): BearerAuth.__init__ is not called here; confirm the
        # base class needs no initialisation of its own.
        self.token = credentials.token

    def fetch_token(self) -> Optional[str]:
        """Return the token stored at construction time."""
        return self.token
48
+
49
+
50
class DatabricksAPIClient(APIClient):
    """Databricks Client"""

    def __init__(
        self,
        credentials: DatabricksCredentials,
        db_allowed: Optional[Set[str]] = None,
        db_blocked: Optional[Set[str]] = None,
    ):
        auth = DatabricksAuth(credentials)
        super().__init__(
            host=credentials.host,
            auth=auth,
            timeout=_DATABRICKS_CLIENT_TIMEOUT_S,
        )
        self._http_path = credentials.http_path
        self._db_allowed = db_allowed
        self._db_blocked = db_blocked

        self.formatter = DatabricksFormatter()

    def _keep_catalog(self, catalog: str) -> bool:
        """
        Helper function to determine if we should keep the Databricks catalog
        which is a CastorDoc database

        An empty or missing allow-list keeps everything; an empty or missing
        block-list rejects nothing.
        """
        if self._db_allowed and catalog not in self._db_allowed:
            return False
        if self._db_blocked and catalog in self._db_blocked:
            return False
        return True

    def databases(self) -> List[dict]:
        """Return the formatted catalogs that pass the allow/block filters."""
        content = self._get(DatabricksEndpointFactory.databases())
        _databases = self.formatter.format_database(content.get("catalogs", []))
        return [d for d in _databases if self._keep_catalog(d["database_name"])]

    def _schemas_of_database(self, database: dict) -> List[dict]:
        """Return the formatted schemas of a single database (catalog)."""
        payload = {"catalog_name": database["database_name"]}
        content = self._get(DatabricksEndpointFactory.schemas(), params=payload)
        schemas = content.get("schemas", [])
        return self.formatter.format_schema(schemas, database)

    def schemas(self, databases: List[dict]) -> List[dict]:
        """
        Get the databricks schemas (also sometimes called databases)
        (which correspond to the schemas in Castor)
        leveraging the unity catalog API
        """
        return [
            schema
            for database in databases
            for schema in self._schemas_of_database(database)
        ]

    def tables_columns_of_schema(
        self,
        schema: dict,
        table_tags: TagMapping,
        column_tags: TagMapping,
    ) -> TablesColumns:
        """
        Fetch the tables of `schema` and return them, with their columns,
        as formatted by the formatter.

        The workspace id is taken from the headers of the response, hence
        the use of `_call` (raw response) instead of `_get`.
        """
        payload = {
            "catalog_name": schema["database_id"],
            "schema_name": schema["schema_name"],
        }
        response = self._call(
            method="GET",
            endpoint=DatabricksEndpointFactory.tables(),
            params=payload,
        )
        # Handle the response before touching the headers, so that a failed
        # call is not hidden behind a KeyError on a missing header.
        # (presumably handle_response raises on error statuses - TODO confirm)
        content = handle_response(response)
        workspace_id = response.headers[_WORKSPACE_ID_HEADER]
        host = build_url(self._host, endpoint="")
        return self.formatter.format_table_column(
            raw_tables=content.get("tables", []),
            schema=schema,
            host=host,
            workspace_id=workspace_id,
            table_tags=table_tags,
            column_tags=column_tags,
        )

    @safe_mode(safe_lineage_params, lambda: [])
    @retry(
        exceptions=_RETRY_EXCEPTIONS,
        max_retries=_RETRY_ATTEMPTS,
        base_ms=_RETRY_BASE_MS,
    )
    def get_single_column_lineage(
        self,
        names: Tuple[str, str],
    ) -> List[TimestampedLink]:
        """
        Helper function used in get_lineage_links.
        Call data lineage API and return the content of the result

        `names` is a (table_path, column_name) pair.
        eg table_path: broward_prd.bronze.account_adjustments
        FYI: Maximum rate of 10 requests per SECOND
        """
        table_path, column_name = names
        payload = {
            "table_name": table_path,
            "column_name": column_name,
            "include_entity_lineage": True,
        }
        content = self._get(
            DatabricksEndpointFactory.column_lineage(), params=payload
        )
        column_path = f"{table_path}.{column_name}"
        return single_column_lineage_links(column_path, content)

    @safe_mode(safe_lineage_params, lambda: [])
    @retry(
        exceptions=_RETRY_EXCEPTIONS,
        max_retries=_RETRY_ATTEMPTS,
        base_ms=_RETRY_BASE_MS,
    )
    def get_single_table_lineage(
        self, table_path: str
    ) -> List[TimestampedLink]:
        """
        Helper function used in get_lineage_links.
        Call data lineage API and return the content of the result
        eg table_path: broward_prd.bronze.account_adjustments
        FYI: Maximum rate of 50 requests per SECOND
        """
        payload = {"table_name": table_path, "include_entity_lineage": True}
        content = self._get(
            DatabricksEndpointFactory.table_lineage(), params=payload
        )
        return single_table_lineage_links(table_path, content)

    @safe_mode(safe_query_params, lambda: [])
    @retry(
        exceptions=_RETRY_EXCEPTIONS,
        max_retries=_RETRY_ATTEMPTS,
        base_ms=_RETRY_BASE_MS,
    )
    def _queries(
        self,
        filter_: dict,
    ) -> Iterator[dict]:
        """
        Callback to scroll the queries api
        https://docs.databricks.com/api/workspace/queryhistory/list
        max_results: Limit the number of results returned in one page.
            The default is 100. (both on our side and Databricks')
        """
        payload = {**filter_, "max_results": DATABRICKS_PAGE_SIZE}
        request = partial(
            self._get,
            endpoint=DatabricksEndpointFactory.queries(),
            data=payload,
        )
        queries = fetch_all_pages(request, DatabricksPagination)
        return queries

    def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
        """get all queries, hour per hour"""
        time_range_filters = hourly_time_filters(time_filter)
        raw_queries = []
        for _filter in time_range_filters:
            logger.info(f"Fetching queries for time filter {_filter}")
            hourly = self._queries(_filter)
            raw_queries.extend(hourly)
        return self.formatter.format_query(raw_queries)

    def users(self) -> List[dict]:
        """
        retrieve user from api
        """
        content = self._get(DatabricksEndpointFactory.users())
        return self.formatter.format_user(content.get("Resources", []))

    def _view_ddl_per_schema(self, schema: dict) -> List[dict]:
        """Return the formatted view DDL of a single schema."""
        payload = {
            "catalog_name": schema["database_id"],
            "schema_name": schema["schema_name"],
            "omit_columns": True,
        }
        content = self._get(DatabricksEndpointFactory.tables(), params=payload)
        return self.formatter.format_view_ddl(content.get("tables", []), schema)

    def view_ddl(self, schemas: List[dict]) -> List[dict]:
        """retrieve view ddl"""
        return [
            view
            for schema in schemas
            for view in self._view_ddl_per_schema(schema)
        ]
@@ -0,0 +1,15 @@
1
+ from .api_client import DatabricksAPIClient
2
+
3
+
4
class MockDatabricksClient(DatabricksAPIClient):
    """Test double: only sets the two filters read by `_keep_catalog`."""

    def __init__(self):
        # The parent __init__ is deliberately not called: no credentials are
        # available in the test.
        # Sets, to match the Optional[Set[str]] filters of the real client.
        self._db_allowed = {"prd", "staging"}
        self._db_blocked = {"dev"}
8
+
9
+
10
def test_DatabricksAPIClient__keep_catalog():
    """Allowed catalogs are kept; blocked or unlisted catalogs are dropped."""
    client = MockDatabricksClient()

    # present in the allow-list and absent from the block-list
    assert client._keep_catalog("prd")
    assert client._keep_catalog("staging")

    # "dev" is absent from the allow-list (and also blocked)
    assert not client._keep_catalog("dev")
    # absent from the allow-list
    assert not client._keep_catalog("something_unknown")