castor-extractor 0.22.1__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic.
- CHANGELOG.md +21 -0
- castor_extractor/file_checker/file.py +1 -1
- castor_extractor/visualization/powerbi/assets.py +2 -12
- castor_extractor/visualization/powerbi/extract.py +2 -2
- castor_extractor/visualization/sigma/client/client.py +64 -10
- castor_extractor/visualization/thoughtspot/assets.py +3 -1
- castor_extractor/visualization/thoughtspot/client/client.py +67 -14
- castor_extractor/visualization/thoughtspot/client/utils.py +10 -4
- castor_extractor/visualization/thoughtspot/client/utils_test.py +22 -4
- castor_extractor/warehouse/abstract/extract.py +1 -1
- castor_extractor/warehouse/bigquery/client.py +3 -3
- castor_extractor/warehouse/databricks/api_client.py +2 -60
- castor_extractor/warehouse/databricks/client.py +4 -47
- castor_extractor/warehouse/databricks/client_test.py +1 -35
- castor_extractor/warehouse/databricks/credentials.py +4 -6
- castor_extractor/warehouse/databricks/enums.py +15 -0
- castor_extractor/warehouse/databricks/extract.py +13 -11
- castor_extractor/warehouse/databricks/lineage.py +47 -119
- castor_extractor/warehouse/databricks/lineage_test.py +86 -31
- castor_extractor/warehouse/databricks/sql_client.py +23 -8
- castor_extractor/warehouse/databricks/types.py +0 -7
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/METADATA +24 -3
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/RECORD +26 -26
- castor_extractor/warehouse/databricks/test_constants.py +0 -79
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/LICENCE +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/WHEEL +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

@@ -1,6 +1,27 @@
 
 # Changelog
 
+## 0.22.6 - 2025-01-21
+
+* bump dependencies: looker, databricks, deptry, ...
+
+
+## 0.22.5 - 2025-01-09
+
+* Databricks: validate and deduplicate lineage links
+
+## 0.22.4 - 2025-01-08
+
+* ThoughtSpot: extract answers
+
+## 0.22.3 - 2024-12-10
+
+* Databricks: extract lineage from system tables
+
+## 0.22.2 - 2024-12-06
+
+* Sigma: multithreading to retrieve lineage
+
 ## 0.22.1 - 2024-12-05
 
 * Salesforce: deduplicate tables

castor_extractor/file_checker/file.py
CHANGED

@@ -123,7 +123,7 @@ class FileCheckerRun:
         """
         if not self.verbose:
             return
-        header = f"Issues detected on Row #{index+1}\n"
+        header = f"Issues detected on Row #{index + 1}\n"
         for k, v in row.items():
             header += f"{str(k):<20} {str(v):<100}\n"
         self.logger.info(header + _SEPARATOR + issue_log + _SEPARATOR)

castor_extractor/visualization/powerbi/assets.py
CHANGED

@@ -8,6 +8,7 @@ class PowerBiAsset(ExternalAsset):
     DASHBOARDS = "dashboards"
     DATASETS = "datasets"
     DATASET_FIELDS = "dataset_fields"
+    DATASET_RELATIONSHIPS = "dataset_relationships"
     METADATA = "metadata"
     PAGES = "pages"
     REPORTS = "reports"

@@ -19,20 +20,9 @@ class PowerBiAsset(ExternalAsset):
     def optional(cls) -> set["PowerBiAsset"]:
         return {
             PowerBiAsset.DATASET_FIELDS,
+            PowerBiAsset.DATASET_RELATIONSHIPS,
             PowerBiAsset.PAGES,
             PowerBiAsset.TABLES,
             PowerBiAsset.TILES,
             PowerBiAsset.USERS,
         }
-
-
-# Assets extracted from the Metadata file
-# They are not directly fetched from the PowerBi api.
-METADATA_ASSETS = (
-    PowerBiAsset.DATASET_FIELDS,
-    PowerBiAsset.TABLES,
-    PowerBiAsset.TILES,
-    PowerBiAsset.USERS,
-)
-
-REPORTS_ASSETS = (PowerBiAsset.PAGES,)

castor_extractor/visualization/powerbi/extract.py
CHANGED

@@ -11,7 +11,7 @@ from ...utils import (
     write_json,
     write_summary,
 )
-from .assets import
+from .assets import PowerBiAsset
 from .client import PowerbiClient, PowerbiCredentials
 
 logger = logging.getLogger(__name__)

@@ -21,7 +21,7 @@ def iterate_all_data(
     client: PowerbiClient,
 ) -> Iterable[tuple[PowerBiAsset, Union[list, dict]]]:
     for asset in PowerBiAsset:
-        if asset in
+        if asset in PowerBiAsset.optional:
             continue
 
         logger.info(f"Extracting {asset.name} from API")

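Note: the extract loop above iterates over the full `PowerBiAsset` enum and skips the members returned by `optional()`, which now include `DATASET_RELATIONSHIPS`. A minimal sketch of that enum-with-optional-members pattern, using illustrative names rather than the package's real classes:

```python
from enum import Enum


class Asset(Enum):
    """Illustrative stand-in for an asset enum such as PowerBiAsset."""

    DASHBOARDS = "dashboards"
    DATASET_RELATIONSHIPS = "dataset_relationships"
    REPORTS = "reports"

    @classmethod
    def optional(cls) -> set["Asset"]:
        # members that a default extraction run leaves out
        return {cls.DATASET_RELATIONSHIPS}


def iterate_all_data():
    """Yield (asset, payload) pairs, skipping optional assets."""
    for asset in Asset:
        if asset in Asset.optional():
            continue
        yield asset, f"fetched {asset.value} from the API"


for asset, payload in iterate_all_data():
    print(asset.name, "->", payload)  # DASHBOARDS and REPORTS only
```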
castor_extractor/visualization/sigma/client/client.py
CHANGED

@@ -1,9 +1,11 @@
 from collections.abc import Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from http import HTTPStatus
 from typing import Callable, Optional
 
 import requests
+from pydantic import BaseModel
 
 from ....utils import (
     APIClient,

@@ -12,6 +14,7 @@ from ....utils import (
     build_url,
     fetch_all_pages,
     handle_response,
+    retry,
 )
 from ..assets import SigmaAsset
 from .credentials import SigmaCredentials

@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
 )
 
 _AUTH_TIMEOUT_S = 60
-
+_SIGMA_TIMEOUT_S = 300
 
 _SIGMA_HEADERS = {
     "Content-Type": _CONTENT_TYPE,

@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
     max_errors=_VOLUME_IGNORED,
     status_codes=_IGNORED_ERROR_CODES,
 )
+_THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
+_RETRY_NUMBER = 1
+_RETRY_BASE_MS = 60_000
+
+
+class LineageContext(BaseModel):
+    """all info needed to build the endpoint for lineage retrieval"""
+
+    workbook_id: str
+    element_id: str
+
+
+class Lineage(BaseModel):
+    """holds response from lineage API and context used to retrieve it"""
+
+    lineage: dict
+    context: LineageContext
 
 
 class SigmaBearerAuth(BearerAuth):

@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
             host=credentials.host,
             auth=auth,
             headers=_SIGMA_HEADERS,
-            timeout=
+            timeout=_SIGMA_TIMEOUT_S,
             safe_mode=safe_mode or SIGMA_SAFE_MODE,
         )
 

@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
             page=page, workbook_id=workbook_id
         )
 
-
+    @retry(
+        (ConnectionError,),
+        max_retries=_RETRY_NUMBER,
+        base_ms=_RETRY_BASE_MS,
+        log_exc_info=True,
+    )
+    def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+        """
+        return the lineage from API and other ids needed to characterize
+        lineage in castor
+        """
+        workbook_id = lineage_context.workbook_id
+        element_id = lineage_context.element_id
+        endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+        return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+    @staticmethod
+    def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+        """
+        Helper function to prepare context for lineage retrieval.
+        Elements without associated columns are skipped.
+        """
+        contexts: list[LineageContext] = []
         for element in elements:
-
-
-
-
+            if element.get("columns") is None:
+                continue
+
+            context = LineageContext(
+                workbook_id=element["workbook_id"],
+                element_id=element["elementId"],
             )
+            contexts.append(context)
+        return contexts
+
+    def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+        lineage_context = self._lineage_context(elements)
+
+        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+            results = executor.map(self._get_lineage, lineage_context)
+
+        for lineage in results:
             yield {
-                **lineage,
-                "workbook_id": workbook_id,
-                "element_id": element_id,
+                **lineage.lineage,
+                "workbook_id": lineage.context.workbook_id,
+                "element_id": lineage.context.element_id,
             }
 
     def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:

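Note: the change above fans the per-element lineage calls out over a small thread pool (`_THREADS_LINEAGE = 10` workers) and retries a connection failure once after a long backoff. A self-contained sketch of that pattern, assuming a plain retry loop and a stubbed API call in place of the package's internal `retry` decorator and `_get` helper:

```python
import time
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 10    # mirrors _THREADS_LINEAGE: more workers hit the rate limit
MAX_RETRIES = 1     # mirrors _RETRY_NUMBER
RETRY_BASE_S = 60   # mirrors _RETRY_BASE_MS (60_000 ms)


def fetch_lineage(context: dict) -> dict:
    """Stub for one lineage API call, keyed by workbook_id and element_id."""
    return {"edges": [], **context}


def fetch_with_retry(context: dict) -> dict:
    """Retry a connection failure once after a long backoff, then give up."""
    for attempt in range(MAX_RETRIES + 1):
        try:
            return fetch_lineage(context)
        except ConnectionError:
            if attempt == MAX_RETRIES:
                raise
            time.sleep(RETRY_BASE_S)


def fetch_all_lineages(contexts: list[dict]) -> list[dict]:
    # executor.map preserves input order and surfaces worker exceptions
    # when the results are consumed
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        return list(executor.map(fetch_with_retry, contexts))


contexts = [{"workbook_id": "wb-1", "element_id": f"el-{i}"} for i in range(3)]
print(fetch_all_lineages(contexts))
```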
castor_extractor/visualization/thoughtspot/assets.py
CHANGED

@@ -4,6 +4,8 @@ from ...types import ExternalAsset
 class ThoughtspotAsset(ExternalAsset):
     """Thoughtspot assets"""
 
+    ANSWERS = "answers"
+    ANSWER_USAGES = "answer_usages"
     LIVEBOARDS = "liveboards"
+    LIVEBOARD_USAGES = "liveboard_usages"
     LOGICAL_TABLES = "logical_tables"
-    USAGES = "usages"

castor_extractor/visualization/thoughtspot/client/client.py
CHANGED

@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
     "Content-Type": "application/json",
 }
 _METADATA_BATCH_SIZE = 100
-
+# https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+_OBJECT_USAGE_LIVEBOARD = "Object Usage"
+_ANSWER_USAGE_VIZ = "Answer Usage, by User"
+# https://docs.thoughtspot.com/cloud/latest/user-adoption
+_USER_ADOPTION_LIVEBOARD = "User Adoption"
+_LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
 # By default, no errors are ignored for the moment
 THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 

@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
     def _metadata_search(
         self,
         metadata_type: str,
+        identifier: Optional[str] = None,
     ) -> Iterator[dict]:
+        """
+        Yields assets of the given asset type, and optionally filters on a
+        specific identifier.
+        """
         offset = 0
+
         while True:
+            search_filters = {
+                "metadata": [{"type": metadata_type}],
+                "include_details": True,
+                "record_size": _METADATA_BATCH_SIZE,
+                "record_offset": offset,
+            }
+            if identifier:
+                search_filters["metadata"] = {
+                    "identifier": identifier,
+                    "type": metadata_type,
+                }
+
             metadata = self._post(
                 ThoughtspotEndpointFactory.metadata_search(),
-                data=
-                    "metadata": [{"type": metadata_type}],
-                    "include_details": True,
-                    "record_size": _METADATA_BATCH_SIZE,
-                    "record_offset": offset,
-                },
+                data=search_filters,
             )
             yield from metadata
             if len(metadata) < _METADATA_BATCH_SIZE:
                 break
             offset = offset + _METADATA_BATCH_SIZE
 
+    def _get_all_answers(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="ANSWER")
+
     def _get_all_liveboards(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LIVEBOARD")
 

@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
     def _get_all_tables(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
-    def
+    def _get_usages(
+        self,
+        liveboard_name: str,
+        visualization_name: str,
+    ) -> Iterator[dict]:
+        """
+        Yields the data of a given visualization in the given liveboard.
+        ThoughtSpot maintains two system liveboards with stats about data usage,
+        which are useful to compute view counts and popularity.
+        """
+        usage_liveboard = next(
+            self._metadata_search(
+                metadata_type="LIVEBOARD", identifier=liveboard_name
+            )
+        )
+        liveboard_id = usage_liveboard["metadata_id"]
+
         data = self._post(
             endpoint=ThoughtspotEndpointFactory.liveboard(),
             headers={"Accept": "application/octet-stream"},
             data={
-                "metadata_identifier":
+                "metadata_identifier": liveboard_id,
                 "file_format": "CSV",
-                "visualization_identifiers": [
-                    "Popular Liveboards Last 30 Days"
-                ],
+                "visualization_identifiers": [visualization_name],
             },
             handler=lambda x: x.text,
         )
         yield from usage_liveboard_reader(data)
 
-    def
+    def _get_answer_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+            visualization_name=_ANSWER_USAGE_VIZ,
+        )
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_USER_ADOPTION_LIVEBOARD,
+            visualization_name=_LIVEBOARD_USAGE_VIZ,
+        )
+
+    def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+        if asset == ThoughtspotAsset.ANSWERS:
+            yield from self._get_all_answers()
+
+        if asset == ThoughtspotAsset.ANSWER_USAGES:
+            yield from self._get_answer_usages()
+
         if asset == ThoughtspotAsset.LIVEBOARDS:
             yield from self._get_all_liveboards()
 
-        if asset == ThoughtspotAsset.
+        if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
             yield from self._get_liveboards_usages()
 
         if asset == ThoughtspotAsset.LOGICAL_TABLES:

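Note: `_metadata_search` above pages through ThoughtSpot's metadata search with `record_offset`/`record_size` and stops as soon as a batch comes back shorter than the batch size. A self-contained sketch of that offset-pagination loop, with the HTTP call replaced by a stub:

```python
from collections.abc import Iterator

BATCH_SIZE = 100  # mirrors _METADATA_BATCH_SIZE


def post_metadata_search(payload: dict) -> list[dict]:
    """Stub for the POST to the metadata-search endpoint (250 fake objects)."""
    objects = [
        {"metadata_id": i, "type": payload["metadata"][0]["type"]}
        for i in range(250)
    ]
    offset = payload["record_offset"]
    return objects[offset : offset + payload["record_size"]]


def metadata_search(metadata_type: str) -> Iterator[dict]:
    """Yield every object of one type, one offset-based batch at a time."""
    offset = 0
    while True:
        batch = post_metadata_search(
            {
                "metadata": [{"type": metadata_type}],
                "record_size": BATCH_SIZE,
                "record_offset": offset,
            }
        )
        yield from batch
        if len(batch) < BATCH_SIZE:  # a short page means there is nothing left
            break
        offset += BATCH_SIZE


print(sum(1 for _ in metadata_search("LIVEBOARD")))  # 250
```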
castor_extractor/visualization/thoughtspot/client/utils.py
CHANGED

@@ -1,13 +1,17 @@
 import csv
+import re
 from collections.abc import Iterator
 from io import StringIO
 
+_END_OF_GENERATED_TEXT = r'^""$'
+
 
 def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
     """
     Converts a CSV string into an iterator of dictionaries after
-    ignoring the
-
+    ignoring the generated text that preceeds the actual CSV header row.
+    The generated block ends with a row containing only two double quotes.
+    Here is an example:
 
     "Data extract produced by Castor on 09/19/2024 06:54"
     "Filters applied on data :"

@@ -15,11 +19,13 @@ def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
     "Pinboard NOT IN [mlm - availability pinboard,null]"
     "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
     "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+    ""
 
     """
     csv_file = StringIO(usage_liveboard_csv)
 
-
-
+    line = next(csv_file)
+    while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
+        line = next(csv_file)
 
     yield from csv.DictReader(csv_file)

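Note: the reader above skips ThoughtSpot's generated preamble, which ends with a row holding only two double quotes, and hands the remainder to `csv.DictReader`. A standalone sketch of the same approach, fed with the sample CSV from the tests below (the function and constant names here are illustrative):

```python
import csv
import re
from io import StringIO

END_OF_PREAMBLE = r'^""$'  # the preamble closes with a row of two double quotes

SAMPLE = '''"Data extract produced by Castor on 01/07/2025 16:07"
"Filters applied on data :"
"Timestamp >= 20241208 00:00:00 < 20250107 00:00:00"
""
"Answer name","User name","Number of unique users","Count of object interactions"
"toto","tata","1","666"'''


def usage_reader(raw_csv: str):
    """Skip the generated preamble, then parse the real header and data rows."""
    buffer = StringIO(raw_csv)
    line = next(buffer)
    while not re.match(END_OF_PREAMBLE, line.strip()):
        line = next(buffer)
    yield from csv.DictReader(buffer)


print(list(usage_reader(SAMPLE)))
# [{'Answer name': 'toto', 'User name': 'tata',
#   'Number of unique users': '1', 'Count of object interactions': '666'}]
```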
castor_extractor/visualization/thoughtspot/client/utils_test.py
CHANGED

@@ -2,7 +2,7 @@ from .utils import (
     usage_liveboard_reader,
 )
 
-
+VALID_CSV_1 = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "Filters applied on data :"
 "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
 "Pinboard NOT IN [mlm - availability pinboard,null]"

@@ -16,6 +16,13 @@ VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "September test","25","2"'''
 
 
+VALID_CSV_2 = '''"Data extract produced by Castor on 01/07/2025 16:07"
+"Filters applied on data :"
+"Timestamp >= 20241208 00:00:00 < 20250107 00:00:00"
+""
+"Answer name","User name","Number of unique users","Count of object interactions"
+"toto","tata","1","666"'''
+
 # Invalid CSV input (missing data rows)
 INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "Filters applied on data :"

@@ -27,7 +34,7 @@ INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 
 
 def test_usage_liveboard_reader():
-
+    expected_output_1 = [
         {
             "Pinboard": "Market Report",
             "Pinboard Views": "559",

@@ -49,9 +56,20 @@ def test_usage_liveboard_reader():
             "Unique Number of User": "2",
         },
     ]
+    expected_output_2 = [
+        {
+            "Answer name": "toto",
+            "User name": "tata",
+            "Number of unique users": "1",
+            "Count of object interactions": "666",
+        }
+    ]
+
+    result = list(usage_liveboard_reader(VALID_CSV_1))
+    assert result == expected_output_1
 
-    result = list(usage_liveboard_reader(
-    assert result ==
+    result = list(usage_liveboard_reader(VALID_CSV_2))
+    assert result == expected_output_2
 
     result = list(usage_liveboard_reader(INVALID_CSV))
     assert result == []  # Expect an empty result since there is no data

castor_extractor/warehouse/abstract/extract.py
CHANGED

@@ -60,7 +60,7 @@ class SQLExtractionProcessor:
         total = len(queries)
 
         for i, query in enumerate(queries):
-            logger.info(f"Extracting {asset.value}: query {i+1}/{total}")
+            logger.info(f"Extracting {asset.value}: query {i + 1}/{total}")
             # concatenate results of all queries
             data = chain(data, self._fetch(query))
 
castor_extractor/warehouse/bigquery/client.py
CHANGED

@@ -66,9 +66,9 @@ class BigQueryClient(SqlalchemyClient):
         return BIGQUERY_URI
 
     def _credentials(self) -> Credentials:
-        assert (
-
-        )
+        assert CREDENTIALS_INFO_KEY in self._options, (
+            "Missing BigQuery credentials in engine's options"
+        )
         credentials = self._options[CREDENTIALS_INFO_KEY]
         return Credentials.from_service_account_info(credentials)
 
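Note: `Credentials.from_service_account_info` is the standard google-auth constructor that builds credentials from an already-parsed service-account key. A hedged sketch of a helper like the one above, written outside the client class; the constant name comes from the diff, while its value and the function name are illustrative:

```python
import json

from google.oauth2.service_account import Credentials

CREDENTIALS_INFO_KEY = "credentials_info"  # placeholder value, for illustration


def load_bigquery_credentials(options: dict) -> Credentials:
    """Build BigQuery credentials from the service-account key stored in options."""
    assert CREDENTIALS_INFO_KEY in options, (
        "Missing BigQuery credentials in engine's options"
    )
    info = options[CREDENTIALS_INFO_KEY]
    if isinstance(info, str):  # tolerate a raw JSON string as well
        info = json.loads(info)
    return Credentials.from_service_account_info(info)
```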
castor_extractor/warehouse/databricks/api_client.py
CHANGED

@@ -1,8 +1,6 @@
 import logging
-from collections.abc import Iterator
 from functools import partial
-from
-from typing import Optional
+from typing import Iterator, Optional
 
 import requests
 

@@ -14,16 +12,14 @@ from ...utils import (
     fetch_all_pages,
     handle_response,
     retry,
-    retry_request,
     safe_mode,
 )
 from ..abstract import TimeFilter
 from .credentials import DatabricksCredentials
 from .endpoints import DatabricksEndpointFactory
 from .format import DatabricksFormatter, TagMapping
-from .lineage import single_column_lineage_links, single_table_lineage_links
 from .pagination import DATABRICKS_PAGE_SIZE, DatabricksPagination
-from .types import TablesColumns
+from .types import TablesColumns
 from .utils import hourly_time_filters
 
 logger = logging.getLogger(__name__)

@@ -132,60 +128,6 @@ class DatabricksAPIClient(APIClient):
             column_tags=column_tags,
         )
 
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    @retry_request(
-        status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
-        max_retries=_RETRY_ATTEMPTS,
-    )
-    def get_single_column_lineage(
-        self,
-        names: tuple[str, str],
-    ) -> list[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 10 requests per SECOND
-        """
-        table_path, column_name = names
-        payload = {
-            "table_name": table_path,
-            "column_name": column_name,
-            "include_entity_lineage": True,
-        }
-        content = self._get(
-            DatabricksEndpointFactory.column_lineage(), params=payload
-        )
-        column_path = f"{table_path}.{column_name}"
-        return single_column_lineage_links(column_path, content)
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_table_lineage(
-        self, table_path: str
-    ) -> list[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 50 requests per SECOND
-        """
-        payload = {"table_name": table_path, "include_entity_lineage": True}
-        content = self._get(
-            DatabricksEndpointFactory.table_lineage(), params=payload
-        )
-        return single_table_lineage_links(table_path, content)
-
     @safe_mode(safe_query_params, lambda: [])
     @retry(
         exceptions=_RETRY_EXCEPTIONS,

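Note: the per-table and per-column lineage REST calls above (rate limited to 50 and 10 requests per second respectively) are removed; per the CHANGELOG, lineage is now extracted from Databricks system tables (0.22.3) and then validated and deduplicated (0.22.5). A hedged sketch of what reading the documented `system.access.table_lineage` system table can look like with the databricks-sql-connector; the hostname, warehouse path, token and the query itself are placeholders, not the extractor's actual implementation (the new `sql_client.py` is not shown in this section):

```python
from databricks import sql  # databricks-sql-connector

TABLE_LINEAGE_QUERY = """
SELECT source_table_full_name, target_table_full_name, event_time
FROM system.access.table_lineage
WHERE event_time >= current_timestamp() - INTERVAL 1 DAY
"""

with sql.connect(
    server_hostname="example.cloud.databricks.com",  # placeholder workspace host
    http_path="/sql/1.0/warehouses/<warehouse-id>",  # placeholder SQL warehouse
    access_token="<personal-access-token>",          # placeholder credential
) as connection:
    with connection.cursor() as cursor:
        cursor.execute(TABLE_LINEAGE_QUERY)
        for source, target, event_time in cursor.fetchall():
            print(source, "->", target, "at", event_time)
```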
castor_extractor/warehouse/databricks/client.py
CHANGED

@@ -1,17 +1,14 @@
 import logging
-from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
-from ...utils import
-    mapping_from_rows,
-)
+from ...utils import mapping_from_rows
 from ..abstract import TimeFilter
 from .api_client import DatabricksAPIClient
 from .credentials import DatabricksCredentials
+from .enums import TagEntity
 from .format import DatabricksFormatter
-from .
-from .
-from .types import TablesColumns, TimestampedLink
+from .sql_client import DatabricksSQLClient
+from .types import TablesColumns
 
 logger = logging.getLogger(__name__)
 

@@ -95,46 +92,6 @@ class DatabricksClient:
         columns.extend(c_to_add)
         return tables, columns
 
-    def table_lineage(self, tables: list[dict]) -> list[dict]:
-        """
-        Wrapper function that retrieves all table lineage
-        """
-        # retrieve table lineage
-        with ThreadPoolExecutor(max_workers=_THREADS_TABLE_LINEAGE) as executor:
-            table_paths = [
-                ".".join([table["schema_id"], table["table_name"]])
-                for table in tables
-            ]
-            results = executor.map(
-                self.api_client.get_single_table_lineage, table_paths
-            )
-        lineages = [link for links in results for link in links]
-        deduplicated = deduplicate_lineage(lineages)
-        return self.formatter.format_lineage(deduplicated)
-
-    def column_lineage(
-        self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
-    ) -> list[dict]:
-        """
-        Wrapper function that retrieves all column lineage
-        we only try to retrieve column lineage if we found table lineage
-        """
-        candidate_paths = paths_for_column_lineage(
-            tables, columns, table_lineage
-        )
-        # retrieve column lineage
-        with ThreadPoolExecutor(
-            max_workers=_THREADS_COLUMN_LINEAGE
-        ) as executor:
-            results = executor.map(
-                self.api_client.get_single_column_lineage, candidate_paths
-            )
-        lineages: list[TimestampedLink] = [
-            link for links in results for link in links
-        ]
-        deduplicated = deduplicate_lineage(lineages)
-        return self.formatter.format_lineage(deduplicated)
-
     def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
         return self.api_client.queries(time_filter)
 
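Note: both removed wrappers funnel their links through a dedup step before formatting, and the 0.22.5 CHANGELOG entry is "validate and deduplicate lineage links". The package's `deduplicate_lineage` itself is not shown in this section; the sketch below only illustrates the idea, keeping one link per (parent_path, child_path) edge and preferring the most recent timestamp, with the field names taken from the removed test:

```python
from datetime import datetime


def deduplicate_links(links: list[dict]) -> list[dict]:
    """Keep a single link per (parent_path, child_path); latest timestamp wins."""
    latest: dict[tuple[str, str], dict] = {}
    for link in links:
        key = (link["parent_path"], link["child_path"])
        kept = latest.get(key)
        if kept is None or link["timestamp"] > kept["timestamp"]:
            latest[key] = link
    return list(latest.values())


links = [
    {"parent_path": "dev.silver.pre_analytics", "child_path": "dev.silver.analytics",
     "timestamp": datetime(2024, 1, 1)},
    {"parent_path": "dev.silver.pre_analytics", "child_path": "dev.silver.analytics",
     "timestamp": datetime(2024, 6, 1)},  # duplicate edge, newer timestamp wins
    {"parent_path": "dev.bronze.analytics", "child_path": "dev.silver.analytics",
     "timestamp": datetime(2024, 6, 1)},
]
print(deduplicate_links(links))  # two links remain
```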
castor_extractor/warehouse/databricks/client_test.py
CHANGED

@@ -1,14 +1,4 @@
-from
-
-from .client import (
-    DatabricksClient,
-)
-from .test_constants import (
-    CLOSER_DATE,
-    MOCK_TABLES_FOR_TABLE_LINEAGE,
-    OLDER_DATE,
-    TABLE_LINEAGE_SIDE_EFFECT,
-)
+from .client import DatabricksClient
 
 
 class MockDatabricksClient(DatabricksClient):

@@ -48,27 +38,3 @@ def test_DatabricksClient__match_table_with_user():
     table_without_owner = {"id": 1, "owner_email": None}
     actual = client._match_table_with_user(table_without_owner, user_mapping)
     assert actual == table_without_owner
-
-
-@patch(
-    "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksAPIClient._get",
-    side_effect=TABLE_LINEAGE_SIDE_EFFECT,
-)
-def test_DatabricksClient_table_lineage(mock_get):
-    client = DatabricksClient(Mock())
-
-    lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
-    assert len(lineage) == 2
-
-    expected_link_1 = {
-        "parent_path": "dev.silver.pre_analytics",
-        "child_path": "dev.silver.analytics",
-        "timestamp": OLDER_DATE,
-    }
-    expected_link_2 = {
-        "parent_path": "dev.bronze.analytics",
-        "child_path": "dev.silver.analytics",
-        "timestamp": CLOSER_DATE,
-    }
-    assert expected_link_1 in lineage
-    assert expected_link_2 in lineage