castor-extractor 0.22.1__py3-none-any.whl → 0.22.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- CHANGELOG.md +16 -0
- castor_extractor/visualization/sigma/client/client.py +64 -10
- castor_extractor/visualization/thoughtspot/assets.py +3 -1
- castor_extractor/visualization/thoughtspot/client/client.py +67 -14
- castor_extractor/visualization/thoughtspot/client/utils.py +10 -4
- castor_extractor/visualization/thoughtspot/client/utils_test.py +22 -4
- castor_extractor/warehouse/databricks/api_client.py +2 -60
- castor_extractor/warehouse/databricks/client.py +4 -47
- castor_extractor/warehouse/databricks/client_test.py +1 -35
- castor_extractor/warehouse/databricks/credentials.py +4 -6
- castor_extractor/warehouse/databricks/enums.py +15 -0
- castor_extractor/warehouse/databricks/extract.py +13 -11
- castor_extractor/warehouse/databricks/lineage.py +47 -119
- castor_extractor/warehouse/databricks/lineage_test.py +86 -31
- castor_extractor/warehouse/databricks/sql_client.py +23 -8
- castor_extractor/warehouse/databricks/types.py +0 -7
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/METADATA +17 -1
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/RECORD +21 -21
- castor_extractor/warehouse/databricks/test_constants.py +0 -79
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/LICENCE +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/WHEEL +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

```diff
@@ -1,6 +1,22 @@
 
 # Changelog
 
+## 0.22.5 - 2025-01-09
+
+* Databricks: validate and deduplicate lineage links
+
+## 0.22.4 - 2025-01-08
+
+* ThoughtSpot: extract answers
+
+## 0.22.3 - 2024-12-10
+
+* Databricks: extract lineage from system tables
+
+## 0.22.2 - 2024-12-06
+
+* Sigma: multithreading to retrieve lineage
+
 ## 0.22.1 - 2024-12-05
 
 * Salesforce: deduplicate tables
```

castor_extractor/visualization/sigma/client/client.py
CHANGED

```diff
@@ -1,9 +1,11 @@
 from collections.abc import Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from http import HTTPStatus
 from typing import Callable, Optional
 
 import requests
+from pydantic import BaseModel
 
 from ....utils import (
     APIClient,
@@ -12,6 +14,7 @@ from ....utils import (
     build_url,
     fetch_all_pages,
     handle_response,
+    retry,
 )
 from ..assets import SigmaAsset
 from .credentials import SigmaCredentials
@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
 )
 
 _AUTH_TIMEOUT_S = 60
-
+_SIGMA_TIMEOUT_S = 300
 
 _SIGMA_HEADERS = {
     "Content-Type": _CONTENT_TYPE,
@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
     max_errors=_VOLUME_IGNORED,
     status_codes=_IGNORED_ERROR_CODES,
 )
+_THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
+_RETRY_NUMBER = 1
+_RETRY_BASE_MS = 60_000
+
+
+class LineageContext(BaseModel):
+    """all info needed to build the endpoint for lineage retrieval"""
+
+    workbook_id: str
+    element_id: str
+
+
+class Lineage(BaseModel):
+    """holds response from lineage API and context used to retrieve it"""
+
+    lineage: dict
+    context: LineageContext
 
 
 class SigmaBearerAuth(BearerAuth):
@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
             host=credentials.host,
             auth=auth,
             headers=_SIGMA_HEADERS,
-            timeout=
+            timeout=_SIGMA_TIMEOUT_S,
             safe_mode=safe_mode or SIGMA_SAFE_MODE,
         )
 
@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
             page=page, workbook_id=workbook_id
         )
 
-
+    @retry(
+        (ConnectionError,),
+        max_retries=_RETRY_NUMBER,
+        base_ms=_RETRY_BASE_MS,
+        log_exc_info=True,
+    )
+    def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+        """
+        return the lineage from API and other ids needed to characterize
+        lineage in castor
+        """
+        workbook_id = lineage_context.workbook_id
+        element_id = lineage_context.element_id
+        endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+        return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+    @staticmethod
+    def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+        """
+        Helper function to prepare context for lineage retrieval.
+        Elements without associated columns are skipped.
+        """
+        contexts: list[LineageContext] = []
         for element in elements:
-
-
-
-
+            if element.get("columns") is None:
+                continue
+
+            context = LineageContext(
+                workbook_id=element["workbook_id"],
+                element_id=element["elementId"],
             )
+            contexts.append(context)
+        return contexts
+
+    def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+        lineage_context = self._lineage_context(elements)
+
+        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+            results = executor.map(self._get_lineage, lineage_context)
+
+        for lineage in results:
             yield {
-                **lineage,
-                "workbook_id": workbook_id,
-                "element_id": element_id,
+                **lineage.lineage,
+                "workbook_id": lineage.context.workbook_id,
+                "element_id": lineage.context.element_id,
             }
 
     def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:
```
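
The new `_get_all_lineages` fans one HTTP call per (workbook, element) pair out over a bounded thread pool, and each call is retried once after a long back-off when the connection drops. For context, a rough self-contained sketch of that pattern — the `retry` stand-in and `fetch_lineage` below are hypothetical, not the package's actual helpers:

```python
import time
from concurrent.futures import ThreadPoolExecutor

_THREADS = 10       # matches the worker count chosen in the diff
_RETRIES = 1        # one retry per call
_BASE_MS = 60_000   # wait a minute before retrying


def retry(exceptions, max_retries, base_ms):
    """Minimal stand-in for the package's `retry` decorator."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == max_retries:
                        raise  # out of retries: propagate the error
                    time.sleep(base_ms / 1000)
        return wrapper
    return decorator


@retry((ConnectionError,), max_retries=_RETRIES, base_ms=_BASE_MS)
def fetch_lineage(context: tuple[str, str]) -> dict:
    workbook_id, element_id = context
    # a real implementation would GET the lineage endpoint here
    return {"workbook_id": workbook_id, "element_id": element_id}


contexts = [("wb1", "el1"), ("wb1", "el2")]
with ThreadPoolExecutor(max_workers=_THREADS) as executor:
    results = list(executor.map(fetch_lineage, contexts))
```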

castor_extractor/visualization/thoughtspot/assets.py
CHANGED

```diff
@@ -4,6 +4,8 @@ from ...types import ExternalAsset
 class ThoughtspotAsset(ExternalAsset):
     """Thoughtspot assets"""
 
+    ANSWERS = "answers"
+    ANSWER_USAGES = "answer_usages"
     LIVEBOARDS = "liveboards"
+    LIVEBOARD_USAGES = "liveboard_usages"
     LOGICAL_TABLES = "logical_tables"
-    USAGES = "usages"
```

castor_extractor/visualization/thoughtspot/client/client.py
CHANGED

```diff
@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
     "Content-Type": "application/json",
 }
 _METADATA_BATCH_SIZE = 100
-
+# https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+_OBJECT_USAGE_LIVEBOARD = "Object Usage"
+_ANSWER_USAGE_VIZ = "Answer Usage, by User"
+# https://docs.thoughtspot.com/cloud/latest/user-adoption
+_USER_ADOPTION_LIVEBOARD = "User Adoption"
+_LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
 # By default, no errors are ignored for the moment
 THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 
@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
     def _metadata_search(
         self,
         metadata_type: str,
+        identifier: Optional[str] = None,
     ) -> Iterator[dict]:
+        """
+        Yields assets of the given asset type, and optionally filters on a
+        specific identifier.
+        """
         offset = 0
+
         while True:
+            search_filters = {
+                "metadata": [{"type": metadata_type}],
+                "include_details": True,
+                "record_size": _METADATA_BATCH_SIZE,
+                "record_offset": offset,
+            }
+            if identifier:
+                search_filters["metadata"] = {
+                    "identifier": identifier,
+                    "type": metadata_type,
+                }
+
             metadata = self._post(
                 ThoughtspotEndpointFactory.metadata_search(),
-                data=
-                    "metadata": [{"type": metadata_type}],
-                    "include_details": True,
-                    "record_size": _METADATA_BATCH_SIZE,
-                    "record_offset": offset,
-                },
+                data=search_filters,
             )
             yield from metadata
             if len(metadata) < _METADATA_BATCH_SIZE:
                 break
             offset = offset + _METADATA_BATCH_SIZE
 
+    def _get_all_answers(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="ANSWER")
+
     def _get_all_liveboards(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LIVEBOARD")
 
@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
     def _get_all_tables(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
-    def
+    def _get_usages(
+        self,
+        liveboard_name: str,
+        visualization_name: str,
+    ) -> Iterator[dict]:
+        """
+        Yields the data of a given visualization in the given liveboard.
+        ThoughtSpot maintains two system liveboards with stats about data usage,
+        which are useful to compute view counts and popularity.
+        """
+        usage_liveboard = next(
+            self._metadata_search(
+                metadata_type="LIVEBOARD", identifier=liveboard_name
+            )
+        )
+        liveboard_id = usage_liveboard["metadata_id"]
+
         data = self._post(
             endpoint=ThoughtspotEndpointFactory.liveboard(),
             headers={"Accept": "application/octet-stream"},
             data={
-                "metadata_identifier":
+                "metadata_identifier": liveboard_id,
                 "file_format": "CSV",
-                "visualization_identifiers": [
-                    "Popular Liveboards Last 30 Days"
-                ],
+                "visualization_identifiers": [visualization_name],
             },
             handler=lambda x: x.text,
         )
         yield from usage_liveboard_reader(data)
 
-    def
+    def _get_answer_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+            visualization_name=_ANSWER_USAGE_VIZ,
+        )
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_USER_ADOPTION_LIVEBOARD,
+            visualization_name=_LIVEBOARD_USAGE_VIZ,
+        )
+
+    def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+        if asset == ThoughtspotAsset.ANSWERS:
+            yield from self._get_all_answers()
+
+        if asset == ThoughtspotAsset.ANSWER_USAGES:
+            yield from self._get_answer_usages()
+
         if asset == ThoughtspotAsset.LIVEBOARDS:
             yield from self._get_all_liveboards()
 
-        if asset == ThoughtspotAsset.
+        if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
             yield from self._get_liveboards_usages()
 
         if asset == ThoughtspotAsset.LOGICAL_TABLES:
```
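
The reworked `_metadata_search` pages through results with a fixed `record_offset`/`record_size` window and stops on the first short page. A minimal sketch of that loop, with `post_search` standing in for the real POST to the metadata-search endpoint:

```python
from typing import Callable, Iterator

BATCH = 100  # mirrors _METADATA_BATCH_SIZE


def paginate(
    post_search: Callable[[dict], list[dict]], metadata_type: str
) -> Iterator[dict]:
    offset = 0
    while True:
        page = post_search({
            "metadata": [{"type": metadata_type}],
            "record_size": BATCH,
            "record_offset": offset,
        })
        yield from page
        if len(page) < BATCH:  # a short page means we reached the end
            break
        offset += BATCH


# toy backend: one full page of 100 items, then a final page of 1
pages = [[{"id": i} for i in range(100)], [{"id": 100}]]


def fake_post(filters: dict) -> list[dict]:
    return pages[filters["record_offset"] // BATCH]


assert len(list(paginate(fake_post, "ANSWER"))) == 101
```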

castor_extractor/visualization/thoughtspot/client/utils.py
CHANGED

```diff
@@ -1,13 +1,17 @@
 import csv
+import re
 from collections.abc import Iterator
 from io import StringIO
 
+_END_OF_GENERATED_TEXT = r'^""$'
+
 
 def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
     """
     Converts a CSV string into an iterator of dictionaries after
-    ignoring the
-
+    ignoring the generated text that preceeds the actual CSV header row.
+    The generated block ends with a row containing only two double quotes.
+    Here is an example:
 
     "Data extract produced by Castor on 09/19/2024 06:54"
     "Filters applied on data :"
@@ -15,11 +19,13 @@ def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
     "Pinboard NOT IN [mlm - availability pinboard,null]"
     "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
     "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+    ""
 
     """
     csv_file = StringIO(usage_liveboard_csv)
 
-
-
+    line = next(csv_file)
+    while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
+        line = next(csv_file)
 
     yield from csv.DictReader(csv_file)
```
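
The reader now skips everything up to the sentinel row of two double quotes before handing the rest to `csv.DictReader`. A self-contained illustration of the same logic on a toy payload:

```python
import csv
import re
from io import StringIO

raw = '''"Data extract produced by Castor on 01/07/2025 16:07"
"Filters applied on data :"
""
"Answer name","Count"
"toto","666"'''

stream = StringIO(raw)
# advance past the generated preamble, which ends with a line of `""`
line = next(stream)
while not re.match(r'^""$', line.strip()):
    line = next(stream)

rows = list(csv.DictReader(stream))
assert rows == [{"Answer name": "toto", "Count": "666"}]
```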

castor_extractor/visualization/thoughtspot/client/utils_test.py
CHANGED

```diff
@@ -2,7 +2,7 @@ from .utils import (
     usage_liveboard_reader,
 )
 
-
+VALID_CSV_1 = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "Filters applied on data :"
 "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
 "Pinboard NOT IN [mlm - availability pinboard,null]"
@@ -16,6 +16,13 @@ VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "September test","25","2"'''
 
 
+VALID_CSV_2 = '''"Data extract produced by Castor on 01/07/2025 16:07"
+"Filters applied on data :"
+"Timestamp >= 20241208 00:00:00 < 20250107 00:00:00"
+""
+"Answer name","User name","Number of unique users","Count of object interactions"
+"toto","tata","1","666"'''
+
 # Invalid CSV input (missing data rows)
 INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 "Filters applied on data :"
@@ -27,7 +34,7 @@ INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 
 
 def test_usage_liveboard_reader():
-
+    expected_output_1 = [
         {
             "Pinboard": "Market Report",
             "Pinboard Views": "559",
@@ -49,9 +56,20 @@ def test_usage_liveboard_reader():
             "Unique Number of User": "2",
         },
     ]
+    expected_output_2 = [
+        {
+            "Answer name": "toto",
+            "User name": "tata",
+            "Number of unique users": "1",
+            "Count of object interactions": "666",
+        }
+    ]
+
+    result = list(usage_liveboard_reader(VALID_CSV_1))
+    assert result == expected_output_1
 
-    result = list(usage_liveboard_reader(
-    assert result ==
+    result = list(usage_liveboard_reader(VALID_CSV_2))
+    assert result == expected_output_2
 
     result = list(usage_liveboard_reader(INVALID_CSV))
     assert result == []  # Expect an empty result since there is no data
```

castor_extractor/warehouse/databricks/api_client.py
CHANGED

```diff
@@ -1,8 +1,6 @@
 import logging
-from collections.abc import Iterator
 from functools import partial
-from
-from typing import Optional
+from typing import Iterator, Optional
 
 import requests
 
@@ -14,16 +12,14 @@ from ...utils import (
     fetch_all_pages,
     handle_response,
     retry,
-    retry_request,
     safe_mode,
 )
 from ..abstract import TimeFilter
 from .credentials import DatabricksCredentials
 from .endpoints import DatabricksEndpointFactory
 from .format import DatabricksFormatter, TagMapping
-from .lineage import single_column_lineage_links, single_table_lineage_links
 from .pagination import DATABRICKS_PAGE_SIZE, DatabricksPagination
-from .types import TablesColumns
+from .types import TablesColumns
 from .utils import hourly_time_filters
 
 logger = logging.getLogger(__name__)
@@ -132,60 +128,6 @@ class DatabricksAPIClient(APIClient):
             column_tags=column_tags,
         )
 
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    @retry_request(
-        status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
-        max_retries=_RETRY_ATTEMPTS,
-    )
-    def get_single_column_lineage(
-        self,
-        names: tuple[str, str],
-    ) -> list[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 10 requests per SECOND
-        """
-        table_path, column_name = names
-        payload = {
-            "table_name": table_path,
-            "column_name": column_name,
-            "include_entity_lineage": True,
-        }
-        content = self._get(
-            DatabricksEndpointFactory.column_lineage(), params=payload
-        )
-        column_path = f"{table_path}.{column_name}"
-        return single_column_lineage_links(column_path, content)
-
-    @safe_mode(safe_lineage_params, lambda: [])
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_ATTEMPTS,
-        base_ms=_RETRY_BASE_MS,
-    )
-    def get_single_table_lineage(
-        self, table_path: str
-    ) -> list[TimestampedLink]:
-        """
-        Helper function used in get_lineage_links.
-        Call data lineage API and return the content of the result
-        eg table_path: broward_prd.bronze.account_adjustments
-        FYI: Maximum rate of 50 requests per SECOND
-        """
-        payload = {"table_name": table_path, "include_entity_lineage": True}
-        content = self._get(
-            DatabricksEndpointFactory.table_lineage(), params=payload
-        )
-        return single_table_lineage_links(table_path, content)
-
     @safe_mode(safe_query_params, lambda: [])
     @retry(
         exceptions=_RETRY_EXCEPTIONS,
```

castor_extractor/warehouse/databricks/client.py
CHANGED

```diff
@@ -1,17 +1,14 @@
 import logging
-from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
-from ...utils import
-    mapping_from_rows,
-)
+from ...utils import mapping_from_rows
 from ..abstract import TimeFilter
 from .api_client import DatabricksAPIClient
 from .credentials import DatabricksCredentials
+from .enums import TagEntity
 from .format import DatabricksFormatter
-from .
-from .
-from .types import TablesColumns, TimestampedLink
+from .sql_client import DatabricksSQLClient
+from .types import TablesColumns
 
 logger = logging.getLogger(__name__)
 
@@ -95,46 +92,6 @@ class DatabricksClient:
             columns.extend(c_to_add)
         return tables, columns
 
-    def table_lineage(self, tables: list[dict]) -> list[dict]:
-        """
-        Wrapper function that retrieves all table lineage
-        """
-        # retrieve table lineage
-        with ThreadPoolExecutor(max_workers=_THREADS_TABLE_LINEAGE) as executor:
-            table_paths = [
-                ".".join([table["schema_id"], table["table_name"]])
-                for table in tables
-            ]
-            results = executor.map(
-                self.api_client.get_single_table_lineage, table_paths
-            )
-        lineages = [link for links in results for link in links]
-        deduplicated = deduplicate_lineage(lineages)
-        return self.formatter.format_lineage(deduplicated)
-
-    def column_lineage(
-        self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
-    ) -> list[dict]:
-        """
-        Wrapper function that retrieves all column lineage
-        we only try to retrieve column lineage if we found table lineage
-        """
-        candidate_paths = paths_for_column_lineage(
-            tables, columns, table_lineage
-        )
-        # retrieve column lineage
-        with ThreadPoolExecutor(
-            max_workers=_THREADS_COLUMN_LINEAGE
-        ) as executor:
-            results = executor.map(
-                self.api_client.get_single_column_lineage, candidate_paths
-            )
-        lineages: list[TimestampedLink] = [
-            link for links in results for link in links
-        ]
-        deduplicated = deduplicate_lineage(lineages)
-        return self.formatter.format_lineage(deduplicated)
-
     def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
         return self.api_client.queries(time_filter)
 
```

castor_extractor/warehouse/databricks/client_test.py
CHANGED

```diff
@@ -1,14 +1,4 @@
-from
-
-from .client import (
-    DatabricksClient,
-)
-from .test_constants import (
-    CLOSER_DATE,
-    MOCK_TABLES_FOR_TABLE_LINEAGE,
-    OLDER_DATE,
-    TABLE_LINEAGE_SIDE_EFFECT,
-)
+from .client import DatabricksClient
 
 
 class MockDatabricksClient(DatabricksClient):
@@ -48,27 +38,3 @@ def test_DatabricksClient__match_table_with_user():
     table_without_owner = {"id": 1, "owner_email": None}
     actual = client._match_table_with_user(table_without_owner, user_mapping)
     assert actual == table_without_owner
-
-
-@patch(
-    "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksAPIClient._get",
-    side_effect=TABLE_LINEAGE_SIDE_EFFECT,
-)
-def test_DatabricksClient_table_lineage(mock_get):
-    client = DatabricksClient(Mock())
-
-    lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
-    assert len(lineage) == 2
-
-    expected_link_1 = {
-        "parent_path": "dev.silver.pre_analytics",
-        "child_path": "dev.silver.analytics",
-        "timestamp": OLDER_DATE,
-    }
-    expected_link_2 = {
-        "parent_path": "dev.bronze.analytics",
-        "child_path": "dev.silver.analytics",
-        "timestamp": CLOSER_DATE,
-    }
-    assert expected_link_1 in lineage
-    assert expected_link_2 in lineage
```

castor_extractor/warehouse/databricks/credentials.py
CHANGED

```diff
@@ -1,24 +1,22 @@
 from dataclasses import field
-from typing import Optional
 
-from
-from pydantic_settings import SettingsConfigDict
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 DATABRICKS_ENV_PREFIX = "CASTOR_DATABRICKS_"
 
 
-
-class DatabricksCredentials:
+class DatabricksCredentials(BaseSettings):
     """
     Credentials needed by Databricks client
     Requires:
     - host
+    - http_path
    - token
     """
 
     host: str
+    http_path: str
     token: str = field(metadata={"sensitive": True})
-    http_path: Optional[str] = field(default=None)
 
     model_config = SettingsConfigDict(
         env_prefix=DATABRICKS_ENV_PREFIX,
```
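
Subclassing `BaseSettings` means the credentials are now populated from prefixed environment variables without extra wiring, and `http_path` becomes mandatory (it was previously optional). A small sketch of what that buys, using pydantic-settings; all values below are placeholders:

```python
import os
from pydantic_settings import BaseSettings, SettingsConfigDict


class Credentials(BaseSettings):
    host: str
    http_path: str
    token: str
    model_config = SettingsConfigDict(env_prefix="CASTOR_DATABRICKS_")


# fields are matched case-insensitively against prefixed variables
os.environ["CASTOR_DATABRICKS_HOST"] = "adb-123.azuredatabricks.net"
os.environ["CASTOR_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/abc"
os.environ["CASTOR_DATABRICKS_TOKEN"] = "dapi-secret"

creds = Credentials()  # raises a validation error if any variable is missing
assert creds.host.startswith("adb-")
```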

castor_extractor/warehouse/databricks/enums.py
ADDED

```diff
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class LineageEntity(Enum):
+    """Entities that can be linked in Databricks lineage"""
+
+    COLUMN = "COLUMN"
+    TABLE = "TABLE"
+
+
+class TagEntity(Enum):
+    """Entities that can be tagged in Databricks"""
+
+    COLUMN = "COLUMN"
+    TABLE = "TABLE"
```
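
The enum values double as the stems of the Databricks system-table names queried later by the SQL client. A quick illustration of that derivation:

```python
from enum import Enum


class LineageEntity(Enum):
    COLUMN = "COLUMN"
    TABLE = "TABLE"


# the SQL client builds f"{entity.value.lower()}_lineage" as the table name
assert f"{LineageEntity.TABLE.value.lower()}_lineage" == "table_lineage"
assert f"{LineageEntity.COLUMN.value.lower()}_lineage" == "column_lineage"
```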

castor_extractor/warehouse/databricks/extract.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import logging
+from datetime import date
 from typing import Optional
 
 from ...utils import AbstractStorage, LocalStorage, write_summary
@@ -16,6 +17,7 @@ from ..abstract import (
 )
 from .client import DatabricksClient
 from .credentials import DatabricksCredentials
+from .enums import LineageEntity
 
 DATABRICKS_ASSETS: SupportedAssets = {
     WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
@@ -32,6 +34,12 @@ OTimeFilter = Optional[TimeFilter]
 Paths = dict[str, str]
 
 
+def _day(time_filter: OTimeFilter) -> date:
+    if not time_filter:
+        return TimeFilter.default().day
+    return time_filter.day
+
+
 class DatabricksExtractionProcessor:
     """Databricks' API-based extraction management"""
 
@@ -96,22 +104,18 @@ class DatabricksExtractionProcessor:
         logger.info(f"Extracted {len(columns)} columns to {location}")
         return catalog_locations
 
-    def extract_lineage(self) -> Paths:
+    def extract_lineage(self, time_filter: OTimeFilter = None) -> Paths:
         if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
             return self._existing_group_paths(
                 WarehouseAssetGroup.ADDITIONAL_LINEAGE
             )
         lineage_locations: dict[str, str] = dict()
 
-
-
-        schemas = self._client.schemas(databases)
-        users = self._client.users()
-        tables, columns = self._client.tables_and_columns(schemas, users)
-        logger.info("Extracted pre-requisite catalog. Next comes lineage")
+        day = _day(time_filter)
+        client = self._client.sql_client
 
         # extract table lineage
-        table_lineage =
+        table_lineage = client.get_lineage(LineageEntity.TABLE, day)
         table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
         location = self._storage.put(table_lineage_key, table_lineage)
         lineage_locations[table_lineage_key] = location
@@ -119,9 +123,7 @@
         logger.info(msg)
 
         # extract column lineage
-        column_lineage =
-            tables, columns, table_lineage
-        )
+        column_lineage = client.get_lineage(LineageEntity.COLUMN, day)
         column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
         location = self._storage.put(column_lineage_key, column_lineage)
         lineage_locations[column_lineage_key] = location
```

castor_extractor/warehouse/databricks/lineage.py
CHANGED

```diff
@@ -1,141 +1,69 @@
-from typing import
+from typing import Iterable, Optional
 
-from .
+from .enums import LineageEntity
 
 
-class
+class LineageProcessor:
     """
     helper class that handles lineage deduplication and filtering
     """
 
-    def __init__(self):
-        self.
+    def __init__(self, lineage_entity: LineageEntity):
+        self.lineage_entity = lineage_entity
 
-
-        """
-        keep the most recent lineage link, adding to `self.lineage`
-        """
-        parent, child, timestamp = timestamped_link
-        link = (parent, child)
-        if not self.lineage.get(link):
-            self.lineage[link] = timestamp
-            return
-
-        if not timestamp:
-            return
-        # keep most recent link; cast for mypy
-        recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
-        self.lineage[link] = recent
+        self.lineage: dict[tuple[str, str], dict] = dict()
 
+    def _parent_path(self, link) -> Optional[str]:
+        if self.lineage_entity == LineageEntity.TABLE:
+            return link["source_table_full_name"]
 
-
-
-
-
+        source_table = link["source_table_full_name"]
+        source_column = link["source_column_name"]
+        if not (source_table and source_column):
+            return None
 
+        return f"{source_table}.{source_column}"
 
-    def
-
-
-            return None
+    def _child_path(self, link) -> Optional[str]:
+        if self.lineage_entity == LineageEntity.TABLE:
+            return link["target_table_full_name"]
 
+        target_table = link["target_table_full_name"]
+        target_column = link["target_column_name"]
+        if not (target_table and target_column):
+            return None
 
-
-    """exclude missing path and self-lineage"""
-    if (not path_from) or (not path_to):
-        return None
-    is_self_lineage = path_from.lower() == path_to.lower()
-    if is_self_lineage:
-        return None
-    return path_from, path_to, timestamp
+        return f"{target_table}.{target_column}"
 
+    def add(self, link: dict) -> None:
+        """
+        If the parent and child paths are valid, keeps the most recent lineage
+        link in the `self.lineage` map.
+        """
+        parent = self._parent_path(link)
+        child = self._child_path(link)
+        timestamp = link["event_time"]
 
-
-
-) -> list[TimestampedLink]:
-    """
-    process databricks lineage API response for a given table
-    returns a list of (parent, child, timestamp)
-
-    Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-    we could also have `notebookInfos` or `fileInfo`
-    """
-    links: list[OTimestampedLink] = []
-    # add parent:
-    for link in single_table_lineage.get("upstreams", []):
-        parent = link.get("tableInfo", {})
-        parent_path = _to_table_path(parent)
-        timestamp: Ostr = parent.get("lineage_timestamp")
-        links.append(_link(parent_path, table_path, timestamp))
-
-    # add children:
-    for link in single_table_lineage.get("downstreams", []):
-        child = link.get("tableInfo", {})
-        child_path = _to_table_path(child)
-        timestamp = child.get("lineage_timestamp")
-        links.append(_link(table_path, child_path, timestamp))
-
-    return list(filter(None, links))
-
-
-def single_column_lineage_links(
-    column_path: str, single_column_lineage: dict
-) -> list[TimestampedLink]:
-    """
-    process databricks lineage API response for a given table
-    returns a list of (parent, child, timestamp)
-
-    Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
-    we could also have `notebookInfos` or `fileInfo`
-    """
-    links: list[OTimestampedLink] = []
-    # add parent:
-    for link in single_column_lineage.get("upstream_cols", []):
-        parent_path = _to_column_path(link)
-        timestamp: Ostr = link.get("lineage_timestamp")
-        links.append(_link(parent_path, column_path, timestamp))
+        if not (parent and child and parent != child):
+            return
 
-
-
-
-        timestamp = link.get("lineage_timestamp")
-        links.append(_link(column_path, child_path, timestamp))
+        key = (parent, child)
+        if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
+            return
 
-
+        self.lineage[key] = link
 
 
-    def
-
-) -> list[
+def valid_lineage(
+    lineage: Iterable[dict], lineage_entity: LineageEntity
+) -> list[dict]:
     """
-
-
+    Filters out self-lineage or lineage with a missing source or target path,
+    then deduplicates by picking the link with the most recent event timestmap.
     """
-
-
-
-
-
-
-
-    tables_with_lineage: set[str] = set()
-    for t in table_lineage:
-        tables_with_lineage.add(t["parent_path"])
-        tables_with_lineage.add(t["child_path"])
-
-    paths_to_return: list[tuple[str, str]] = []
-    for column in columns:
-        table_path = mapping[column["table_id"]]
-        if table_path not in tables_with_lineage:
-            continue
-        column_ = (table_path, column["column_name"])
-        paths_to_return.append(column_)
-
-    return paths_to_return
-
-
-def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
-    deduplicated_lineage = LineageLinks()
-    for timestamped_link in lineages:
-        deduplicated_lineage.add(timestamped_link)
-    return deduplicated_lineage.lineage
+    deduplicated_lineage = LineageProcessor(lineage_entity)
+
+    for link in lineage:
+        deduplicated_lineage.add(link)
+
+    return list(deduplicated_lineage.lineage.values())
```
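
Note that `add` compares `event_time` values as plain strings. That works here because the zero-padded `YYYY-MM-DD HH:MM:SS` layout makes lexicographic order track chronological order (assuming a fixed-width fractional part). A tiny standalone check of the dedup invariant:

```python
older = "2025-01-01 00:00:01.0"
closer = "2025-01-01 02:02:02.0"
assert closer > older  # string comparison agrees with time order

links: dict = {}
for link in (
    {"key": ("a.b.source", "a.b.target"), "event_time": closer, "note": "keep"},
    {"key": ("a.b.source", "a.b.target"), "event_time": older, "note": "drop"},
):
    key = link["key"]
    if key in links and links[key]["event_time"] > link["event_time"]:
        continue  # a more recent link for this (parent, child) already won
    links[key] = link

assert links[("a.b.source", "a.b.target")]["note"] == "keep"
```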

castor_extractor/warehouse/databricks/lineage_test.py
CHANGED

```diff
@@ -1,34 +1,89 @@
-from .
-from .
-
-
-
+from .enums import LineageEntity
+from .lineage import LineageProcessor, valid_lineage
+
+_OLDER_DATE = "2025-01-01 00:00:01.0"
+_CLOSER_DATE = "2025-01-01 02:02:02.0"
+
+_TABLE_LINEAGES = [
+    {
+        "source_table_full_name": "a.b.source",
+        "target_table_full_name": "a.b.target",
+        "event_time": _CLOSER_DATE,
+        "other": "more recent stuff",
+    },
+    {
+        "source_table_full_name": "a.b.source",
+        "target_table_full_name": "a.b.target",
+        "event_time": _OLDER_DATE,
+        "other": "stuff that's too old",
+    },
+    {
+        "source_table_full_name": "no target",
+        "target_table_full_name": None,
+        "event_time": _CLOSER_DATE,
+    },
+    {
+        "source_table_full_name": None,
+        "target_table_full_name": "no source",
+        "event_time": _CLOSER_DATE,
+    },
+]
+
+
+_COLUMN_LINEAGES = [
+    {
+        "source_table_full_name": "a.b.source",
+        "source_column_name": "src_col",
+        "target_table_full_name": "a.b.target",
+        "target_column_name": "trgt_col",
+        "event_time": _OLDER_DATE,
+        "other": "old stuff",
+    },
+    {
+        "source_table_full_name": "a.b.source",
+        "source_column_name": "src_col",
+        "target_table_full_name": "a.b.target",
+        "target_column_name": "trgt_col",
+        "event_time": _CLOSER_DATE,
+        "other": "newer stuff",
+    },
+    {
+        "source_table_full_name": "a.b.toto",
+        "source_column_name": "toto_col",
+        "target_table_full_name": "a.b.tata",
+        "target_column_name": "tata_col",
+        "event_time": _OLDER_DATE,
+    },
+    {
+        "source_table_full_name": "a.b.source",
+        "source_column_name": "a.b.source",
+        "target_table_full_name": None,
+        "target_column_name": None,
+        "event_time": _CLOSER_DATE,
+    },
+]
+
+
+def test_valid_lineage():
+    table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
+
+    assert len(table_links) == 1
+    assert table_links[0]["source_table_full_name"] == "a.b.source"
+    assert table_links[0]["target_table_full_name"] == "a.b.target"
+    assert table_links[0]["event_time"] == _CLOSER_DATE
+    assert table_links[0]["other"] == "more recent stuff"
 
 
 def test_LineageLinks_add():
-
-
-
-
-
-
-    assert
-    assert
-
-
-
-
-    assert expected_key in links.lineage
-    assert links.lineage[expected_key] == OLDER_DATE
-
-    # we update with the more recent timestamp
-    timestamped_link = ("parent", "child", CLOSER_DATE)
-    links.add(timestamped_link)
-    assert expected_key in links.lineage
-    assert links.lineage[expected_key] == CLOSER_DATE
-
-    # we keep the more recent timestamp
-    timestamped_link = ("parent", "child", OLDER_DATE)
-    links.add(timestamped_link)
-    assert expected_key in links.lineage
-    assert links.lineage[expected_key] == CLOSER_DATE
+    deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
+    for link in _COLUMN_LINEAGES:
+        deduplicated_lineage.add(link)
+
+    lineage = deduplicated_lineage.lineage
+    assert len(lineage) == 2
+    assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
+    assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
+    assert (
+        lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
+        == "newer stuff"
+    )
```

castor_extractor/warehouse/databricks/sql_client.py
CHANGED

```diff
@@ -1,24 +1,24 @@
 import logging
 from collections import defaultdict
-from
+from datetime import date
 from typing import Optional
 
 from databricks import sql  # type: ignore
 
 from .credentials import DatabricksCredentials
+from .enums import LineageEntity, TagEntity
 from .format import TagMapping
+from .lineage import valid_lineage
 from .utils import build_path, tag_label
 
 logger = logging.getLogger(__name__)
 
 _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
 
-
-
-
-
-    COLUMN = "COLUMN"
-    TABLE = "TABLE"
+_LINEAGE_SQL_TPL = """
+SELECT * FROM system.access.{table_name}
+WHERE event_date = :day
+"""
 
 
 class DatabricksSQLClient:
@@ -71,7 +71,6 @@ class DatabricksSQLClient:
         https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
         """
         if not self._needs_extraction(entity):
-            # extracting tags require additional credentials (http_path)
             return dict()
 
         table = f"{entity.value.lower()}_tags"
@@ -88,3 +87,19 @@ class DatabricksSQLClient:
             mapping[path].append(label)
 
         return mapping
+
+    def get_lineage(
+        self, lineage_entity: LineageEntity, day: date
+    ) -> list[dict]:
+        """
+        Fetch {TABLE|COLUMN} lineage of the given day, via system tables
+        https://docs.databricks.com/en/admin/system-tables/lineage.html
+        """
+        table_name = f"{lineage_entity.value.lower()}_lineage"
+        query = _LINEAGE_SQL_TPL.format(table_name=table_name)
+        params = {"day": day}
+        result = self.execute_sql(query, params)
+        data = []
+        for row in result:
+            data.append(row.asDict())
+        return valid_lineage(data, lineage_entity)
```
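
For context, a hedged sketch of running the same parameterized system-table query directly with the databricks-sql-connector, outside the extractor; the hostname, HTTP path, and token values below are placeholders:

```python
from datetime import date
from databricks import sql

query = """
SELECT * FROM system.access.table_lineage
WHERE event_date = :day
"""

with sql.connect(
    server_hostname="adb-123.azuredatabricks.net",  # placeholder
    http_path="/sql/1.0/warehouses/abc",            # placeholder
    access_token="dapi-secret",                     # placeholder
) as connection:
    with connection.cursor() as cursor:
        # the connector binds native named parameters such as :day
        cursor.execute(query, {"day": date(2025, 1, 9)})
        rows = [row.asDict() for row in cursor.fetchall()]
```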

{castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.22.1
+Version: 0.22.5
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
@@ -207,6 +207,22 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
 
 # Changelog
 
+## 0.22.5 - 2025-01-09
+
+* Databricks: validate and deduplicate lineage links
+
+## 0.22.4 - 2025-01-08
+
+* ThoughtSpot: extract answers
+
+## 0.22.3 - 2024-12-10
+
+* Databricks: extract lineage from system tables
+
+## 0.22.2 - 2024-12-06
+
+* Sigma: multithreading to retrieve lineage
+
 ## 0.22.1 - 2024-12-05
 
 * Salesforce: deduplicate tables
```

{castor_extractor-0.22.1.dist-info → castor_extractor-0.22.5.dist-info}/RECORD
CHANGED

```diff
@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=JzTJEZxIMP9F_aePVfIvqLt0OuG0jYcDygsLyfTAV84,15335
 Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
 DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -252,7 +252,7 @@ castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLG
 castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
 castor_extractor/visualization/sigma/assets.py,sha256=JZ1Cpxnml8P3mIJoTUM57hvylB18ErECQXaP5FF63O4,268
 castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
-castor_extractor/visualization/sigma/client/client.py,sha256=
+castor_extractor/visualization/sigma/client/client.py,sha256=d9CpE7vRZAPGzck0jFn37LY_6E_Njz9D1sCnFVGJSWk,8006
 castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
 castor_extractor/visualization/sigma/client/endpoints.py,sha256=DBFphbgoH78_MZUGM_bKBAq28Nl7LWSZ6VRsbxrxtDg,1162
 castor_extractor/visualization/sigma/client/pagination.py,sha256=kNEhNq08tTGbypyMjxs0w4uvDtQc_iaWpOZweaa_FsU,690
@@ -306,13 +306,13 @@ castor_extractor/visualization/tableau_revamp/client/rest_fields.py,sha256=3kvaq
 castor_extractor/visualization/tableau_revamp/constants.py,sha256=lHGB50FgVNO2nXeIhkvQKivD8ZFBIjDrflgD5cTXKJw,104
 castor_extractor/visualization/tableau_revamp/extract.py,sha256=HqnBypuNGx_xKk-68WEOy_ucD15LuRF4t2xXf0XKPE0,1370
 castor_extractor/visualization/thoughtspot/__init__.py,sha256=NhTGUk5Kdt54oCjHYoAt0cLBmVLys5lFYiRANL6wCmI,150
-castor_extractor/visualization/thoughtspot/assets.py,sha256=
+castor_extractor/visualization/thoughtspot/assets.py,sha256=SAQWPKaD2NTSDg7-GSkcRSSEkKSws0MJfOVcHkdeTSg,276
 castor_extractor/visualization/thoughtspot/client/__init__.py,sha256=svrE2rMxR-OXctjPeAHMEPePlfcra-9KDevTMcHunAA,86
-castor_extractor/visualization/thoughtspot/client/client.py,sha256=
+castor_extractor/visualization/thoughtspot/client/client.py,sha256=mtwMCPI1-1tyZb1gSYYr-O2QZMTFQwNgillU6ycsOU4,5552
 castor_extractor/visualization/thoughtspot/client/credentials.py,sha256=fp4YHiZy-dstWiLr5c4kFU9SyPK5rd2nCeh8k5sVRpM,462
 castor_extractor/visualization/thoughtspot/client/endpoints.py,sha256=u3FRkmG6j5OIMEeXWZcgRObP8JeC4EutIJEeitNV44c,330
-castor_extractor/visualization/thoughtspot/client/utils.py,sha256=
-castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256
+castor_extractor/visualization/thoughtspot/client/utils.py,sha256=3LgbIWoG1e39VW8rYaV4ot_0EFipziwf3rFAZKxrlEY,1072
+castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256=2XysRU7a58KA2JgNwU2j4GPrN0rkN7Gvk8kQCJlYXVk,2469
 castor_extractor/visualization/thoughtspot/extract.py,sha256=mcXS0jGFpa50td98AVbbTqxchyI5wDCpB-v1o5iRc3g,1354
 castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
@@ -340,21 +340,21 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
 castor_extractor/warehouse/bigquery/query.py,sha256=FEekxlkrfAXzsT8Kj1AIqYd5mURB5MlZIkbFVXVqEhU,4762
 castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXkv51bNTp4AO0QSdw,57
 castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
-castor_extractor/warehouse/databricks/api_client.py,sha256=
+castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
 castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
-castor_extractor/warehouse/databricks/client.py,sha256=
-castor_extractor/warehouse/databricks/client_test.py,sha256=
-castor_extractor/warehouse/databricks/credentials.py,sha256=
+castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
+castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
+castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
 castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
-castor_extractor/warehouse/databricks/
+castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
+castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
 castor_extractor/warehouse/databricks/format.py,sha256=FUBMrFFWSa_lX5PtixJCDR3eRYycqeMw0oKHt7AkA4o,6732
 castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
-castor_extractor/warehouse/databricks/lineage.py,sha256=
-castor_extractor/warehouse/databricks/lineage_test.py,sha256=
+castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
+castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
 castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
-castor_extractor/warehouse/databricks/sql_client.py,sha256=
-castor_extractor/warehouse/databricks/
-castor_extractor/warehouse/databricks/types.py,sha256=-qO5y-uI95B666iDhyNM0TL8WlwYC-3Q4xZuolh3PwE,205
+castor_extractor/warehouse/databricks/sql_client.py,sha256=5isGsRL0MW1lu_E_xTyCvSj_rwaJ2nh-kPlhvTvDy_w,3566
+castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
 castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
 castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
 castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
@@ -436,8 +436,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
 castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
 castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
 castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
-castor_extractor-0.22.
-castor_extractor-0.22.
-castor_extractor-0.22.
-castor_extractor-0.22.
-castor_extractor-0.22.
+castor_extractor-0.22.5.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.22.5.dist-info/METADATA,sha256=11A9xI9Bd6Uu1Na_AJngfTbkt-ECXjsabWNTppaZsOk,22352
+castor_extractor-0.22.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+castor_extractor-0.22.5.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
+castor_extractor-0.22.5.dist-info/RECORD,,
```

castor_extractor/warehouse/databricks/test_constants.py
DELETED

```diff
@@ -1,79 +0,0 @@
-OLDER_DATE = "2024-04-18 20:20:20.0"
-CLOSER_DATE = "2024-04-19 20:20:20.0"
-
-MOCK_TABLES_FOR_TABLE_LINEAGE = [
-    {
-        "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
-        "schema_id": "dev.silver",
-        "table_name": "analytics",
-    },
-    {
-        "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
-        "schema_id": "dev.silver",
-        "table_name": "pre_analytics",
-    },
-    {
-        "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
-        "schema_id": "dev.bronze",
-        "table_name": "analytics",
-    },
-]
-
-_RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
-    "upstreams": [
-        {  # there could be other keys: jobInfos, notebookInfos, queryInfos
-            "tableInfo": {
-                "name": "pre_analytics",
-                "catalog_name": "dev",
-                "schema_name": "silver",
-                "table_type": "PERSISTED_VIEW",  # not used
-                "lineage_timestamp": OLDER_DATE,
-            }
-        },
-        {
-            "tableInfo": {
-                "name": "analytics",
-                "catalog_name": "dev",
-                "schema_name": "bronze",
-                "table_type": "PERSISTED_VIEW",  # not used
-                "lineage_timestamp": CLOSER_DATE,
-            }
-        },
-    ],
-    "downstreams": [],
-}
-_RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
-    "upstreams": [],
-    "downstreams": [
-        {
-            "tableInfo": {
-                "name": "analytics",
-                "catalog_name": "dev",
-                "schema_name": "silver",
-                "table_type": "PERSISTED_VIEW",  # not used
-                "lineage_timestamp": OLDER_DATE,
-            }
-        },
-    ],
-}
-_RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
-    "upstreams": [],
-    "downstreams": [
-        {
-            "tableInfo": {
-                "name": "analytics",
-                "catalog_name": "dev",
-                "schema_name": "silver",
-                "table_type": "PERSISTED_VIEW",  # not used
-                "lineage_timestamp": OLDER_DATE,
-            }
-        },
-    ],
-}
-
-# should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
-TABLE_LINEAGE_SIDE_EFFECT: tuple = (
-    _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
-    _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
-    _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
-)
```
File without changes
|
|
File without changes
|
|
File without changes
|