castor-extractor 0.16.9__py3-none-any.whl → 0.16.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +24 -0
- castor_extractor/utils/client/api.py +8 -3
- castor_extractor/utils/retry.py +3 -1
- castor_extractor/visualization/tableau_revamp/client/client.py +8 -2
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +15 -2
- castor_extractor/warehouse/abstract/__init__.py +2 -0
- castor_extractor/warehouse/abstract/asset.py +13 -0
- castor_extractor/warehouse/databricks/client.py +239 -3
- castor_extractor/warehouse/databricks/client_test.py +61 -1
- castor_extractor/warehouse/databricks/extract.py +36 -0
- castor_extractor/warehouse/databricks/format.py +13 -0
- castor_extractor/warehouse/databricks/test_constants.py +79 -0
- castor_extractor/warehouse/databricks/types.py +6 -1
- castor_extractor/warehouse/salesforce/client.py +8 -6
- castor_extractor/warehouse/salesforce/extract.py +2 -2
- castor_extractor/warehouse/salesforce/format.py +34 -7
- castor_extractor/warehouse/salesforce/format_test.py +49 -1
- castor_extractor/warehouse/snowflake/extract.py +2 -0
- castor_extractor/warehouse/snowflake/queries/function.sql +10 -0
- {castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/METADATA +1 -1
- {castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/RECORD +24 -22
- {castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/LICENCE +0 -0
- {castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/WHEEL +0 -0
- {castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

@@ -1,5 +1,29 @@
 # Changelog

+## 0.16.15 - 2024-06-07
+
+* Tableau: extract database_name for CustomSQLTables
+
+## 0.16.14 - 2024-06-06
+
+* Snowflake: Extract SQL user defined function
+
+## 0.16.13 - 2024-06-05
+
+* Tableau: extract database_name for tables
+
+## 0.16.12 - 2024-06-04
+
+* Databricks: Extract lineage
+
+## 0.16.11 - 2024-06-03
+
+* Tableau: add extra fields to optimise storage
+
+## 0.16.10 - 2024-05-30
+
+* Salesforce: extract sobjects Label as table name
+
 ## 0.16.9 - 2024-05-28

 * Tableau: extract only fields that are necessary
castor_extractor/utils/client/api.py
CHANGED

@@ -5,7 +5,7 @@ import requests

 logger = logging.getLogger(__name__)

-
+DEFAULT_TIMEOUT_S = 30

 # https://requests.readthedocs.io/en/latest/api/#requests.request
 HttpMethod = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]

@@ -20,7 +20,7 @@ class APIClient:
     def __init__(self, host: str, token: Optional[str] = None):
         self._host = host
         self._token = token or ""
-        self._timeout =
+        self._timeout = DEFAULT_TIMEOUT_S

     @staticmethod
     def build_url(host: str, path: str):

@@ -44,7 +44,12 @@ class APIClient:
     ) -> Any:
         logger.debug(f"Calling {method} on {url}")
         result = requests.request(
-            method,
+            method,
+            url,
+            headers=self._headers(),
+            params=params,
+            json=data,
+            timeout=self._timeout,
         )
         result.raise_for_status()
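The substance of this change is the timeout: `requests.request` was previously called without one, so a stalled host could hang the extraction indefinitely. A minimal sketch of the patched call path, with the client's `_headers()` helper inlined as a plain bearer-token dict (an assumption; that helper is not shown in this diff):

```python
import requests

DEFAULT_TIMEOUT_S = 30  # connect/read timeout, in seconds

def call(method: str, url: str, token: str = "", params=None, data=None):
    # stand-in for APIClient._headers(), which this diff does not show
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    result = requests.request(
        method,
        url,
        headers=headers,
        params=params,              # query-string parameters
        json=data,                  # JSON-serialized request body
        timeout=DEFAULT_TIMEOUT_S,  # raises requests.exceptions.Timeout instead of hanging
    )
    result.raise_for_status()       # surface HTTP 4xx/5xx as exceptions
    return result.json()
```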
castor_extractor/utils/retry.py
CHANGED

@@ -68,7 +68,8 @@ class Retry(BaseModel):
         self._retry_attempts += 1
         wait_ms = self.base() + self.jitter()
         wait_s = float(wait_ms) / MS_IN_SEC
-
+        msg = f"Attempting a new call in {wait_s} seconds, {self._retry_attempts} attempt(s) / {self.max_retries} max retries"
+        logger.warning(msg)
         time.sleep(wait_s)
         return True

@@ -93,6 +94,7 @@ def retry(
         try:
             return None, callable(*args, **kwargs)
         except exceptions_ as err:
+            logger.warning(f"Exception within {callable.__name__}")
             return err, None

     def _func(*args, **kwargs) -> Any:
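For context, here is how the decorator is consumed elsewhere in this release (the Databricks client below passes exactly these keyword arguments); the decorated function itself is a hypothetical example:

```python
import requests

@retry(
    exceptions=[requests.exceptions.ConnectTimeout],
    max_retries=3,  # give up after three retries
    base_ms=1000,   # backoff base in milliseconds; jitter is added on top
)
def fetch_json(url: str) -> dict:
    # hypothetical callable; any transient ConnectTimeout is retried
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()
```

Each failed attempt now emits the two warnings added above, which makes previously silent retry loops visible in the logs.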
castor_extractor/visualization/tableau_revamp/client/client.py
CHANGED

@@ -28,10 +28,16 @@ _TSC_ASSETS = (
     TableauRevampAsset.USAGE,
 )

-#
+# increase the value when extraction is too slow
+# decrease the value when timeouts arise
 _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
+    # for some clients, extraction of columns tend to hit the node limit
+    # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
+    # the workaround is to reduce pagination
+    TableauRevampAsset.COLUMN: 50,
+    # fields are light but volumes are bigger
     TableauRevampAsset.FIELD: 1000,
-    TableauRevampAsset.
+    TableauRevampAsset.TABLE: 50,
 }
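The pager that consumes this table is not part of the diff; a plausible sketch of the lookup, where the default page size is an assumption:

```python
DEFAULT_PAGE_SIZE = 100  # assumed fallback; the real default is not shown in this diff

def page_size(asset: TableauRevampAsset) -> int:
    # node-heavy assets (COLUMN, TABLE) get small pages to stay under the
    # Metadata API node limit; FIELD is light, so it pages in bigger batches
    return _CUSTOM_PAGE_SIZE.get(asset, DEFAULT_PAGE_SIZE)
```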
castor_extractor/visualization/tableau_revamp/client/gql_queries.py
CHANGED

@@ -18,7 +18,11 @@ QUERY_TEMPLATE = """

 _COLUMNS_QUERY = """
 downstreamDashboards { id }
-downstreamFields {
+downstreamFields {
+  id
+  __typename
+  datasource { id }
+}
 downstreamWorkbooks { id }
 id
 name

@@ -59,12 +63,21 @@ downstreamWorkbooks { id }
 id
 name
 ... on DatabaseTable {
-  connectionType
   fullName
   schema
+  database {
+    connectionType
+    id
+    name
+  }
 }
 ... on CustomSQLTable {
   query
+  database {
+    connectionType
+    id
+    name
+  }
 }
 """
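Nesting `connectionType` under `database` and selecting its `id` and `name` is what lets the extractor populate `database_name` for both regular tables (0.16.13) and CustomSQLTables (0.16.15). A hedged sketch of reading the new block out of a response row; the formatting code itself is not part of this diff:

```python
from typing import Optional

def database_name(table_row: dict) -> Optional[str]:
    # `database` can be absent or null depending on the connection
    database = table_row.get("database") or {}
    return database.get("name")
```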
castor_extractor/warehouse/abstract/asset.py
CHANGED

@@ -7,6 +7,8 @@ from ...types import ExternalAsset, classproperty
 class WarehouseAsset(ExternalAsset):
     """Assets that can be extracted from warehouses"""

+    ADDITIONAL_COLUMN_LINEAGE = "additional_column_lineage"
+    ADDITIONAL_TABLE_LINEAGE = "additional_table_lineage"
     COLUMN = "column"
     COLUMN_LINEAGE = "column_lineage"  # specific to snowflake
     DATABASE = "database"

@@ -19,12 +21,15 @@ class WarehouseAsset(ExternalAsset):
     ROLE = "role"
     SCHEMA = "schema"
     TABLE = "table"
+    FUNCTION = "function"
     USER = "user"
     VIEW_DDL = "view_ddl"

     @classproperty
     def optional(cls) -> Set["WarehouseAsset"]:
         return {
+            WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+            WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
             WarehouseAsset.EXTERNAL_COLUMN_LINEAGE,
             WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
         }

@@ -33,8 +38,10 @@ class WarehouseAsset(ExternalAsset):
 class WarehouseAssetGroup(Enum):
     """Groups of assets that can be extracted together"""

+    ADDITIONAL_LINEAGE = "additional_lineage"
     CATALOG = "catalog"
     EXTERNAL_LINEAGE = "external_lineage"
+    FUNCTION = "function"
     QUERY = "query"
     ROLE = "role"
     SNOWFLAKE_LINEAGE = "snowflake_lineage"

@@ -53,6 +60,7 @@ CATALOG_ASSETS = (
 )

 # shared by technologies supporting queries
+FUNCTIONS_ASSETS = (WarehouseAsset.FUNCTION,)
 QUERIES_ASSETS = (WarehouseAsset.QUERY,)
 VIEWS_ASSETS = (WarehouseAsset.VIEW_DDL,)

@@ -61,6 +69,11 @@ EXTERNAL_LINEAGE_ASSETS = (
     WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
 )

+ADDITIONAL_LINEAGE_ASSETS = (
+    WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+    WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
+)
+
 NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}
castor_extractor/warehouse/databricks/client.py
CHANGED

@@ -1,18 +1,38 @@
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from datetime import date
 from functools import partial
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set, Tuple, cast

-
+import requests
+
+from ...utils import (
+    SafeMode,
+    at_midnight,
+    date_after,
+    mapping_from_rows,
+    retry,
+    safe_mode,
+)
 from ...utils.client.api import APIClient
 from ...utils.pager import PagerOnToken
 from ..abstract.time_filter import TimeFilter
 from .credentials import DatabricksCredentials
 from .format import DatabricksFormatter
-from .types import TablesColumns
+from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink

 logger = logging.getLogger(__name__)

+_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
+_MAX_THREADS = 10
+_RETRY_ATTEMPTS = 3
+_RETRY_BASE_MS = 1000
+_RETRY_EXCEPTIONS = [
+    requests.exceptions.ConnectTimeout,
+]
+
+safe_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
+

 def _day_to_epoch_ms(day: date) -> int:
     return int(at_midnight(day).timestamp() * 1000)

@@ -22,6 +42,30 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
     return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)


+class LineageLinks:
+    """
+    helper class that handles lineage deduplication and filtering
+    """
+
+    def __init__(self):
+        self.lineage: Dict[Link, Ostr] = dict()
+
+    def add(self, timestamped_link: TimestampedLink) -> None:
+        """
+        keep the most recent lineage link, adding to `self.lineage`
+        """
+        parent, child, timestamp = timestamped_link
+        link = (parent, child)
+        if not self.lineage.get(link):
+            self.lineage[link] = timestamp
+        else:
+            if not timestamp:
+                return
+            # keep most recent link; cast for mypy
+            recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
+            self.lineage[link] = recent
+
+
 class DatabricksClient(APIClient):
     """Databricks Client"""
@@ -123,6 +167,198 @@ class DatabricksClient(APIClient):
         columns.extend(c_to_add)
         return tables, columns

+    @staticmethod
+    def _to_table_path(table: dict) -> Ostr:
+        if table.get("name"):
+            return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
+        return None
+
+    @staticmethod
+    def _to_column_path(column: dict) -> Ostr:
+        if column.get("name"):
+            return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
+        return None
+
+    def _link(
+        self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
+    ) -> OTimestampedLink:
+        """exclude missing path and self-lineage"""
+        if (not path_from) or (not path_to):
+            return None
+        is_self_lineage = path_from.lower() == path_to.lower()
+        if is_self_lineage:
+            return None
+        return (path_from, path_to, timestamp)
+
+    def _single_table_lineage_links(
+        self, table_path: str, single_table_lineage: dict
+    ) -> List[TimestampedLink]:
+        """
+        process databricks lineage API response for a given table
+        returns a list of (parent, child, timestamp)
+
+        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+        we could also have `notebookInfos` or `fileInfo`
+        """
+        links: List[OTimestampedLink] = []
+        # add parent:
+        for link in single_table_lineage.get("upstreams", []):
+            parent = link.get("tableInfo", {})
+            parent_path = self._to_table_path(parent)
+            timestamp: Ostr = parent.get("lineage_timestamp")
+            links.append(self._link(parent_path, table_path, timestamp))
+
+        # add children:
+        for link in single_table_lineage.get("downstreams", []):
+            child = link.get("tableInfo", {})
+            child_path = self._to_table_path(child)
+            timestamp = child.get("lineage_timestamp")
+            links.append(self._link(table_path, child_path, timestamp))
+
+        return list(filter(None, links))
+
+    @safe_mode(safe_params, lambda: [])
+    @retry(
+        exceptions=_RETRY_EXCEPTIONS,
+        max_retries=_RETRY_ATTEMPTS,
+        base_ms=_RETRY_BASE_MS,
+    )
+    def get_single_table_lineage(
+        self, table_path: str
+    ) -> List[TimestampedLink]:
+        """
+        Helper function used in get_lineage_links.
+        Call data lineage API and return the content of the result
+        eg table_path: broward_prd.bronze.account_adjustments
+        FYI: Maximum rate of 50 requests per SECOND
+        """
+        path = "api/2.0/lineage-tracking/table-lineage"
+        payload = {"table_name": table_path, "include_entity_lineage": True}
+        content = self.get(path=path, payload=payload)
+        return self._single_table_lineage_links(table_path, content)
+
+    def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
+        deduplicated_lineage = LineageLinks()
+        for timestamped_link in lineages:
+            deduplicated_lineage.add(timestamped_link)
+        return deduplicated_lineage.lineage
+
+    def table_lineage(self, tables: List[dict]) -> List[dict]:
+        """
+        Wrapper function that retrieves all table lineage
+        """
+        # retrieve table lineage
+        with ThreadPoolExecutor(max_workers=_MAX_THREADS) as executor:
+            table_paths = [
+                ".".join([table["schema_id"], table["table_name"]])
+                for table in tables
+            ]
+            results = executor.map(self.get_single_table_lineage, table_paths)
+        lineages = [link for links in results for link in links]
+        deduplicated = self._deduplicate_lineage(lineages)
+        return self.formatter.format_lineage(deduplicated)
+
+    @staticmethod
+    def _paths_for_column_lineage(
+        tables: List[dict], columns: List[dict], table_lineage: List[dict]
+    ) -> List[Tuple[str, str]]:
+        """
+        helper providing a list of candidate columns to look lineage for:
+        we only look for column lineage where there is table lineage
+        """
+        # mapping between table id and its path db.schema.table
+        # table["schema_id"] follows the pattern `db.schema`
+        mapping = {
+            table["id"]: ".".join([table["schema_id"], table["table_name"]])
+            for table in tables
+        }
+
+        tables_with_lineage: Set[str] = set()
+        for t in table_lineage:
+            tables_with_lineage.add(t["parent_path"])
+            tables_with_lineage.add(t["child_path"])
+
+        paths_to_return: List[Tuple[str, str]] = []
+        for column in columns:
+            table_path = mapping[column["table_id"]]
+            if table_path not in tables_with_lineage:
+                continue
+            column_ = (table_path, column["column_name"])
+            paths_to_return.append(column_)
+
+        return paths_to_return
+
+    def _single_column_lineage_links(
+        self, column_path: str, single_column_lineage: dict
+    ) -> List[TimestampedLink]:
+        """
+        process databricks lineage API response for a given table
+        returns a list of (parent, child, timestamp)
+
+        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+        we could also have `notebookInfos` or `fileInfo`
+        """
+        links: List[OTimestampedLink] = []
+        # add parent:
+        for link in single_column_lineage.get("upstream_cols", []):
+            parent_path = self._to_column_path(link)
+            timestamp: Ostr = link.get("lineage_timestamp")
+            links.append(self._link(parent_path, column_path, timestamp))
+
+        # add children:
+        for link in single_column_lineage.get("downstream_cols", []):
+            child_path = self._to_column_path(link)
+            timestamp = link.get("lineage_timestamp")
+            links.append(self._link(column_path, child_path, timestamp))
+
+        return list(filter(None, links))
+
+    @safe_mode(safe_params, lambda: [])
+    @retry(
+        exceptions=_RETRY_EXCEPTIONS,
+        max_retries=_RETRY_ATTEMPTS,
+        base_ms=_RETRY_BASE_MS,
+    )
+    def get_single_column_lineage(
+        self,
+        names: Tuple[str, str],
+    ) -> List[TimestampedLink]:
+        """
+        Helper function used in get_lineage_links.
+        Call data lineage API and return the content of the result
+
+        eg table_path: broward_prd.bronze.account_adjustments
+        FYI: Maximum rate of 10 requests per SECOND
+        """
+        table_path, column_name = names
+        api_path = "api/2.0/lineage-tracking/column-lineage"
+        payload = {
+            "table_name": table_path,
+            "column_name": column_name,
+            "include_entity_lineage": True,
+        }
+        content = self.get(path=api_path, payload=payload)
+        column_path = f"{table_path}.{column_name}"
+        return self._single_column_lineage_links(column_path, content)
+
+    def column_lineage(
+        self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
+    ) -> List[dict]:
+        """
+        Wrapper function that retrieves all column lineage
+        we only try to retrieve column lineage if we found table lineage
+        """
+        candidate_paths = self._paths_for_column_lineage(
+            tables, columns, table_lineage
+        )
+        lineages: List[TimestampedLink] = [
+            link
+            for paths in candidate_paths
+            for link in self.get_single_column_lineage(paths)
+        ]
+        deduplicated = self._deduplicate_lineage(lineages)
+        return self.formatter.format_lineage(deduplicated)
+
     @staticmethod
     def _time_filter(time_filter: Optional[TimeFilter]) -> dict:
         """time filter to retrieve Databricks' queries"""
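One design note on `LineageLinks.add`: timestamps are compared as plain strings with `max`, which is sound here because the API returns them in a `YYYY-MM-DD HH:MM:SS` layout that sorts lexicographically in chronological order. A self-contained illustration, reusing values from the test constants below:

```python
OLDER_DATE = "2024-04-18 20:20:20.0"
CLOSER_DATE = "2024-04-19 20:20:20.0"

# lexicographic order matches chronological order for this layout
assert max(OLDER_DATE, CLOSER_DATE) == CLOSER_DATE

links = LineageLinks()
links.add(("dev.bronze.analytics", "dev.silver.analytics", OLDER_DATE))
links.add(("dev.bronze.analytics", "dev.silver.analytics", CLOSER_DATE))

# one deduplicated link remains, carrying the most recent timestamp
assert links.lineage == {
    ("dev.bronze.analytics", "dev.silver.analytics"): CLOSER_DATE,
}
```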
castor_extractor/warehouse/databricks/client_test.py
CHANGED

@@ -1,9 +1,16 @@
 from datetime import date
+from unittest.mock import Mock, patch

 from freezegun import freeze_time

 from ..abstract.time_filter import TimeFilter
-from .client import DatabricksClient, _day_hour_to_epoch_ms
+from .client import DatabricksClient, LineageLinks, _day_hour_to_epoch_ms
+from .test_constants import (
+    CLOSER_DATE,
+    MOCK_TABLES_FOR_TABLE_LINEAGE,
+    OLDER_DATE,
+    TABLE_LINEAGE_SIDE_EFFECT,
+)


 def test__day_hour_to_epoch_ms():

@@ -97,3 +104,56 @@ def test_DatabricksClient__match_table_with_user():
     table_without_owner = {"id": 1, "owner_email": None}
     actual = client._match_table_with_user(table_without_owner, user_mapping)
     assert actual == table_without_owner
+
+
+@patch(
+    "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksClient.get",
+    side_effect=TABLE_LINEAGE_SIDE_EFFECT,
+)
+def test_DatabricksClient_table_lineage(mock_get):
+    client = DatabricksClient(Mock())
+
+    lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
+    assert len(lineage) == 2
+
+    expected_link_1 = {
+        "parent_path": "dev.silver.pre_analytics",
+        "child_path": "dev.silver.analytics",
+        "timestamp": OLDER_DATE,
+    }
+    expected_link_2 = {
+        "parent_path": "dev.bronze.analytics",
+        "child_path": "dev.silver.analytics",
+        "timestamp": CLOSER_DATE,
+    }
+    assert expected_link_1 in lineage
+    assert expected_link_2 in lineage
+
+
+def test_LineageLinks_add():
+    links = LineageLinks()
+    timestamped_link = ("parent", "child", None)
+    expected_key = ("parent", "child")
+
+    links.add(timestamped_link)
+
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] is None
+
+    # we replace None by an actual timestamp
+    timestamped_link = ("parent", "child", OLDER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == OLDER_DATE
+
+    # we update with the more recent timestamp
+    timestamped_link = ("parent", "child", CLOSER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == CLOSER_DATE
+
+    # we keep the more recent timestamp
+    timestamped_link = ("parent", "child", OLDER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == CLOSER_DATE
castor_extractor/warehouse/databricks/extract.py
CHANGED

@@ -3,6 +3,7 @@ from typing import Dict, Optional

 from ...utils import AbstractStorage, LocalStorage, write_summary
 from ..abstract import (
+    ADDITIONAL_LINEAGE_ASSETS,
     CATALOG_ASSETS,
     EXTERNAL_LINEAGE_ASSETS,
     QUERIES_ASSETS,

@@ -17,6 +18,7 @@ from .client import DatabricksClient
 from .credentials import to_credentials

 DATABRICKS_ASSETS: SupportedAssets = {
+    WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
     WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
     WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
     WarehouseAssetGroup.ROLE: (WarehouseAsset.USER,),

@@ -94,6 +96,39 @@ class DatabricksExtractionProcessor:
         logger.info(f"Extracted {len(columns)} columns to {location}")
         return catalog_locations

+    def extract_lineage(self) -> Paths:
+        if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
+            return self._existing_group_paths(
+                WarehouseAssetGroup.ADDITIONAL_LINEAGE
+            )
+        lineage_locations: Dict[str, str] = dict()
+
+        # extract catalog
+        databases = self._client.databases()
+        schemas = self._client.schemas(databases)
+        users = self._client.users()
+        tables, columns = self._client.tables_and_columns(schemas, users)
+        logger.info("Extracted pre-requisite catalog. Next comes lineage")
+
+        # extract table lineage
+        table_lineage = self._client.table_lineage(tables)
+        table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
+        location = self._storage.put(table_lineage_key, table_lineage)
+        lineage_locations[table_lineage_key] = location
+        msg = f"Extracted {len(table_lineage)} table lineage to {location}"
+        logger.info(msg)
+
+        # extract column lineage
+        column_lineage = self._client.column_lineage(
+            tables, columns, table_lineage
+        )
+        column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
+        location = self._storage.put(column_lineage_key, column_lineage)
+        lineage_locations[column_lineage_key] = location
+        msg = f"Extracted {len(column_lineage)} column lineage to {location}"
+        logger.info(msg)
+        return lineage_locations
+
     def extract_query(self, time_filter: OTimeFilter = None) -> Paths:
         """extract yesterday's queries and return their location"""
         if self._should_not_reextract(WarehouseAssetGroup.QUERY):

@@ -149,6 +184,7 @@ def extract_all(**kwargs) -> None:
     )

     extractor.extract_catalog()
+    extractor.extract_lineage()
     extractor.extract_query()
     extractor.extract_role()
     extractor.extract_view_ddl()
castor_extractor/warehouse/databricks/format.py
CHANGED

@@ -95,6 +95,19 @@ class DatabricksFormatter:

         return tables, columns

+    @staticmethod
+    def format_lineage(timestamps: dict) -> List[dict]:
+        lineage: List[dict] = []
+        for link, timestamp in timestamps.items():
+            parent_path, child_path = link
+            link_ = {
+                "parent_path": parent_path,
+                "child_path": child_path,
+                "timestamp": timestamp,
+            }
+            lineage.append(link_)
+        return lineage
+
     @staticmethod
     def format_query(raw_queries: List[dict]) -> List[dict]:
         queries = []
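In other words, `format_lineage` flattens the deduplicated `{(parent, child): timestamp}` mapping produced by `LineageLinks` into storage-ready rows:

```python
timestamps = {
    ("dev.bronze.analytics", "dev.silver.analytics"): "2024-04-19 20:20:20.0",
}
rows = DatabricksFormatter.format_lineage(timestamps)
assert rows == [
    {
        "parent_path": "dev.bronze.analytics",
        "child_path": "dev.silver.analytics",
        "timestamp": "2024-04-19 20:20:20.0",
    }
]
```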
castor_extractor/warehouse/databricks/test_constants.py
ADDED

@@ -0,0 +1,79 @@
+OLDER_DATE = "2024-04-18 20:20:20.0"
+CLOSER_DATE = "2024-04-19 20:20:20.0"
+
+MOCK_TABLES_FOR_TABLE_LINEAGE = [
+    {
+        "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
+        "schema_id": "dev.silver",
+        "table_name": "analytics",
+    },
+    {
+        "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
+        "schema_id": "dev.silver",
+        "table_name": "pre_analytics",
+    },
+    {
+        "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
+        "schema_id": "dev.bronze",
+        "table_name": "analytics",
+    },
+]
+
+_RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
+    "upstreams": [
+        {  # there could be other keys: jobInfos, notebookInfos, queryInfos
+            "tableInfo": {
+                "name": "pre_analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "bronze",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": CLOSER_DATE,
+            }
+        },
+    ],
+    "downstreams": [],
+}
+_RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
+    "upstreams": [],
+    "downstreams": [
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+    ],
+}
+_RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
+    "upstreams": [],
+    "downstreams": [
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+    ],
+}
+
+# should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
+TABLE_LINEAGE_SIDE_EFFECT: tuple = (
+    _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
+    _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
+    _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
+)
castor_extractor/warehouse/salesforce/client.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterator, List
+from typing import Dict, Iterator, List, Tuple

 from tqdm import tqdm  # type: ignore

@@ -96,17 +96,19 @@ class SalesforceClient(SalesforceBaseClient):
         """
         sobjects = self.fetch_sobjects()
         logger.info(f"Extracted {len(sobjects)} sobjects")
-        return self.formatter.tables(sobjects)
+        return list(self.formatter.tables(sobjects))

     def columns(
-        self, sobject_names: List[str], show_progress: bool = True
+        self, sobject_names: List[Tuple[str, str]], show_progress: bool = True
     ) -> List[dict]:
         """
         Get salesforce sobject fields as columns
         show_progress: optionally deactivate the tqdm progress bar
         """
         sobject_fields: Dict[str, List[dict]] = dict()
-        for
-
-
+        for api_name, table_name in tqdm(
+            sobject_names, disable=not show_progress
+        ):
+            fields = self.fetch_fields(api_name)
+            sobject_fields[table_name] = fields
         return self.formatter.columns(sobject_fields)
castor_extractor/warehouse/salesforce/extract.py
CHANGED

@@ -72,8 +72,8 @@ class SalesforceExtractionProcessor:
         catalog_locations[WarehouseAsset.TABLE.value] = location
         logger.info(f"Extracted {len(tables)} tables to {location}")

-
-        columns = self._client.columns(
+        sobject_names = [(t["api_name"], t["table_name"]) for t in tables]
+        columns = self._client.columns(sobject_names, show_progress)
         location = self._storage.put(WarehouseAsset.COLUMN.value, columns)
         catalog_locations[WarehouseAsset.COLUMN.value] = location
         logger.info(f"Extracted {len(columns)} columns to {location}")
castor_extractor/warehouse/salesforce/format.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, Iterator, List

 from .constants import SCHEMA_NAME

@@ -35,17 +35,35 @@ def _to_column_payload(field: dict, position: int, table_name: str) -> dict:
     }


-def _to_table_payload(
+def _to_table_payload(sobject: dict, table_name: str) -> dict:
     return {
-        "id":
+        "id": table_name,
+        "api_name": sobject["QualifiedApiName"],
+        "label": sobject["Label"],
         "schema_id": SCHEMA_NAME,
-        "table_name":
+        "table_name": table_name,
         "description": "",
         "tags": [],
         "type": "TABLE",
     }


+def _merge_label_and_api_name(sobject: dict) -> dict:
+    label = sobject["Label"]
+    api_name = sobject["QualifiedApiName"]
+    table_name = f"{label} ({api_name})"
+    return _to_table_payload(sobject, table_name)
+
+
+def _by_label(sobjects: List[dict]) -> Dict[str, List[dict]]:
+    by_label: Dict[str, List[dict]] = dict()
+    for sobject in sobjects:
+        label = sobject["Label"]
+        similar_sobjects = by_label.setdefault(label, [])
+        similar_sobjects.append(sobject)
+    return by_label
+
+
 class SalesforceFormatter:
     """
     Helper functions that format the response in the format to be exported as

@@ -53,9 +71,18 @@ class SalesforceFormatter:
     """

     @staticmethod
-    def tables(sobjects: List[dict]) ->
-        """
-
+    def tables(sobjects: List[dict]) -> Iterator[dict]:
+        """
+        formats the raw list of sobjects to tables
+        if two tables share the same label, then we add the api name as well
+        """
+        by_label = _by_label(sobjects)
+        for label, similars in by_label.items():
+            if len(similars) > 1:
+                yield from [_merge_label_and_api_name(s) for s in similars]
+            else:
+                sobject = similars[0]  # unique sobject on label
+                yield _to_table_payload(sobject, label)

     @staticmethod
     def columns(sobject_fields: Dict[str, List[dict]]) -> List[dict]:
castor_extractor/warehouse/salesforce/format_test.py
CHANGED

@@ -1,4 +1,21 @@
-from
+from typing import Dict, Tuple
+
+from .format import (
+    SCHEMA_NAME,
+    SalesforceFormatter,
+    _by_label,
+    _field_description,
+    _merge_label_and_api_name,
+)
+
+
+def _example_sobjects() -> Tuple[Dict[str, str], ...]:
+    """Returns 4 sobjects with 2 sharing the same label"""
+    a = {"Label": "a", "QualifiedApiName": "a_one"}
+    b = {"Label": "b", "QualifiedApiName": "b"}
+    c = {"Label": "c", "QualifiedApiName": "c"}
+    a_prime = {"Label": "a", "QualifiedApiName": "a_two"}
+    return a, b, c, a_prime


 def test__field_description():

@@ -30,3 +47,34 @@ def test__field_description():
         "- Data Sensitivity Level: bam"
     )
     assert description == expected
+
+
+def test__merge_label_and_api_name():
+    sobject = {"Label": "foo", "QualifiedApiName": "bar"}
+    payload = _merge_label_and_api_name(sobject)
+    expected_name = "foo (bar)"
+    assert payload == {
+        "id": expected_name,
+        "api_name": "bar",
+        "label": "foo",
+        "schema_id": SCHEMA_NAME,
+        "table_name": expected_name,
+        "description": "",
+        "tags": [],
+        "type": "TABLE",
+    }
+
+
+def test__by_label():
+    a, b, c, a_prime = _example_sobjects()
+    sobjects = [a, b, c, a_prime]
+    by_label = _by_label(sobjects)
+    assert by_label == {"a": [a, a_prime], "b": [b], "c": [c]}
+
+
+def test_salesforce_formatter_tables():
+    sobjects = [*_example_sobjects()]
+    tables = SalesforceFormatter.tables(sobjects)
+    expected_names = {"a (a_one)", "a (a_two)", "b", "c"}
+    payload_names = {t["table_name"] for t in tables}
+    assert payload_names == expected_names
castor_extractor/warehouse/snowflake/extract.py
CHANGED

@@ -4,6 +4,7 @@ from ...utils import LocalStorage, from_env, write_summary
 from ..abstract import (
     CATALOG_ASSETS,
     EXTERNAL_LINEAGE_ASSETS,
+    FUNCTIONS_ASSETS,
     QUERIES_ASSETS,
     VIEWS_ASSETS,
     SQLExtractionProcessor,

@@ -20,6 +21,7 @@ logger = logging.getLogger(__name__)

 SNOWFLAKE_ASSETS: SupportedAssets = {
     WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
+    WarehouseAssetGroup.FUNCTION: FUNCTIONS_ASSETS,
     WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
     WarehouseAssetGroup.VIEW_DDL: VIEWS_ASSETS,
     WarehouseAssetGroup.ROLE: (
castor_extractor/warehouse/snowflake/queries/function.sql
ADDED

@@ -0,0 +1,10 @@
+SELECT
+    f.function_name AS name,
+    CONCAT(f.function_catalog, '.', f.function_schema, '.', f.function_name) AS path,
+    f.argument_signature AS signature,
+    f.function_definition AS definition
+FROM snowflake.account_usage.functions f
+WHERE TRUE
+    AND f.function_catalog NOT IN ('SNOWFLAKE', 'UTIL_DB')
+    AND f.function_language = 'SQL'
+    AND deleted IS NULL
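For reference, each row surfaced by this query carries the four aliased fields; the concrete values here are invented for illustration:

```python
# hypothetical row returned by function.sql (illustrative values only)
row = {
    "name": "AREA",
    "path": "ANALYTICS.PUBLIC.AREA",
    "signature": "(RADIUS FLOAT)",
    "definition": "pi() * radius * radius",
}
```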
{castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=QYFobUPMbdi6cidq_yU-oMbXWoAr1BjTE6thfdZ9tA4,10866
 Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
 README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578

@@ -47,7 +47,7 @@ castor_extractor/uploader/utils.py,sha256=NCe0tkB28BVhqzOaDhDjaSfODjjcPWB17X6chn
 castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
 castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
 castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
-castor_extractor/utils/client/api.py,sha256=
+castor_extractor/utils/client/api.py,sha256=z1o4fteWx1HxNTqCYihl9sGkIgSQTbd8lW_B9Y2wyeQ,1742
 castor_extractor/utils/client/api_test.py,sha256=NSMdXg1FLc37erqHp2FZsIsogWVv6lFSs7rDXHikr-E,542
 castor_extractor/utils/client/postgres.py,sha256=n6ulaT222WWPY0_6qAZ0MHF0m91HtI9mMqL71nyygo0,866
 castor_extractor/utils/client/query.py,sha256=O6D5EjD1KmBlwa786Uw4D4kzxx97_HH50xIIeSWt0B8,205

@@ -80,7 +80,7 @@ castor_extractor/utils/pager/pager_on_id_test.py,sha256=CfAXhXaAmCXnm0oflj8_82An
 castor_extractor/utils/pager/pager_on_token.py,sha256=G442SKl4BXJFMPbYIIgCk5M8wl7V3jMg3K1WUUkl0I0,1579
 castor_extractor/utils/pager/pager_on_token_test.py,sha256=w2GCUGKR3cD5lfmtFAsNvExtzxkYdBR0pusBrGKFQ08,2548
 castor_extractor/utils/pager/pager_test.py,sha256=QPBVShSXhkiYZUfnAMs43xnys6CD8pAhL3Jhj-Ov2Xc,1705
-castor_extractor/utils/retry.py,sha256=
+castor_extractor/utils/retry.py,sha256=OsUS3qysHCkgWge8BgBwyuvoWcJ6pR_RQmQDcHlors4,3410
 castor_extractor/utils/retry_test.py,sha256=nsMttlmyKygVcffX3Hay8U2S1BspkGPiCmzIXPpLKyk,2230
 castor_extractor/utils/safe.py,sha256=jpfIimwdBSVUvU2DPFrhqpKC_DSYwxQqd08MlIkSODY,1967
 castor_extractor/utils/safe_test.py,sha256=IHN1Z761tYMFslYC-2HAfkXmFPh4LYSqNLs4QZwykjk,2160

@@ -244,16 +244,16 @@ castor_extractor/visualization/tableau/usage.py,sha256=LlFwlbEr-EnYUJjKZha99CRCR
 castor_extractor/visualization/tableau_revamp/__init__.py,sha256=a3DGjQhaz17gBqW-E84TAgupKbqLC40y5Ajo1yn-ot4,156
 castor_extractor/visualization/tableau_revamp/assets.py,sha256=owlwaI2E4UKk1YhkaHgaAXx6gu3Op6EqZ7bjp0tHI6s,351
 castor_extractor/visualization/tableau_revamp/client/__init__.py,sha256=wmS9uLtUiqNYVloi0-DgD8d2qzu3RVZEAtWiaDp6G_M,90
-castor_extractor/visualization/tableau_revamp/client/client.py,sha256=
+castor_extractor/visualization/tableau_revamp/client/client.py,sha256=RSoHDfz79ma0YJRGpiCihnwLGmoxLzphYrxRVyvByHI,9742
 castor_extractor/visualization/tableau_revamp/client/credentials.py,sha256=fHG32egq6ll2U4BNazalMof_plzfCMQjrN9WOs6kezk,3014
 castor_extractor/visualization/tableau_revamp/client/errors.py,sha256=dTe1shqmWmAXpDpCz-E24m8dGYjt6rvIGV9qQb4jnvI,150
-castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256
+castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256=-V3ToD5Gi7nmfVB2OxTOZw8dcOiF7_ciSWjjW2UdvvI,2270
 castor_extractor/visualization/tableau_revamp/client/tsc_fields.py,sha256=WsDliPCo-XsQ7wN-j0gpW9bdxCHvgH-aePywiltzfbU,688
 castor_extractor/visualization/tableau_revamp/constants.py,sha256=PcdudAogQhi3e-knalhgliMKjy5ahN0em_-7XSLrnxM,87
 castor_extractor/visualization/tableau_revamp/extract.py,sha256=2SLUxp5okM4AcEJJ61ZgcC2ikfZZl9MH17CEXMXmgl0,1450
 castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-castor_extractor/warehouse/abstract/__init__.py,sha256=
-castor_extractor/warehouse/abstract/asset.py,sha256=
+castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
+castor_extractor/warehouse/abstract/asset.py,sha256=9nHL4WKUU_vRgj7u3sUdIzgI4rRpdS7YrfwNku4Gz9Q,2652
 castor_extractor/warehouse/abstract/asset_test.py,sha256=_kd4ybNlWSAdSdEgJKC-jhJTa1nMRa9i8RO3YbqKLM4,758
 castor_extractor/warehouse/abstract/extract.py,sha256=fVBhdE-yMI_g6RBYZcr7q-ZVW7jK7WVkO_GO_KfkRqg,2908
 castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6EFrrlW43w-LiXKo,2632

@@ -277,13 +277,14 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
 castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
 castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
 castor_extractor/warehouse/databricks/__init__.py,sha256=bTvDxjGQGM2J3hOnVhfNmFP1y8DK0tySiD_EXe5_xWE,200
-castor_extractor/warehouse/databricks/client.py,sha256=
-castor_extractor/warehouse/databricks/client_test.py,sha256=
+castor_extractor/warehouse/databricks/client.py,sha256=oHR_htE25p5tiAAFZKbF48efo7tqIENW4dAGA7yEqHg,16895
+castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
 castor_extractor/warehouse/databricks/credentials.py,sha256=PpGv5_GP320UQjV_gvaxSpOw58AmqSznmjGhGfe6bdU,655
-castor_extractor/warehouse/databricks/extract.py,sha256
-castor_extractor/warehouse/databricks/format.py,sha256=
+castor_extractor/warehouse/databricks/extract.py,sha256=VX-3uo5dZucenrg-wnPur3CxOgpC5H7Ds92TO7OTAjc,7379
+castor_extractor/warehouse/databricks/format.py,sha256=2bRy2fa45NW3uk030rmyba4n2Em-NnyZPBurUslEbcw,5522
 castor_extractor/warehouse/databricks/format_test.py,sha256=iPmdJof43fBYL1Sa_fBrCWDQHCHgm7IWCZag1kWkj9E,1970
-castor_extractor/warehouse/databricks/
+castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
+castor_extractor/warehouse/databricks/types.py,sha256=hD6gC8oiT3QSWEvbtgUOGK_lLzzz36sEauB3lS_wxlE,218
 castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
 castor_extractor/warehouse/mysql/client.py,sha256=IwoJvbmE5VZkMCP9yHf6ta3_AQPEuBPrZZ3meefbcJs,974
 castor_extractor/warehouse/mysql/client_test.py,sha256=wRTv-3c5chy_HKj-buasNiYOOCIfynYqbabM4Hxdh5E,1052

@@ -323,22 +324,23 @@ castor_extractor/warehouse/redshift/queries/user.sql,sha256=sEXveJAuNvZacvpI6Wfw
 castor_extractor/warehouse/redshift/queries/view_ddl.sql,sha256=Pkyh_QT6d4rhTeyiVcqw6O8CRl7NEhk2p7eM5YIn5kg,719
 castor_extractor/warehouse/redshift/query.py,sha256=0C81rkt2cpkWrJIxxwALDyqr-49vlqQM04y_N6wwStc,540
 castor_extractor/warehouse/salesforce/__init__.py,sha256=NR4aNea5jeE1xYqeZ_29deeN84CkN0_D_Z7CLQdJvFY,137
-castor_extractor/warehouse/salesforce/client.py,sha256=
+castor_extractor/warehouse/salesforce/client.py,sha256=ETnZ3n-GFFH0XohDB2ft74wI1HMspvTefR3k7ne-pmI,3891
 castor_extractor/warehouse/salesforce/constants.py,sha256=GusduVBCPvwpk_Im6F3bDvXeNQ7hRnCMdIAjIg65RnE,52
-castor_extractor/warehouse/salesforce/extract.py,sha256=
-castor_extractor/warehouse/salesforce/format.py,sha256=
-castor_extractor/warehouse/salesforce/format_test.py,sha256=
+castor_extractor/warehouse/salesforce/extract.py,sha256=IbhkCli8bSn7tjhRNlaD_HhfmZmv-5E5ajZfEUh68Hs,3438
+castor_extractor/warehouse/salesforce/format.py,sha256=f5mMJyPsVU1ZSLe5WGCUOpj2SyW7_DFfzNVNu_m2aV0,3126
+castor_extractor/warehouse/salesforce/format_test.py,sha256=HBlAYBoCOHaq_QOFudZlpcZb5TyZWV9v-cxK4tklg50,2253
 castor_extractor/warehouse/salesforce/soql.py,sha256=pAEaJE8ZUcyN3ptBsZGzNcGRhCcU81X6RMlnF1HRMw4,1063
 castor_extractor/warehouse/snowflake/__init__.py,sha256=TEGXTyxWp4Tr9gIHb-UFVTRKj6YWmrRtqHruiKSZGiY,174
 castor_extractor/warehouse/snowflake/client.py,sha256=XT0QLVNff_586SDuMe40iu8FCwPDh2uBV5aKc1Ql914,5555
 castor_extractor/warehouse/snowflake/client_test.py,sha256=ihWtOOAQfh8pu5JTr_EWfqefKOVIaJXznACURzaU1Qs,1432
 castor_extractor/warehouse/snowflake/credentials.py,sha256=wbUdbx9jVSHzg2kNDhMFuDstbVTyZOcGAwnSeGeFIqs,875
 castor_extractor/warehouse/snowflake/credentials_test.py,sha256=Lkc-DHXOvr50KrqAW4nt_x0IA0Mu_CsBVu6ATnzQB6I,673
-castor_extractor/warehouse/snowflake/extract.py,sha256=
+castor_extractor/warehouse/snowflake/extract.py,sha256=fcze0VBe9OOAFSr25T9L6CY506Vm_xDEvvy8NWuLW1s,2956
 castor_extractor/warehouse/snowflake/queries/.sqlfluff,sha256=vttrwcr64JVIuvc7WIg9C54cbOkjg_VjXNR7YnTGOPE,31
 castor_extractor/warehouse/snowflake/queries/column.sql,sha256=pAW2UNnut0a483OY2rjOXCdCtQg0g254g61Bt51CIB4,1803
 castor_extractor/warehouse/snowflake/queries/column_lineage.sql,sha256=YKBiZ6zySSNcXLDXwm31EjGIIkkkZc0-S6hI1SRM80o,1179
 castor_extractor/warehouse/snowflake/queries/database.sql,sha256=ifZXoKUXtsrGOxml6AcNhA4yybIyatH5va7bcp-lgCU,483
+castor_extractor/warehouse/snowflake/queries/function.sql,sha256=8LRh0ybhd-RldJ8UZspWUm3yv52evq11O2uqIO4KqeQ,372
 castor_extractor/warehouse/snowflake/queries/grant_to_role.sql,sha256=O7AJ1LzoXGDFmiVvQ8EMJ5x8FSAnaxRPdmRyAlEmkUM,272
 castor_extractor/warehouse/snowflake/queries/grant_to_user.sql,sha256=7AalVajU5vRRpIiys1igSwmDXirbwpMTvJr2ihSz2NE,143
 castor_extractor/warehouse/snowflake/queries/query.sql,sha256=-OYcWUvdPBkpOfezkZaW7hrOdDz3JyoqjNdRm_88Rsk,1779

@@ -368,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
 castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
 castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
 castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
-castor_extractor-0.16.
-castor_extractor-0.16.
-castor_extractor-0.16.
-castor_extractor-0.16.
-castor_extractor-0.16.
+castor_extractor-0.16.15.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.16.15.dist-info/METADATA,sha256=CsdtS6LQFjsgi0A7tj0sMwtkQVYBye4Savn2DFGBHso,6583
+castor_extractor-0.16.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+castor_extractor-0.16.15.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
+castor_extractor-0.16.15.dist-info/RECORD,,
{castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/LICENCE
File without changes

{castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/WHEEL
File without changes

{castor_extractor-0.16.9.dist-info → castor_extractor-0.16.15.dist-info}/entry_points.txt
File without changes