castor-extractor 0.16.11__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of castor-extractor might be problematic.
- CHANGELOG.md +20 -0
- castor_extractor/commands/upload.py +4 -4
- castor_extractor/uploader/constant.py +8 -1
- castor_extractor/uploader/upload.py +63 -46
- castor_extractor/uploader/upload_test.py +4 -3
- castor_extractor/uploader/utils.py +10 -0
- castor_extractor/utils/client/api.py +8 -3
- castor_extractor/utils/retry.py +3 -1
- castor_extractor/visualization/tableau_revamp/client/client.py +5 -2
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +10 -1
- castor_extractor/warehouse/abstract/__init__.py +2 -0
- castor_extractor/warehouse/abstract/asset.py +14 -0
- castor_extractor/warehouse/databricks/client.py +239 -3
- castor_extractor/warehouse/databricks/client_test.py +61 -1
- castor_extractor/warehouse/databricks/extract.py +36 -0
- castor_extractor/warehouse/databricks/format.py +13 -0
- castor_extractor/warehouse/databricks/test_constants.py +79 -0
- castor_extractor/warehouse/databricks/types.py +6 -1
- castor_extractor/warehouse/snowflake/extract.py +2 -0
- castor_extractor/warehouse/snowflake/queries/function.sql +10 -0
- {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/METADATA +1 -1
- {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/RECORD +25 -23
- {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/WHEEL +0 -0
- {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

@@ -1,5 +1,25 @@
 # Changelog
 
+## 0.17.0 - 2024-06-10
+
+* Uploader: redirect to the proxy, replace credentials with token
+
+## 0.16.15 - 2024-06-07
+
+* Tableau: extract database_name for CustomSQLTables
+
+## 0.16.14 - 2024-06-06
+
+* Snowflake: Extract SQL user defined function
+
+## 0.16.13 - 2024-06-05
+
+* Tableau: extract database_name for tables
+
+## 0.16.12 - 2024-06-04
+
+* Databricks: Extract lineage
+
 ## 0.16.11 - 2024-06-03
 
 * Tableau: add extra fields to optimise storage

castor_extractor/commands/upload.py
CHANGED

@@ -13,10 +13,10 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
 def _args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-
-        "--
+        "-k",
+        "--token",
         required=True,
-        help="""
+        help="""API token provided by Castor""",
     )
     parser.add_argument(
         "-s",
@@ -44,7 +44,7 @@ def _args():
     )
     parsed = parser.parse_args()
     return {
-        "
+        "token": parsed.token,
         "source_id": parsed.source_id,
         "file_path": parsed.file_path,
         "directory_path": parsed.directory_path,

castor_extractor/uploader/constant.py
CHANGED

@@ -1,6 +1,13 @@
 from enum import Enum
 
-
+from ..utils import RetryStrategy
+
+# url of the gcs proxy
+INGEST_URL = "https://ingest.castordoc.com"
+
+RETRY_BASE_MS = 10_000
+RETRY_JITTER_MS = 1_000
+RETRY_STRATEGY = RetryStrategy.LINEAR
 
 
 class FileType(Enum):

castor_extractor/uploader/upload.py
CHANGED

@@ -1,83 +1,100 @@
 #!/usr/bin/env python3
-import json
 import logging
 import ntpath
 from datetime import datetime
-from typing import Iterable, Optional,
+from typing import Dict, Iterable, Optional, Tuple
 from uuid import UUID
 
-
-
-from .
+import requests
+
+from ..utils.retry import retry
+from .constant import (
+    INGEST_URL,
+    PATH_TEMPLATES,
+    RETRY_BASE_MS,
+    RETRY_JITTER_MS,
+    RETRY_STRATEGY,
+    FileType,
+)
 from .env import get_blob_env
-from .utils import
+from .utils import iter_files
 
 logger = logging.getLogger(__name__)
 
+_EXCEPTIONS = (
+    requests.exceptions.Timeout,
+    requests.exceptions.ConnectTimeout,
+)
 
-def _client(credentials: Union[str, dict]) -> storage.Client:
-    """supports dict, string or path to the JSON file"""
-    if isinstance(credentials, dict):
-        return storage.Client.from_service_account_info(credentials)
-    if file_exist(credentials):
-        return storage.Client.from_service_account_json(credentials)
-    if isinstance(credentials, str):
-        credentials = json.loads(credentials)
-        return storage.Client.from_service_account_info(credentials)
-    raise ValueError("needs path or dict for credentials")
 
+def _path_and_url(
+    source_id: UUID,
+    file_type: FileType,
+    file_path: str,
+) -> Tuple[str, str]:
 
-def _path(source_id: UUID, file_type: FileType, file_path: str) -> str:
     now = datetime.utcnow()
     timestamp = int(now.timestamp())
     filename = ntpath.basename(file_path)
-
     path_template = PATH_TEMPLATES[file_type]
-
+    path = path_template.format(
         timestamp=timestamp,
         source_id=source_id,
         filename=filename,
     )
 
+    url = f"{INGEST_URL}/{path}"
 
-
-    credentials: Union[str, dict],
-    source_id: UUID,
-    file_path: str,
-    file_type: FileType,
-) -> storage.Blob:
-    """get the target blob to upload to"""
-    client = _client(credentials)
-    path = _path(source_id, file_type, file_path)
+    return path, url
 
-
-
+
+def _headers(token: str) -> Dict:
+    return {
+        "Authorization": f"Token {token}",
+        "Accept": "text/csv, application/json",
+    }
 
 
 def _upload(
-
+    token: str,
     source_id: UUID,
     file_path: str,
     file_type: FileType,
 ) -> None:
     """
-
-
-
-    file_path: path to the local file to upload
+    Upload the given file to Google Cloud Storage (GCS)
+    - Don't call GCS API directly
+    - Call the ingestion proxy which handles authorisation and uploading
     """
-
+    path, url = _path_and_url(source_id, file_type, file_path)
+    headers = _headers(token)
+    timeout, max_retries = get_blob_env()
 
-
-
-
-
-
-
+    with open(file_path, "rb") as file_content:
+
+        @retry(
+            exceptions=_EXCEPTIONS,
+            max_retries=max_retries,
+            base_ms=RETRY_BASE_MS,
+            jitter_ms=RETRY_JITTER_MS,
+            strategy=RETRY_STRATEGY,
+        )
+        def _request_post():
+            response = requests.post(
+                url=url,
+                headers=headers,
+                data=file_content,
+                timeout=timeout,
+            )
+            response.raise_for_status()
+
+        _request_post()
+
+    logger.info(f"Uploaded {file_path} as {file_type.value} to {path}")
 
 
 def upload_manifest(
-
+    token: str,
     source_id: UUID,
     file_path: str,
 ) -> None:
@@ -86,11 +103,11 @@ def upload_manifest(
     source_id: id for the source
     file_path: path to the local manifest to upload
     """
-    _upload(
+    _upload(token, source_id, file_path, FileType.DBT)
 
 
 def upload(
-
+    token: str,
     source_id: UUID,
     file_type: FileType,
     file_path: Optional[str] = None,
@@ -113,4 +130,4 @@ def upload(
         raise ValueError(message)
 
     for file_ in files:
-        _upload(
+        _upload(token, source_id, file_, file_type)

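For callers of the uploader, the visible change is that the Castor API token replaces the former GCS service-account credentials. A minimal usage sketch, assuming the module paths shown in this diff; the token, source id, and the `directory_path` keyword are illustrative placeholders, not confirmed by the diff:

    from uuid import UUID

    from castor_extractor.uploader.constant import FileType
    from castor_extractor.uploader.upload import upload, upload_manifest

    # hypothetical values: the token provided by Castor and your source id
    TOKEN = "xxxx"  # replaces the old service-account credentials
    SOURCE_ID = UUID("00000000-0000-0000-0000-000000000000")

    # upload every allowed file from a local export directory
    upload(
        token=TOKEN,
        source_id=SOURCE_ID,
        file_type=FileType.VIZ,
        directory_path="./exports",  # assumption: keyword kept from the previous signature
    )

    # or push a single dbt manifest
    upload_manifest(token=TOKEN, source_id=SOURCE_ID, file_path="./target/manifest.json")

Each file is now POSTed to the ingestion proxy with an `Authorization: Token ...` header, and timeouts are retried with the linear back-off constants added in constant.py.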
castor_extractor/uploader/upload_test.py
CHANGED

@@ -1,7 +1,7 @@
 from uuid import UUID
 
-from .constant import FileType
-from .upload import
+from .constant import INGEST_URL, FileType
+from .upload import _path_and_url
 
 
 def test__path():
@@ -9,5 +9,6 @@ def test__path():
     file_type = FileType.VIZ
     file_path = "filename"
 
-    path =
+    path, url = _path_and_url(source_id, file_type, file_path)
     assert path == f"visualization-{source_id}/{file_path}"
+    assert url == f"{INGEST_URL}/{path}"

castor_extractor/uploader/utils.py
CHANGED

@@ -1,13 +1,23 @@
+import logging
 import os
 from typing import Iterator
 
+logger = logging.getLogger(__name__)
+
+_ALLOWED_EXTENSION = (".json", ".csv")
+
 
 def iter_files(repository_path: str) -> Iterator[str]:
     """
     Given a repository path yield all files in that repository
+    Removes file whose extension is not allowed
     """
 
     for file in os.listdir(repository_path):
+        _, ext = os.path.splitext(file)
+        if ext not in _ALLOWED_EXTENSION:
+            logger.info(f"Forbidden file extension : skipping {file}")
+            continue
         file_path = os.path.join(repository_path, file)
 
         if os.path.isfile(file_path):

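The new extension filter means stray files in an export directory are skipped rather than uploaded. A small illustration, using a throwaway directory (names are made up):

    import os
    import tempfile

    from castor_extractor.uploader.utils import iter_files

    # build a directory with one allowed and one forbidden file
    directory = tempfile.mkdtemp()
    for name in ("tables.csv", "notes.txt"):
        with open(os.path.join(directory, name), "w") as f:
            f.write("example")

    # only the .csv file is yielded; the .txt file is logged and skipped
    print(list(iter_files(directory)))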
castor_extractor/utils/client/api.py
CHANGED

@@ -5,7 +5,7 @@ import requests
 
 logger = logging.getLogger(__name__)
 
-
+DEFAULT_TIMEOUT_S = 30
 
 # https://requests.readthedocs.io/en/latest/api/#requests.request
 HttpMethod = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
@@ -20,7 +20,7 @@ class APIClient:
     def __init__(self, host: str, token: Optional[str] = None):
         self._host = host
         self._token = token or ""
-        self._timeout =
+        self._timeout = DEFAULT_TIMEOUT_S
 
     @staticmethod
     def build_url(host: str, path: str):
@@ -44,7 +44,12 @@ class APIClient:
     ) -> Any:
         logger.debug(f"Calling {method} on {url}")
         result = requests.request(
-            method,
+            method,
+            url,
+            headers=self._headers(),
+            params=params,
+            json=data,
+            timeout=self._timeout,
         )
         result.raise_for_status()
 

castor_extractor/utils/retry.py
CHANGED

@@ -68,7 +68,8 @@ class Retry(BaseModel):
         self._retry_attempts += 1
         wait_ms = self.base() + self.jitter()
         wait_s = float(wait_ms) / MS_IN_SEC
-
+        msg = f"Attempting a new call in {wait_s} seconds, {self._retry_attempts} attempt(s) / {self.max_retries} max retries"
+        logger.warning(msg)
         time.sleep(wait_s)
         return True
 
@@ -93,6 +94,7 @@ def retry(
         try:
            return None, callable(*args, **kwargs)
         except exceptions_ as err:
+            logger.warning(f"Exception within {callable.__name__}")
            return err, None
 
     def _func(*args, **kwargs) -> Any:
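The warnings added here surface each intercepted exception and the back-off before the next attempt. A sketch of the decorator as this release uses it in the uploader and the Databricks client, wrapped around an illustrative flaky call (URL and function name are placeholders):

    import logging

    import requests

    from castor_extractor.utils.retry import retry

    logging.basicConfig(level=logging.INFO)


    @retry(
        exceptions=(requests.exceptions.ConnectTimeout,),
        max_retries=3,
        base_ms=1_000,
    )
    def fetch_health() -> int:
        # any call raising one of the listed exceptions is retried,
        # and each failure now emits a warning log before the next attempt
        response = requests.get("https://example.com/health", timeout=5)
        return response.status_code


    print(fetch_health())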
castor_extractor/visualization/tableau_revamp/client/client.py
CHANGED

@@ -31,8 +31,11 @@ _TSC_ASSETS = (
 # increase the value when extraction is too slow
 # decrease the value when timeouts arise
 _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
-    #
-
+    # for some clients, extraction of columns tend to hit the node limit
+    # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
+    # the workaround is to reduce pagination
+    TableauRevampAsset.COLUMN: 50,
+    # fields are light but volumes are bigger
     TableauRevampAsset.FIELD: 1000,
     TableauRevampAsset.TABLE: 50,
 }

castor_extractor/visualization/tableau_revamp/client/gql_queries.py
CHANGED

@@ -63,12 +63,21 @@ downstreamWorkbooks { id }
 id
 name
 ... on DatabaseTable {
-connectionType
 fullName
 schema
+database {
+  connectionType
+  id
+  name
+}
 }
 ... on CustomSQLTable {
 query
+database {
+  connectionType
+  id
+  name
+}
 }
 """
 
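With the nested `database` selection in place, both DatabaseTable and CustomSQLTable nodes carry their parent database, which is what enables the `database_name` extraction mentioned in the changelog. A hedged sketch of reading that field, assuming a response node shaped like the selection above (the values are made up):

    from typing import Optional

    # example node shaped like the GraphQL selection above
    table_node = {
        "id": "abc",
        "name": "orders",
        "schema": "public",
        "database": {"connectionType": "snowflake", "id": "db-1", "name": "ANALYTICS"},
    }


    def database_name(node: dict) -> Optional[str]:
        """Return the parent database name when the Metadata API provides one."""
        database = node.get("database") or {}
        return database.get("name")


    print(database_name(table_node))  # ANALYTICS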
castor_extractor/warehouse/abstract/asset.py
CHANGED

@@ -7,6 +7,8 @@ from ...types import ExternalAsset, classproperty
 class WarehouseAsset(ExternalAsset):
     """Assets that can be extracted from warehouses"""
 
+    ADDITIONAL_COLUMN_LINEAGE = "additional_column_lineage"
+    ADDITIONAL_TABLE_LINEAGE = "additional_table_lineage"
     COLUMN = "column"
     COLUMN_LINEAGE = "column_lineage"  # specific to snowflake
     DATABASE = "database"
@@ -19,22 +21,28 @@ class WarehouseAsset(ExternalAsset):
     ROLE = "role"
     SCHEMA = "schema"
     TABLE = "table"
+    FUNCTION = "function"
     USER = "user"
     VIEW_DDL = "view_ddl"
 
     @classproperty
     def optional(cls) -> Set["WarehouseAsset"]:
         return {
+            WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+            WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
             WarehouseAsset.EXTERNAL_COLUMN_LINEAGE,
             WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
+            WarehouseAsset.FUNCTION,
         }
 
 
 class WarehouseAssetGroup(Enum):
     """Groups of assets that can be extracted together"""
 
+    ADDITIONAL_LINEAGE = "additional_lineage"
     CATALOG = "catalog"
     EXTERNAL_LINEAGE = "external_lineage"
+    FUNCTION = "function"
     QUERY = "query"
     ROLE = "role"
     SNOWFLAKE_LINEAGE = "snowflake_lineage"
@@ -53,6 +61,7 @@ CATALOG_ASSETS = (
 )
 
 # shared by technologies supporting queries
+FUNCTIONS_ASSETS = (WarehouseAsset.FUNCTION,)
 QUERIES_ASSETS = (WarehouseAsset.QUERY,)
 VIEWS_ASSETS = (WarehouseAsset.VIEW_DDL,)
 
@@ -61,6 +70,11 @@ EXTERNAL_LINEAGE_ASSETS = (
     WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
 )
 
+ADDITIONAL_LINEAGE_ASSETS = (
+    WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+    WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
+)
+
 NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}
 
 
castor_extractor/warehouse/databricks/client.py
CHANGED

@@ -1,18 +1,38 @@
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from datetime import date
 from functools import partial
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set, Tuple, cast
 
-
+import requests
+
+from ...utils import (
+    SafeMode,
+    at_midnight,
+    date_after,
+    mapping_from_rows,
+    retry,
+    safe_mode,
+)
 from ...utils.client.api import APIClient
 from ...utils.pager import PagerOnToken
 from ..abstract.time_filter import TimeFilter
 from .credentials import DatabricksCredentials
 from .format import DatabricksFormatter
-from .types import TablesColumns
+from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink
 
 logger = logging.getLogger(__name__)
 
+_MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
+_MAX_THREADS = 10
+_RETRY_ATTEMPTS = 3
+_RETRY_BASE_MS = 1000
+_RETRY_EXCEPTIONS = [
+    requests.exceptions.ConnectTimeout,
+]
+
+safe_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
+
 
 def _day_to_epoch_ms(day: date) -> int:
     return int(at_midnight(day).timestamp() * 1000)
@@ -22,6 +42,30 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
     return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
 
 
+class LineageLinks:
+    """
+    helper class that handles lineage deduplication and filtering
+    """
+
+    def __init__(self):
+        self.lineage: Dict[Link, Ostr] = dict()
+
+    def add(self, timestamped_link: TimestampedLink) -> None:
+        """
+        keep the most recent lineage link, adding to `self.lineage`
+        """
+        parent, child, timestamp = timestamped_link
+        link = (parent, child)
+        if not self.lineage.get(link):
+            self.lineage[link] = timestamp
+        else:
+            if not timestamp:
+                return
+            # keep most recent link; cast for mypy
+            recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
+            self.lineage[link] = recent
+
+
 class DatabricksClient(APIClient):
     """Databricks Client"""
 
@@ -123,6 +167,198 @@ class DatabricksClient(APIClient):
             columns.extend(c_to_add)
         return tables, columns
 
+    @staticmethod
+    def _to_table_path(table: dict) -> Ostr:
+        if table.get("name"):
+            return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
+        return None
+
+    @staticmethod
+    def _to_column_path(column: dict) -> Ostr:
+        if column.get("name"):
+            return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
+        return None
+
+    def _link(
+        self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
+    ) -> OTimestampedLink:
+        """exclude missing path and self-lineage"""
+        if (not path_from) or (not path_to):
+            return None
+        is_self_lineage = path_from.lower() == path_to.lower()
+        if is_self_lineage:
+            return None
+        return (path_from, path_to, timestamp)
+
+    def _single_table_lineage_links(
+        self, table_path: str, single_table_lineage: dict
+    ) -> List[TimestampedLink]:
+        """
+        process databricks lineage API response for a given table
+        returns a list of (parent, child, timestamp)
+
+        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+        we could also have `notebookInfos` or `fileInfo`
+        """
+        links: List[OTimestampedLink] = []
+        # add parent:
+        for link in single_table_lineage.get("upstreams", []):
+            parent = link.get("tableInfo", {})
+            parent_path = self._to_table_path(parent)
+            timestamp: Ostr = parent.get("lineage_timestamp")
+            links.append(self._link(parent_path, table_path, timestamp))
+
+        # add children:
+        for link in single_table_lineage.get("downstreams", []):
+            child = link.get("tableInfo", {})
+            child_path = self._to_table_path(child)
+            timestamp = child.get("lineage_timestamp")
+            links.append(self._link(table_path, child_path, timestamp))
+
+        return list(filter(None, links))
+
+    @safe_mode(safe_params, lambda: [])
+    @retry(
+        exceptions=_RETRY_EXCEPTIONS,
+        max_retries=_RETRY_ATTEMPTS,
+        base_ms=_RETRY_BASE_MS,
+    )
+    def get_single_table_lineage(
+        self, table_path: str
+    ) -> List[TimestampedLink]:
+        """
+        Helper function used in get_lineage_links.
+        Call data lineage API and return the content of the result
+        eg table_path: broward_prd.bronze.account_adjustments
+        FYI: Maximum rate of 50 requests per SECOND
+        """
+        path = "api/2.0/lineage-tracking/table-lineage"
+        payload = {"table_name": table_path, "include_entity_lineage": True}
+        content = self.get(path=path, payload=payload)
+        return self._single_table_lineage_links(table_path, content)
+
+    def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
+        deduplicated_lineage = LineageLinks()
+        for timestamped_link in lineages:
+            deduplicated_lineage.add(timestamped_link)
+        return deduplicated_lineage.lineage
+
+    def table_lineage(self, tables: List[dict]) -> List[dict]:
+        """
+        Wrapper function that retrieves all table lineage
+        """
+        # retrieve table lineage
+        with ThreadPoolExecutor(max_workers=_MAX_THREADS) as executor:
+            table_paths = [
+                ".".join([table["schema_id"], table["table_name"]])
+                for table in tables
+            ]
+            results = executor.map(self.get_single_table_lineage, table_paths)
+        lineages = [link for links in results for link in links]
+        deduplicated = self._deduplicate_lineage(lineages)
+        return self.formatter.format_lineage(deduplicated)
+
+    @staticmethod
+    def _paths_for_column_lineage(
+        tables: List[dict], columns: List[dict], table_lineage: List[dict]
+    ) -> List[Tuple[str, str]]:
+        """
+        helper providing a list of candidate columns to look lineage for:
+        we only look for column lineage where there is table lineage
+        """
+        # mapping between table id and its path db.schema.table
+        # table["schema_id"] follows the pattern `db.schema`
+        mapping = {
+            table["id"]: ".".join([table["schema_id"], table["table_name"]])
+            for table in tables
+        }
+
+        tables_with_lineage: Set[str] = set()
+        for t in table_lineage:
+            tables_with_lineage.add(t["parent_path"])
+            tables_with_lineage.add(t["child_path"])
+
+        paths_to_return: List[Tuple[str, str]] = []
+        for column in columns:
+            table_path = mapping[column["table_id"]]
+            if table_path not in tables_with_lineage:
+                continue
+            column_ = (table_path, column["column_name"])
+            paths_to_return.append(column_)
+
+        return paths_to_return
+
+    def _single_column_lineage_links(
+        self, column_path: str, single_column_lineage: dict
+    ) -> List[TimestampedLink]:
+        """
+        process databricks lineage API response for a given table
+        returns a list of (parent, child, timestamp)
+
+        Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+        we could also have `notebookInfos` or `fileInfo`
+        """
+        links: List[OTimestampedLink] = []
+        # add parent:
+        for link in single_column_lineage.get("upstream_cols", []):
+            parent_path = self._to_column_path(link)
+            timestamp: Ostr = link.get("lineage_timestamp")
+            links.append(self._link(parent_path, column_path, timestamp))
+
+        # add children:
+        for link in single_column_lineage.get("downstream_cols", []):
+            child_path = self._to_column_path(link)
+            timestamp = link.get("lineage_timestamp")
+            links.append(self._link(column_path, child_path, timestamp))
+
+        return list(filter(None, links))
+
+    @safe_mode(safe_params, lambda: [])
+    @retry(
+        exceptions=_RETRY_EXCEPTIONS,
+        max_retries=_RETRY_ATTEMPTS,
+        base_ms=_RETRY_BASE_MS,
+    )
+    def get_single_column_lineage(
+        self,
+        names: Tuple[str, str],
+    ) -> List[TimestampedLink]:
+        """
+        Helper function used in get_lineage_links.
+        Call data lineage API and return the content of the result
+
+        eg table_path: broward_prd.bronze.account_adjustments
+        FYI: Maximum rate of 10 requests per SECOND
+        """
+        table_path, column_name = names
+        api_path = "api/2.0/lineage-tracking/column-lineage"
+        payload = {
+            "table_name": table_path,
+            "column_name": column_name,
+            "include_entity_lineage": True,
+        }
+        content = self.get(path=api_path, payload=payload)
+        column_path = f"{table_path}.{column_name}"
+        return self._single_column_lineage_links(column_path, content)
+
+    def column_lineage(
+        self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
+    ) -> List[dict]:
+        """
+        Wrapper function that retrieves all column lineage
+        we only try to retrieve column lineage if we found table lineage
+        """
+        candidate_paths = self._paths_for_column_lineage(
+            tables, columns, table_lineage
+        )
+        lineages: List[TimestampedLink] = [
+            link
+            for paths in candidate_paths
+            for link in self.get_single_column_lineage(paths)
+        ]
+        deduplicated = self._deduplicate_lineage(lineages)
+        return self.formatter.format_lineage(deduplicated)
+
     @staticmethod
     def _time_filter(time_filter: Optional[TimeFilter]) -> dict:
         """time filter to retrieve Databricks' queries"""
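The deduplication rule in `LineageLinks` is: one entry per (parent, child) pair, keeping the most recent non-empty timestamp. A short sketch of that behaviour, using the class exactly as added above (paths and timestamps mirror the test data):

    from castor_extractor.warehouse.databricks.client import LineageLinks

    links = LineageLinks()

    # the same (parent, child) pair reported twice with different timestamps
    links.add(("dev.bronze.analytics", "dev.silver.analytics", "2024-04-18 20:20:20.0"))
    links.add(("dev.bronze.analytics", "dev.silver.analytics", "2024-04-19 20:20:20.0"))

    # a link without a timestamp never overwrites an existing one
    links.add(("dev.bronze.analytics", "dev.silver.analytics", None))

    # {('dev.bronze.analytics', 'dev.silver.analytics'): '2024-04-19 20:20:20.0'}
    print(links.lineage)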
castor_extractor/warehouse/databricks/client_test.py
CHANGED

@@ -1,9 +1,16 @@
 from datetime import date
+from unittest.mock import Mock, patch
 
 from freezegun import freeze_time
 
 from ..abstract.time_filter import TimeFilter
-from .client import DatabricksClient, _day_hour_to_epoch_ms
+from .client import DatabricksClient, LineageLinks, _day_hour_to_epoch_ms
+from .test_constants import (
+    CLOSER_DATE,
+    MOCK_TABLES_FOR_TABLE_LINEAGE,
+    OLDER_DATE,
+    TABLE_LINEAGE_SIDE_EFFECT,
+)
 
 
 def test__day_hour_to_epoch_ms():
@@ -97,3 +104,56 @@ def test_DatabricksClient__match_table_with_user():
     table_without_owner = {"id": 1, "owner_email": None}
     actual = client._match_table_with_user(table_without_owner, user_mapping)
     assert actual == table_without_owner
+
+
+@patch(
+    "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksClient.get",
+    side_effect=TABLE_LINEAGE_SIDE_EFFECT,
+)
+def test_DatabricksClient_table_lineage(mock_get):
+    client = DatabricksClient(Mock())
+
+    lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
+    assert len(lineage) == 2
+
+    expected_link_1 = {
+        "parent_path": "dev.silver.pre_analytics",
+        "child_path": "dev.silver.analytics",
+        "timestamp": OLDER_DATE,
+    }
+    expected_link_2 = {
+        "parent_path": "dev.bronze.analytics",
+        "child_path": "dev.silver.analytics",
+        "timestamp": CLOSER_DATE,
+    }
+    assert expected_link_1 in lineage
+    assert expected_link_2 in lineage
+
+
+def test_LineageLinks_add():
+    links = LineageLinks()
+    timestamped_link = ("parent", "child", None)
+    expected_key = ("parent", "child")
+
+    links.add(timestamped_link)
+
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] is None
+
+    # we replace None by an actual timestamp
+    timestamped_link = ("parent", "child", OLDER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == OLDER_DATE
+
+    # we update with the more recent timestamp
+    timestamped_link = ("parent", "child", CLOSER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == CLOSER_DATE
+
+    # we keep the more recent timestamp
+    timestamped_link = ("parent", "child", OLDER_DATE)
+    links.add(timestamped_link)
+    assert expected_key in links.lineage
+    assert links.lineage[expected_key] == CLOSER_DATE

castor_extractor/warehouse/databricks/extract.py
CHANGED

@@ -3,6 +3,7 @@ from typing import Dict, Optional
 
 from ...utils import AbstractStorage, LocalStorage, write_summary
 from ..abstract import (
+    ADDITIONAL_LINEAGE_ASSETS,
     CATALOG_ASSETS,
     EXTERNAL_LINEAGE_ASSETS,
     QUERIES_ASSETS,
@@ -17,6 +18,7 @@ from .client import DatabricksClient
 from .credentials import to_credentials
 
 DATABRICKS_ASSETS: SupportedAssets = {
+    WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
     WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
     WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
     WarehouseAssetGroup.ROLE: (WarehouseAsset.USER,),
@@ -94,6 +96,39 @@ class DatabricksExtractionProcessor:
         logger.info(f"Extracted {len(columns)} columns to {location}")
         return catalog_locations
 
+    def extract_lineage(self) -> Paths:
+        if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
+            return self._existing_group_paths(
+                WarehouseAssetGroup.ADDITIONAL_LINEAGE
+            )
+        lineage_locations: Dict[str, str] = dict()
+
+        # extract catalog
+        databases = self._client.databases()
+        schemas = self._client.schemas(databases)
+        users = self._client.users()
+        tables, columns = self._client.tables_and_columns(schemas, users)
+        logger.info("Extracted pre-requisite catalog. Next comes lineage")
+
+        # extract table lineage
+        table_lineage = self._client.table_lineage(tables)
+        table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
+        location = self._storage.put(table_lineage_key, table_lineage)
+        lineage_locations[table_lineage_key] = location
+        msg = f"Extracted {len(table_lineage)} table lineage to {location}"
+        logger.info(msg)
+
+        # extract column lineage
+        column_lineage = self._client.column_lineage(
+            tables, columns, table_lineage
+        )
+        column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
+        location = self._storage.put(column_lineage_key, column_lineage)
+        lineage_locations[column_lineage_key] = location
+        msg = f"Extracted {len(column_lineage)} column lineage to {location}"
+        logger.info(msg)
+        return lineage_locations
+
     def extract_query(self, time_filter: OTimeFilter = None) -> Paths:
         """extract yesterday's queries and return their location"""
         if self._should_not_reextract(WarehouseAssetGroup.QUERY):
@@ -149,6 +184,7 @@ def extract_all(**kwargs) -> None:
     )
 
     extractor.extract_catalog()
+    extractor.extract_lineage()
     extractor.extract_query()
     extractor.extract_role()
     extractor.extract_view_ddl()
castor_extractor/warehouse/databricks/format.py
CHANGED

@@ -95,6 +95,19 @@ class DatabricksFormatter:
 
         return tables, columns
 
+    @staticmethod
+    def format_lineage(timestamps: dict) -> List[dict]:
+        lineage: List[dict] = []
+        for link, timestamp in timestamps.items():
+            parent_path, child_path = link
+            link_ = {
+                "parent_path": parent_path,
+                "child_path": child_path,
+                "timestamp": timestamp,
+            }
+            lineage.append(link_)
+        return lineage
+
     @staticmethod
     def format_query(raw_queries: List[dict]) -> List[dict]:
         queries = []
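`format_lineage` is the bridge between the deduplicated `{(parent, child): timestamp}` mapping built by `LineageLinks` and the row-per-link payload that the extraction processor writes to storage. A minimal sketch:

    from castor_extractor.warehouse.databricks.format import DatabricksFormatter

    deduplicated = {
        ("dev.silver.pre_analytics", "dev.silver.analytics"): "2024-04-18 20:20:20.0",
        ("dev.bronze.analytics", "dev.silver.analytics"): "2024-04-19 20:20:20.0",
    }

    rows = DatabricksFormatter.format_lineage(deduplicated)
    # [{'parent_path': ..., 'child_path': ..., 'timestamp': ...}, ...]
    print(rows)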
castor_extractor/warehouse/databricks/test_constants.py
ADDED

@@ -0,0 +1,79 @@
+OLDER_DATE = "2024-04-18 20:20:20.0"
+CLOSER_DATE = "2024-04-19 20:20:20.0"
+
+MOCK_TABLES_FOR_TABLE_LINEAGE = [
+    {
+        "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
+        "schema_id": "dev.silver",
+        "table_name": "analytics",
+    },
+    {
+        "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
+        "schema_id": "dev.silver",
+        "table_name": "pre_analytics",
+    },
+    {
+        "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
+        "schema_id": "dev.bronze",
+        "table_name": "analytics",
+    },
+]
+
+_RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
+    "upstreams": [
+        {  # there could be other keys: jobInfos, notebookInfos, queryInfos
+            "tableInfo": {
+                "name": "pre_analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "bronze",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": CLOSER_DATE,
+            }
+        },
+    ],
+    "downstreams": [],
+}
+_RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
+    "upstreams": [],
+    "downstreams": [
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+    ],
+}
+_RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
+    "upstreams": [],
+    "downstreams": [
+        {
+            "tableInfo": {
+                "name": "analytics",
+                "catalog_name": "dev",
+                "schema_name": "silver",
+                "table_type": "PERSISTED_VIEW",  # not used
+                "lineage_timestamp": OLDER_DATE,
+            }
+        },
+    ],
+}
+
+# should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
+TABLE_LINEAGE_SIDE_EFFECT: tuple = (
+    _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
+    _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
+    _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
+)

castor_extractor/warehouse/snowflake/extract.py
CHANGED

@@ -4,6 +4,7 @@ from ...utils import LocalStorage, from_env, write_summary
 from ..abstract import (
     CATALOG_ASSETS,
     EXTERNAL_LINEAGE_ASSETS,
+    FUNCTIONS_ASSETS,
     QUERIES_ASSETS,
     VIEWS_ASSETS,
     SQLExtractionProcessor,
@@ -20,6 +21,7 @@ logger = logging.getLogger(__name__)
 
 SNOWFLAKE_ASSETS: SupportedAssets = {
     WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
+    WarehouseAssetGroup.FUNCTION: FUNCTIONS_ASSETS,
     WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
     WarehouseAssetGroup.VIEW_DDL: VIEWS_ASSETS,
     WarehouseAssetGroup.ROLE: (
castor_extractor/warehouse/snowflake/queries/function.sql
ADDED

@@ -0,0 +1,10 @@
+SELECT
+    f.function_name AS name,
+    CONCAT(f.function_catalog, '.', f.function_schema, '.', f.function_name) AS path,
+    f.argument_signature AS signature,
+    f.function_definition AS definition
+FROM snowflake.account_usage.functions f
+WHERE TRUE
+    AND f.function_catalog NOT IN ('SNOWFLAKE', 'UTIL_DB')
+    AND f.function_language = 'SQL'
+    AND deleted IS NULL
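Outside the extractor, the same query can be run directly against `snowflake.account_usage` to preview which SQL user-defined functions would be extracted. A sketch using the standard Snowflake Python connector; the connection parameters are placeholders, and the extractor itself builds its own connection from its credentials module:

    import snowflake.connector

    FUNCTION_SQL = """
    SELECT
        f.function_name AS name,
        CONCAT(f.function_catalog, '.', f.function_schema, '.', f.function_name) AS path,
        f.argument_signature AS signature,
        f.function_definition AS definition
    FROM snowflake.account_usage.functions f
    WHERE TRUE
        AND f.function_catalog NOT IN ('SNOWFLAKE', 'UTIL_DB')
        AND f.function_language = 'SQL'
        AND deleted IS NULL
    """

    # placeholder credentials, for illustration only
    connection = snowflake.connector.connect(
        account="my_account", user="my_user", password="my_password"
    )
    try:
        rows = connection.cursor().execute(FUNCTION_SQL).fetchall()
        print(f"{len(rows)} SQL user-defined functions")
    finally:
        connection.close()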
{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=EVZ9vhIVN7HLn5PYkRyBWyT3hk72Nt3i1SghwSipfR4,10957
 Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
 README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578
@@ -23,7 +23,7 @@ castor_extractor/commands/extract_snowflake.py,sha256=vYiruxRoo--GeMemOGsSE1w9kc
 castor_extractor/commands/extract_sqlserver.py,sha256=lwhbcNChaXHZgMgSOch3faVr7WJw-sDU6GHl3lzBt_0,1141
 castor_extractor/commands/extract_tableau.py,sha256=u-6UCd-kfXwyhNWYxZusqtgTTYkf4gAJS1vRIYWsAVU,1415
 castor_extractor/commands/file_check.py,sha256=PkXFK-kPoG8PpvBP-PCvVdreqwYw6Y1fTh2QzDxACsM,2684
-castor_extractor/commands/upload.py,sha256=
+castor_extractor/commands/upload.py,sha256=tAyHfIDOdUpD0yMJe2f64nXpaVnIbgYXi4bYx2nNvNU,1922
 castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
 castor_extractor/file_checker/column.py,sha256=fMchy5v-Sd-0xuYS0V9mob7wnljslzWLhQGqrKGybdk,3097
 castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -38,16 +38,16 @@ castor_extractor/file_checker/templates/generic_warehouse.py,sha256=zvnWnYB8FNvh
 castor_extractor/logger.py,sha256=ovf1mBEKwbJBskBXoqHbcAomBrp58mUwSrCWtEMlYPM,1197
 castor_extractor/types.py,sha256=-QgiOaq--nXUsYLy_oESDrYbRMxs353-YiQnG1blJvU,1303
 castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1htq3CURQg,75
-castor_extractor/uploader/constant.py,sha256=
+castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
 castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
 castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
-castor_extractor/uploader/upload.py,sha256=
-castor_extractor/uploader/upload_test.py,sha256=
-castor_extractor/uploader/utils.py,sha256=
+castor_extractor/uploader/upload.py,sha256=bTWD1_-hmJ6q1qcEosjZ96wsBtWDnWoCt692NYX_Nko,3228
+castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
+castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
 castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
 castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
 castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
-castor_extractor/utils/client/api.py,sha256=
+castor_extractor/utils/client/api.py,sha256=z1o4fteWx1HxNTqCYihl9sGkIgSQTbd8lW_B9Y2wyeQ,1742
 castor_extractor/utils/client/api_test.py,sha256=NSMdXg1FLc37erqHp2FZsIsogWVv6lFSs7rDXHikr-E,542
 castor_extractor/utils/client/postgres.py,sha256=n6ulaT222WWPY0_6qAZ0MHF0m91HtI9mMqL71nyygo0,866
 castor_extractor/utils/client/query.py,sha256=O6D5EjD1KmBlwa786Uw4D4kzxx97_HH50xIIeSWt0B8,205
@@ -80,7 +80,7 @@ castor_extractor/utils/pager/pager_on_id_test.py,sha256=CfAXhXaAmCXnm0oflj8_82An
 castor_extractor/utils/pager/pager_on_token.py,sha256=G442SKl4BXJFMPbYIIgCk5M8wl7V3jMg3K1WUUkl0I0,1579
 castor_extractor/utils/pager/pager_on_token_test.py,sha256=w2GCUGKR3cD5lfmtFAsNvExtzxkYdBR0pusBrGKFQ08,2548
 castor_extractor/utils/pager/pager_test.py,sha256=QPBVShSXhkiYZUfnAMs43xnys6CD8pAhL3Jhj-Ov2Xc,1705
-castor_extractor/utils/retry.py,sha256=
+castor_extractor/utils/retry.py,sha256=OsUS3qysHCkgWge8BgBwyuvoWcJ6pR_RQmQDcHlors4,3410
 castor_extractor/utils/retry_test.py,sha256=nsMttlmyKygVcffX3Hay8U2S1BspkGPiCmzIXPpLKyk,2230
 castor_extractor/utils/safe.py,sha256=jpfIimwdBSVUvU2DPFrhqpKC_DSYwxQqd08MlIkSODY,1967
 castor_extractor/utils/safe_test.py,sha256=IHN1Z761tYMFslYC-2HAfkXmFPh4LYSqNLs4QZwykjk,2160
@@ -244,16 +244,16 @@ castor_extractor/visualization/tableau/usage.py,sha256=LlFwlbEr-EnYUJjKZha99CRCR
 castor_extractor/visualization/tableau_revamp/__init__.py,sha256=a3DGjQhaz17gBqW-E84TAgupKbqLC40y5Ajo1yn-ot4,156
 castor_extractor/visualization/tableau_revamp/assets.py,sha256=owlwaI2E4UKk1YhkaHgaAXx6gu3Op6EqZ7bjp0tHI6s,351
 castor_extractor/visualization/tableau_revamp/client/__init__.py,sha256=wmS9uLtUiqNYVloi0-DgD8d2qzu3RVZEAtWiaDp6G_M,90
-castor_extractor/visualization/tableau_revamp/client/client.py,sha256=
+castor_extractor/visualization/tableau_revamp/client/client.py,sha256=RSoHDfz79ma0YJRGpiCihnwLGmoxLzphYrxRVyvByHI,9742
 castor_extractor/visualization/tableau_revamp/client/credentials.py,sha256=fHG32egq6ll2U4BNazalMof_plzfCMQjrN9WOs6kezk,3014
 castor_extractor/visualization/tableau_revamp/client/errors.py,sha256=dTe1shqmWmAXpDpCz-E24m8dGYjt6rvIGV9qQb4jnvI,150
-castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256
+castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256=-V3ToD5Gi7nmfVB2OxTOZw8dcOiF7_ciSWjjW2UdvvI,2270
 castor_extractor/visualization/tableau_revamp/client/tsc_fields.py,sha256=WsDliPCo-XsQ7wN-j0gpW9bdxCHvgH-aePywiltzfbU,688
 castor_extractor/visualization/tableau_revamp/constants.py,sha256=PcdudAogQhi3e-knalhgliMKjy5ahN0em_-7XSLrnxM,87
 castor_extractor/visualization/tableau_revamp/extract.py,sha256=2SLUxp5okM4AcEJJ61ZgcC2ikfZZl9MH17CEXMXmgl0,1450
 castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-castor_extractor/warehouse/abstract/__init__.py,sha256=
-castor_extractor/warehouse/abstract/asset.py,sha256=
+castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
+castor_extractor/warehouse/abstract/asset.py,sha256=Qs7T2Iw7KHgWVT2aAoBfCQ8tB143cUZY-DRUSkpgvGU,2689
 castor_extractor/warehouse/abstract/asset_test.py,sha256=_kd4ybNlWSAdSdEgJKC-jhJTa1nMRa9i8RO3YbqKLM4,758
 castor_extractor/warehouse/abstract/extract.py,sha256=fVBhdE-yMI_g6RBYZcr7q-ZVW7jK7WVkO_GO_KfkRqg,2908
 castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6EFrrlW43w-LiXKo,2632
@@ -277,13 +277,14 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
 castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
 castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
 castor_extractor/warehouse/databricks/__init__.py,sha256=bTvDxjGQGM2J3hOnVhfNmFP1y8DK0tySiD_EXe5_xWE,200
-castor_extractor/warehouse/databricks/client.py,sha256=
-castor_extractor/warehouse/databricks/client_test.py,sha256=
+castor_extractor/warehouse/databricks/client.py,sha256=oHR_htE25p5tiAAFZKbF48efo7tqIENW4dAGA7yEqHg,16895
+castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
 castor_extractor/warehouse/databricks/credentials.py,sha256=PpGv5_GP320UQjV_gvaxSpOw58AmqSznmjGhGfe6bdU,655
-castor_extractor/warehouse/databricks/extract.py,sha256
-castor_extractor/warehouse/databricks/format.py,sha256=
+castor_extractor/warehouse/databricks/extract.py,sha256=VX-3uo5dZucenrg-wnPur3CxOgpC5H7Ds92TO7OTAjc,7379
+castor_extractor/warehouse/databricks/format.py,sha256=2bRy2fa45NW3uk030rmyba4n2Em-NnyZPBurUslEbcw,5522
 castor_extractor/warehouse/databricks/format_test.py,sha256=iPmdJof43fBYL1Sa_fBrCWDQHCHgm7IWCZag1kWkj9E,1970
-castor_extractor/warehouse/databricks/
+castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
+castor_extractor/warehouse/databricks/types.py,sha256=hD6gC8oiT3QSWEvbtgUOGK_lLzzz36sEauB3lS_wxlE,218
 castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
 castor_extractor/warehouse/mysql/client.py,sha256=IwoJvbmE5VZkMCP9yHf6ta3_AQPEuBPrZZ3meefbcJs,974
 castor_extractor/warehouse/mysql/client_test.py,sha256=wRTv-3c5chy_HKj-buasNiYOOCIfynYqbabM4Hxdh5E,1052
@@ -334,11 +335,12 @@ castor_extractor/warehouse/snowflake/client.py,sha256=XT0QLVNff_586SDuMe40iu8FCw
 castor_extractor/warehouse/snowflake/client_test.py,sha256=ihWtOOAQfh8pu5JTr_EWfqefKOVIaJXznACURzaU1Qs,1432
 castor_extractor/warehouse/snowflake/credentials.py,sha256=wbUdbx9jVSHzg2kNDhMFuDstbVTyZOcGAwnSeGeFIqs,875
 castor_extractor/warehouse/snowflake/credentials_test.py,sha256=Lkc-DHXOvr50KrqAW4nt_x0IA0Mu_CsBVu6ATnzQB6I,673
-castor_extractor/warehouse/snowflake/extract.py,sha256=
+castor_extractor/warehouse/snowflake/extract.py,sha256=fcze0VBe9OOAFSr25T9L6CY506Vm_xDEvvy8NWuLW1s,2956
 castor_extractor/warehouse/snowflake/queries/.sqlfluff,sha256=vttrwcr64JVIuvc7WIg9C54cbOkjg_VjXNR7YnTGOPE,31
 castor_extractor/warehouse/snowflake/queries/column.sql,sha256=pAW2UNnut0a483OY2rjOXCdCtQg0g254g61Bt51CIB4,1803
 castor_extractor/warehouse/snowflake/queries/column_lineage.sql,sha256=YKBiZ6zySSNcXLDXwm31EjGIIkkkZc0-S6hI1SRM80o,1179
 castor_extractor/warehouse/snowflake/queries/database.sql,sha256=ifZXoKUXtsrGOxml6AcNhA4yybIyatH5va7bcp-lgCU,483
+castor_extractor/warehouse/snowflake/queries/function.sql,sha256=8LRh0ybhd-RldJ8UZspWUm3yv52evq11O2uqIO4KqeQ,372
 castor_extractor/warehouse/snowflake/queries/grant_to_role.sql,sha256=O7AJ1LzoXGDFmiVvQ8EMJ5x8FSAnaxRPdmRyAlEmkUM,272
 castor_extractor/warehouse/snowflake/queries/grant_to_user.sql,sha256=7AalVajU5vRRpIiys1igSwmDXirbwpMTvJr2ihSz2NE,143
 castor_extractor/warehouse/snowflake/queries/query.sql,sha256=-OYcWUvdPBkpOfezkZaW7hrOdDz3JyoqjNdRm_88Rsk,1779
@@ -368,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
 castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
 castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
 castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
-castor_extractor-0.
-castor_extractor-0.
-castor_extractor-0.
-castor_extractor-0.
-castor_extractor-0.
+castor_extractor-0.17.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.17.0.dist-info/METADATA,sha256=mPiUyxCqXFifcPbhcOPFsnkPAV4OcWXoYzGeUKlbkoo,6582
+castor_extractor-0.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+castor_extractor-0.17.0.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
+castor_extractor-0.17.0.dist-info/RECORD,,

{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/LICENCE
File without changes

{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/WHEEL
File without changes

{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/entry_points.txt
File without changes