castor-extractor 0.22.1__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +21 -0
- castor_extractor/file_checker/file.py +1 -1
- castor_extractor/visualization/powerbi/assets.py +2 -12
- castor_extractor/visualization/powerbi/extract.py +2 -2
- castor_extractor/visualization/sigma/client/client.py +64 -10
- castor_extractor/visualization/thoughtspot/assets.py +3 -1
- castor_extractor/visualization/thoughtspot/client/client.py +67 -14
- castor_extractor/visualization/thoughtspot/client/utils.py +10 -4
- castor_extractor/visualization/thoughtspot/client/utils_test.py +22 -4
- castor_extractor/warehouse/abstract/extract.py +1 -1
- castor_extractor/warehouse/bigquery/client.py +3 -3
- castor_extractor/warehouse/databricks/api_client.py +2 -60
- castor_extractor/warehouse/databricks/client.py +4 -47
- castor_extractor/warehouse/databricks/client_test.py +1 -35
- castor_extractor/warehouse/databricks/credentials.py +4 -6
- castor_extractor/warehouse/databricks/enums.py +15 -0
- castor_extractor/warehouse/databricks/extract.py +13 -11
- castor_extractor/warehouse/databricks/lineage.py +47 -119
- castor_extractor/warehouse/databricks/lineage_test.py +86 -31
- castor_extractor/warehouse/databricks/sql_client.py +23 -8
- castor_extractor/warehouse/databricks/types.py +0 -7
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/METADATA +24 -3
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/RECORD +26 -26
- castor_extractor/warehouse/databricks/test_constants.py +0 -79
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/LICENCE +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/WHEEL +0 -0
- {castor_extractor-0.22.1.dist-info → castor_extractor-0.22.6.dist-info}/entry_points.txt +0 -0
|
@@ -1,24 +1,22 @@
|
|
|
1
1
|
from dataclasses import field
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
|
-
from
|
|
5
|
-
from pydantic_settings import SettingsConfigDict
|
|
3
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
6
4
|
|
|
7
5
|
DATABRICKS_ENV_PREFIX = "CASTOR_DATABRICKS_"
|
|
8
6
|
|
|
9
7
|
|
|
10
|
-
|
|
11
|
-
class DatabricksCredentials:
|
|
8
|
+
class DatabricksCredentials(BaseSettings):
|
|
12
9
|
"""
|
|
13
10
|
Credentials needed by Databricks client
|
|
14
11
|
Requires:
|
|
15
12
|
- host
|
|
13
|
+
- http_path
|
|
16
14
|
- token
|
|
17
15
|
"""
|
|
18
16
|
|
|
19
17
|
host: str
|
|
18
|
+
http_path: str
|
|
20
19
|
token: str = field(metadata={"sensitive": True})
|
|
21
|
-
http_path: Optional[str] = field(default=None)
|
|
22
20
|
|
|
23
21
|
model_config = SettingsConfigDict(
|
|
24
22
|
env_prefix=DATABRICKS_ENV_PREFIX,
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LineageEntity(Enum):
|
|
5
|
+
"""Entities that can be linked in Databricks lineage"""
|
|
6
|
+
|
|
7
|
+
COLUMN = "COLUMN"
|
|
8
|
+
TABLE = "TABLE"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TagEntity(Enum):
|
|
12
|
+
"""Entities that can be tagged in Databricks"""
|
|
13
|
+
|
|
14
|
+
COLUMN = "COLUMN"
|
|
15
|
+
TABLE = "TABLE"
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from datetime import date
|
|
2
3
|
from typing import Optional
|
|
3
4
|
|
|
4
5
|
from ...utils import AbstractStorage, LocalStorage, write_summary
|
|
@@ -16,6 +17,7 @@ from ..abstract import (
|
|
|
16
17
|
)
|
|
17
18
|
from .client import DatabricksClient
|
|
18
19
|
from .credentials import DatabricksCredentials
|
|
20
|
+
from .enums import LineageEntity
|
|
19
21
|
|
|
20
22
|
DATABRICKS_ASSETS: SupportedAssets = {
|
|
21
23
|
WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
|
|
@@ -32,6 +34,12 @@ OTimeFilter = Optional[TimeFilter]
|
|
|
32
34
|
Paths = dict[str, str]
|
|
33
35
|
|
|
34
36
|
|
|
37
|
+
def _day(time_filter: OTimeFilter) -> date:
|
|
38
|
+
if not time_filter:
|
|
39
|
+
return TimeFilter.default().day
|
|
40
|
+
return time_filter.day
|
|
41
|
+
|
|
42
|
+
|
|
35
43
|
class DatabricksExtractionProcessor:
|
|
36
44
|
"""Databricks' API-based extraction management"""
|
|
37
45
|
|
|
@@ -96,22 +104,18 @@ class DatabricksExtractionProcessor:
|
|
|
96
104
|
logger.info(f"Extracted {len(columns)} columns to {location}")
|
|
97
105
|
return catalog_locations
|
|
98
106
|
|
|
99
|
-
def extract_lineage(self) -> Paths:
|
|
107
|
+
def extract_lineage(self, time_filter: OTimeFilter = None) -> Paths:
|
|
100
108
|
if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
|
|
101
109
|
return self._existing_group_paths(
|
|
102
110
|
WarehouseAssetGroup.ADDITIONAL_LINEAGE
|
|
103
111
|
)
|
|
104
112
|
lineage_locations: dict[str, str] = dict()
|
|
105
113
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
schemas = self._client.schemas(databases)
|
|
109
|
-
users = self._client.users()
|
|
110
|
-
tables, columns = self._client.tables_and_columns(schemas, users)
|
|
111
|
-
logger.info("Extracted pre-requisite catalog. Next comes lineage")
|
|
114
|
+
day = _day(time_filter)
|
|
115
|
+
client = self._client.sql_client
|
|
112
116
|
|
|
113
117
|
# extract table lineage
|
|
114
|
-
table_lineage =
|
|
118
|
+
table_lineage = client.get_lineage(LineageEntity.TABLE, day)
|
|
115
119
|
table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
|
|
116
120
|
location = self._storage.put(table_lineage_key, table_lineage)
|
|
117
121
|
lineage_locations[table_lineage_key] = location
|
|
@@ -119,9 +123,7 @@ class DatabricksExtractionProcessor:
|
|
|
119
123
|
logger.info(msg)
|
|
120
124
|
|
|
121
125
|
# extract column lineage
|
|
122
|
-
column_lineage =
|
|
123
|
-
tables, columns, table_lineage
|
|
124
|
-
)
|
|
126
|
+
column_lineage = client.get_lineage(LineageEntity.COLUMN, day)
|
|
125
127
|
column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
|
|
126
128
|
location = self._storage.put(column_lineage_key, column_lineage)
|
|
127
129
|
lineage_locations[column_lineage_key] = location
|
|
@@ -1,141 +1,69 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Iterable, Optional
|
|
2
2
|
|
|
3
|
-
from .
|
|
3
|
+
from .enums import LineageEntity
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
class
|
|
6
|
+
class LineageProcessor:
|
|
7
7
|
"""
|
|
8
8
|
helper class that handles lineage deduplication and filtering
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
|
-
def __init__(self):
|
|
12
|
-
self.
|
|
11
|
+
def __init__(self, lineage_entity: LineageEntity):
|
|
12
|
+
self.lineage_entity = lineage_entity
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
"""
|
|
16
|
-
keep the most recent lineage link, adding to `self.lineage`
|
|
17
|
-
"""
|
|
18
|
-
parent, child, timestamp = timestamped_link
|
|
19
|
-
link = (parent, child)
|
|
20
|
-
if not self.lineage.get(link):
|
|
21
|
-
self.lineage[link] = timestamp
|
|
22
|
-
return
|
|
23
|
-
|
|
24
|
-
if not timestamp:
|
|
25
|
-
return
|
|
26
|
-
# keep most recent link; cast for mypy
|
|
27
|
-
recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
|
|
28
|
-
self.lineage[link] = recent
|
|
14
|
+
self.lineage: dict[tuple[str, str], dict] = dict()
|
|
29
15
|
|
|
16
|
+
def _parent_path(self, link) -> Optional[str]:
|
|
17
|
+
if self.lineage_entity == LineageEntity.TABLE:
|
|
18
|
+
return link["source_table_full_name"]
|
|
30
19
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
20
|
+
source_table = link["source_table_full_name"]
|
|
21
|
+
source_column = link["source_column_name"]
|
|
22
|
+
if not (source_table and source_column):
|
|
23
|
+
return None
|
|
35
24
|
|
|
25
|
+
return f"{source_table}.{source_column}"
|
|
36
26
|
|
|
37
|
-
def
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
return None
|
|
27
|
+
def _child_path(self, link) -> Optional[str]:
|
|
28
|
+
if self.lineage_entity == LineageEntity.TABLE:
|
|
29
|
+
return link["target_table_full_name"]
|
|
41
30
|
|
|
31
|
+
target_table = link["target_table_full_name"]
|
|
32
|
+
target_column = link["target_column_name"]
|
|
33
|
+
if not (target_table and target_column):
|
|
34
|
+
return None
|
|
42
35
|
|
|
43
|
-
|
|
44
|
-
"""exclude missing path and self-lineage"""
|
|
45
|
-
if (not path_from) or (not path_to):
|
|
46
|
-
return None
|
|
47
|
-
is_self_lineage = path_from.lower() == path_to.lower()
|
|
48
|
-
if is_self_lineage:
|
|
49
|
-
return None
|
|
50
|
-
return path_from, path_to, timestamp
|
|
36
|
+
return f"{target_table}.{target_column}"
|
|
51
37
|
|
|
38
|
+
def add(self, link: dict) -> None:
|
|
39
|
+
"""
|
|
40
|
+
If the parent and child paths are valid, keeps the most recent lineage
|
|
41
|
+
link in the `self.lineage` map.
|
|
42
|
+
"""
|
|
43
|
+
parent = self._parent_path(link)
|
|
44
|
+
child = self._child_path(link)
|
|
45
|
+
timestamp = link["event_time"]
|
|
52
46
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
) -> list[TimestampedLink]:
|
|
56
|
-
"""
|
|
57
|
-
process databricks lineage API response for a given table
|
|
58
|
-
returns a list of (parent, child, timestamp)
|
|
59
|
-
|
|
60
|
-
Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
|
|
61
|
-
we could also have `notebookInfos` or `fileInfo`
|
|
62
|
-
"""
|
|
63
|
-
links: list[OTimestampedLink] = []
|
|
64
|
-
# add parent:
|
|
65
|
-
for link in single_table_lineage.get("upstreams", []):
|
|
66
|
-
parent = link.get("tableInfo", {})
|
|
67
|
-
parent_path = _to_table_path(parent)
|
|
68
|
-
timestamp: Ostr = parent.get("lineage_timestamp")
|
|
69
|
-
links.append(_link(parent_path, table_path, timestamp))
|
|
70
|
-
|
|
71
|
-
# add children:
|
|
72
|
-
for link in single_table_lineage.get("downstreams", []):
|
|
73
|
-
child = link.get("tableInfo", {})
|
|
74
|
-
child_path = _to_table_path(child)
|
|
75
|
-
timestamp = child.get("lineage_timestamp")
|
|
76
|
-
links.append(_link(table_path, child_path, timestamp))
|
|
77
|
-
|
|
78
|
-
return list(filter(None, links))
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def single_column_lineage_links(
|
|
82
|
-
column_path: str, single_column_lineage: dict
|
|
83
|
-
) -> list[TimestampedLink]:
|
|
84
|
-
"""
|
|
85
|
-
process databricks lineage API response for a given table
|
|
86
|
-
returns a list of (parent, child, timestamp)
|
|
87
|
-
|
|
88
|
-
Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
|
|
89
|
-
we could also have `notebookInfos` or `fileInfo`
|
|
90
|
-
"""
|
|
91
|
-
links: list[OTimestampedLink] = []
|
|
92
|
-
# add parent:
|
|
93
|
-
for link in single_column_lineage.get("upstream_cols", []):
|
|
94
|
-
parent_path = _to_column_path(link)
|
|
95
|
-
timestamp: Ostr = link.get("lineage_timestamp")
|
|
96
|
-
links.append(_link(parent_path, column_path, timestamp))
|
|
47
|
+
if not (parent and child and parent != child):
|
|
48
|
+
return
|
|
97
49
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
timestamp = link.get("lineage_timestamp")
|
|
102
|
-
links.append(_link(column_path, child_path, timestamp))
|
|
50
|
+
key = (parent, child)
|
|
51
|
+
if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
|
|
52
|
+
return
|
|
103
53
|
|
|
104
|
-
|
|
54
|
+
self.lineage[key] = link
|
|
105
55
|
|
|
106
56
|
|
|
107
|
-
def
|
|
108
|
-
|
|
109
|
-
) -> list[
|
|
57
|
+
def valid_lineage(
|
|
58
|
+
lineage: Iterable[dict], lineage_entity: LineageEntity
|
|
59
|
+
) -> list[dict]:
|
|
110
60
|
"""
|
|
111
|
-
|
|
112
|
-
|
|
61
|
+
Filters out self-lineage or lineage with a missing source or target path,
|
|
62
|
+
then deduplicates by picking the link with the most recent event timestamp.
|
|
113
63
|
"""
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
tables_with_lineage: set[str] = set()
|
|
122
|
-
for t in table_lineage:
|
|
123
|
-
tables_with_lineage.add(t["parent_path"])
|
|
124
|
-
tables_with_lineage.add(t["child_path"])
|
|
125
|
-
|
|
126
|
-
paths_to_return: list[tuple[str, str]] = []
|
|
127
|
-
for column in columns:
|
|
128
|
-
table_path = mapping[column["table_id"]]
|
|
129
|
-
if table_path not in tables_with_lineage:
|
|
130
|
-
continue
|
|
131
|
-
column_ = (table_path, column["column_name"])
|
|
132
|
-
paths_to_return.append(column_)
|
|
133
|
-
|
|
134
|
-
return paths_to_return
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
|
|
138
|
-
deduplicated_lineage = LineageLinks()
|
|
139
|
-
for timestamped_link in lineages:
|
|
140
|
-
deduplicated_lineage.add(timestamped_link)
|
|
141
|
-
return deduplicated_lineage.lineage
|
|
64
|
+
deduplicated_lineage = LineageProcessor(lineage_entity)
|
|
65
|
+
|
|
66
|
+
for link in lineage:
|
|
67
|
+
deduplicated_lineage.add(link)
|
|
68
|
+
|
|
69
|
+
return list(deduplicated_lineage.lineage.values())
|
|
@@ -1,34 +1,89 @@
|
|
|
1
|
-
from .
|
|
2
|
-
from .
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
from .enums import LineageEntity
|
|
2
|
+
from .lineage import LineageProcessor, valid_lineage
|
|
3
|
+
|
|
4
|
+
_OLDER_DATE = "2025-01-01 00:00:01.0"
|
|
5
|
+
_CLOSER_DATE = "2025-01-01 02:02:02.0"
|
|
6
|
+
|
|
7
|
+
_TABLE_LINEAGES = [
|
|
8
|
+
{
|
|
9
|
+
"source_table_full_name": "a.b.source",
|
|
10
|
+
"target_table_full_name": "a.b.target",
|
|
11
|
+
"event_time": _CLOSER_DATE,
|
|
12
|
+
"other": "more recent stuff",
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"source_table_full_name": "a.b.source",
|
|
16
|
+
"target_table_full_name": "a.b.target",
|
|
17
|
+
"event_time": _OLDER_DATE,
|
|
18
|
+
"other": "stuff that's too old",
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"source_table_full_name": "no target",
|
|
22
|
+
"target_table_full_name": None,
|
|
23
|
+
"event_time": _CLOSER_DATE,
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"source_table_full_name": None,
|
|
27
|
+
"target_table_full_name": "no source",
|
|
28
|
+
"event_time": _CLOSER_DATE,
|
|
29
|
+
},
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_COLUMN_LINEAGES = [
|
|
34
|
+
{
|
|
35
|
+
"source_table_full_name": "a.b.source",
|
|
36
|
+
"source_column_name": "src_col",
|
|
37
|
+
"target_table_full_name": "a.b.target",
|
|
38
|
+
"target_column_name": "trgt_col",
|
|
39
|
+
"event_time": _OLDER_DATE,
|
|
40
|
+
"other": "old stuff",
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"source_table_full_name": "a.b.source",
|
|
44
|
+
"source_column_name": "src_col",
|
|
45
|
+
"target_table_full_name": "a.b.target",
|
|
46
|
+
"target_column_name": "trgt_col",
|
|
47
|
+
"event_time": _CLOSER_DATE,
|
|
48
|
+
"other": "newer stuff",
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"source_table_full_name": "a.b.toto",
|
|
52
|
+
"source_column_name": "toto_col",
|
|
53
|
+
"target_table_full_name": "a.b.tata",
|
|
54
|
+
"target_column_name": "tata_col",
|
|
55
|
+
"event_time": _OLDER_DATE,
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"source_table_full_name": "a.b.source",
|
|
59
|
+
"source_column_name": "a.b.source",
|
|
60
|
+
"target_table_full_name": None,
|
|
61
|
+
"target_column_name": None,
|
|
62
|
+
"event_time": _CLOSER_DATE,
|
|
63
|
+
},
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_valid_lineage():
|
|
68
|
+
table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
|
|
69
|
+
|
|
70
|
+
assert len(table_links) == 1
|
|
71
|
+
assert table_links[0]["source_table_full_name"] == "a.b.source"
|
|
72
|
+
assert table_links[0]["target_table_full_name"] == "a.b.target"
|
|
73
|
+
assert table_links[0]["event_time"] == _CLOSER_DATE
|
|
74
|
+
assert table_links[0]["other"] == "more recent stuff"
|
|
6
75
|
|
|
7
76
|
|
|
8
77
|
def test_LineageLinks_add():
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
assert
|
|
16
|
-
assert
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
assert expected_key in links.lineage
|
|
22
|
-
assert links.lineage[expected_key] == OLDER_DATE
|
|
23
|
-
|
|
24
|
-
# we update with the more recent timestamp
|
|
25
|
-
timestamped_link = ("parent", "child", CLOSER_DATE)
|
|
26
|
-
links.add(timestamped_link)
|
|
27
|
-
assert expected_key in links.lineage
|
|
28
|
-
assert links.lineage[expected_key] == CLOSER_DATE
|
|
29
|
-
|
|
30
|
-
# we keep the more recent timestamp
|
|
31
|
-
timestamped_link = ("parent", "child", OLDER_DATE)
|
|
32
|
-
links.add(timestamped_link)
|
|
33
|
-
assert expected_key in links.lineage
|
|
34
|
-
assert links.lineage[expected_key] == CLOSER_DATE
|
|
78
|
+
deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
|
|
79
|
+
for link in _COLUMN_LINEAGES:
|
|
80
|
+
deduplicated_lineage.add(link)
|
|
81
|
+
|
|
82
|
+
lineage = deduplicated_lineage.lineage
|
|
83
|
+
assert len(lineage) == 2
|
|
84
|
+
assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
|
|
85
|
+
assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
|
|
86
|
+
assert (
|
|
87
|
+
lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
|
|
88
|
+
== "newer stuff"
|
|
89
|
+
)
|
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections import defaultdict
|
|
3
|
-
from
|
|
3
|
+
from datetime import date
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
6
|
from databricks import sql # type: ignore
|
|
7
7
|
|
|
8
8
|
from .credentials import DatabricksCredentials
|
|
9
|
+
from .enums import LineageEntity, TagEntity
|
|
9
10
|
from .format import TagMapping
|
|
11
|
+
from .lineage import valid_lineage
|
|
10
12
|
from .utils import build_path, tag_label
|
|
11
13
|
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
|
|
15
17
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
COLUMN = "COLUMN"
|
|
21
|
-
TABLE = "TABLE"
|
|
18
|
+
_LINEAGE_SQL_TPL = """
|
|
19
|
+
SELECT * FROM system.access.{table_name}
|
|
20
|
+
WHERE event_date = :day
|
|
21
|
+
"""
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class DatabricksSQLClient:
|
|
@@ -71,7 +71,6 @@ class DatabricksSQLClient:
|
|
|
71
71
|
https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
|
|
72
72
|
"""
|
|
73
73
|
if not self._needs_extraction(entity):
|
|
74
|
-
# extracting tags require additional credentials (http_path)
|
|
75
74
|
return dict()
|
|
76
75
|
|
|
77
76
|
table = f"{entity.value.lower()}_tags"
|
|
@@ -88,3 +87,19 @@ class DatabricksSQLClient:
|
|
|
88
87
|
mapping[path].append(label)
|
|
89
88
|
|
|
90
89
|
return mapping
|
|
90
|
+
|
|
91
|
+
def get_lineage(
|
|
92
|
+
self, lineage_entity: LineageEntity, day: date
|
|
93
|
+
) -> list[dict]:
|
|
94
|
+
"""
|
|
95
|
+
Fetch {TABLE|COLUMN} lineage of the given day, via system tables
|
|
96
|
+
https://docs.databricks.com/en/admin/system-tables/lineage.html
|
|
97
|
+
"""
|
|
98
|
+
table_name = f"{lineage_entity.value.lower()}_lineage"
|
|
99
|
+
query = _LINEAGE_SQL_TPL.format(table_name=table_name)
|
|
100
|
+
params = {"day": day}
|
|
101
|
+
result = self.execute_sql(query, params)
|
|
102
|
+
data = []
|
|
103
|
+
for row in result:
|
|
104
|
+
data.append(row.asDict())
|
|
105
|
+
return valid_lineage(data, lineage_entity)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: castor-extractor
|
|
3
|
-
Version: 0.22.
|
|
3
|
+
Version: 0.22.6
|
|
4
4
|
Summary: Extract your metadata assets.
|
|
5
5
|
Home-page: https://www.castordoc.com/
|
|
6
6
|
License: EULA
|
|
@@ -38,7 +38,7 @@ Requires-Dist: google-cloud-core (>=2.1.0,<3.0.0)
|
|
|
38
38
|
Requires-Dist: google-cloud-storage (>=2,<3)
|
|
39
39
|
Requires-Dist: google-resumable-media (>=2.0.3,<3.0.0)
|
|
40
40
|
Requires-Dist: googleapis-common-protos (>=1.53.0,<2.0.0)
|
|
41
|
-
Requires-Dist: looker-sdk (>=
|
|
41
|
+
Requires-Dist: looker-sdk (>=25.0.0,<26.0.0) ; extra == "looker" or extra == "all"
|
|
42
42
|
Requires-Dist: msal (>=1.20.0,<2.0.0) ; extra == "powerbi" or extra == "all"
|
|
43
43
|
Requires-Dist: numpy (<2) ; extra == "bigquery" or extra == "databricks" or extra == "all"
|
|
44
44
|
Requires-Dist: numpy (>=1.26) ; (python_version >= "3.12" and python_version < "3.13") and (extra == "bigquery" or extra == "databricks" or extra == "all")
|
|
@@ -57,7 +57,7 @@ Requires-Dist: snowflake-sqlalchemy (!=1.2.5,<2.0.0) ; extra == "snowflake" or e
|
|
|
57
57
|
Requires-Dist: sqlalchemy (>=1.4,<1.5)
|
|
58
58
|
Requires-Dist: sqlalchemy-bigquery[bqstorage] (>=1.0.0,<=2.0.0) ; extra == "bigquery" or extra == "all"
|
|
59
59
|
Requires-Dist: sqlalchemy-redshift (>=0.8.14,<0.9.0) ; extra == "redshift" or extra == "all"
|
|
60
|
-
Requires-Dist: tableauserverclient (
|
|
60
|
+
Requires-Dist: tableauserverclient (>=0.25.0,<0.26.0) ; extra == "tableau" or extra == "all"
|
|
61
61
|
Requires-Dist: tqdm (>=4.0.0,<5.0.0)
|
|
62
62
|
Requires-Dist: typing-extensions (>=4,<5)
|
|
63
63
|
Requires-Dist: websocket-client (>=1,<2) ; extra == "qlik" or extra == "all"
|
|
@@ -207,6 +207,27 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
|
|
|
207
207
|
|
|
208
208
|
# Changelog
|
|
209
209
|
|
|
210
|
+
## 0.22.6 - 2025-01-21
|
|
211
|
+
|
|
212
|
+
* bump dependencies: looker, databricks, deptry, ...
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
## 0.22.5 - 2025-01-09
|
|
216
|
+
|
|
217
|
+
* Databricks: validate and deduplicate lineage links
|
|
218
|
+
|
|
219
|
+
## 0.22.4 - 2025-01-08
|
|
220
|
+
|
|
221
|
+
* ThoughtSpot: extract answers
|
|
222
|
+
|
|
223
|
+
## 0.22.3 - 2024-12-10
|
|
224
|
+
|
|
225
|
+
* Databricks: extract lineage from system tables
|
|
226
|
+
|
|
227
|
+
## 0.22.2 - 2024-12-06
|
|
228
|
+
|
|
229
|
+
* Sigma: multithreading to retrieve lineage
|
|
230
|
+
|
|
210
231
|
## 0.22.1 - 2024-12-05
|
|
211
232
|
|
|
212
233
|
* Salesforce: deduplicate tables
|