castor-extractor 0.24.38__py3-none-any.whl → 0.24.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +18 -0
- castor_extractor/visualization/powerbi/client/client.py +2 -1
- castor_extractor/visualization/sigma/client/sources_transformer.py +17 -3
- castor_extractor/visualization/strategy/extract.py +1 -1
- castor_extractor/warehouse/databricks/client.py +4 -5
- castor_extractor/warehouse/databricks/client_test.py +2 -1
- castor_extractor/warehouse/databricks/queries/column_lineage.sql +25 -0
- castor_extractor/warehouse/databricks/queries/table_lineage.sql +23 -0
- castor_extractor/warehouse/databricks/sql_client.py +14 -11
- castor_extractor/warehouse/sqlserver/extract.py +3 -1
- castor_extractor/warehouse/sqlserver/queries/column.sql +3 -3
- castor_extractor/warehouse/sqlserver/queries/schema.sql +7 -2
- castor_extractor/warehouse/sqlserver/queries/table.sql +1 -1
- {castor_extractor-0.24.38.dist-info → castor_extractor-0.24.42.dist-info}/METADATA +19 -1
- {castor_extractor-0.24.38.dist-info → castor_extractor-0.24.42.dist-info}/RECORD +18 -18
- castor_extractor/warehouse/databricks/lineage.py +0 -69
- castor_extractor/warehouse/databricks/lineage_test.py +0 -89
- {castor_extractor-0.24.38.dist-info → castor_extractor-0.24.42.dist-info}/LICENCE +0 -0
- {castor_extractor-0.24.38.dist-info → castor_extractor-0.24.42.dist-info}/WHEEL +0 -0
- {castor_extractor-0.24.38.dist-info → castor_extractor-0.24.42.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.24.42 - 2025-08-19
|
|
4
|
+
|
|
5
|
+
* Strategy: exclude COLUMNS from extraction
|
|
6
|
+
|
|
7
|
+
## 0.24.41 - 2025-08-19
|
|
8
|
+
|
|
9
|
+
* Sigma: retry on 429 errors when fetching connection paths
|
|
10
|
+
|
|
11
|
+
## 0.24.40 - 2025-08-18
|
|
12
|
+
|
|
13
|
+
* SQLServer: fix database allowlist/blocklist filtering
|
|
14
|
+
|
|
15
|
+
## 0.24.39 - 2025-08-18
|
|
16
|
+
|
|
17
|
+
* Databricks:
|
|
18
|
+
* Fix vanishing owner ID column for tables
|
|
19
|
+
* Deduplicate lineage with SQL to reduce memory use
|
|
20
|
+
|
|
3
21
|
## 0.24.38 - 2025-08-07
|
|
4
22
|
|
|
5
23
|
* Uploader: Support US and EU zones
|
|
@@ -28,6 +28,7 @@ POWERBI_DEFAULT_TIMEOUT_S = 30
|
|
|
28
28
|
METADATA_BATCH_SIZE = 100
|
|
29
29
|
POWERBI_SCAN_STATUS_DONE = "Succeeded"
|
|
30
30
|
POWERBI_SCAN_SLEEP_S = 1
|
|
31
|
+
POWERBI_SCAN_TIMEOUT_S = 60
|
|
31
32
|
|
|
32
33
|
MAX_RETRY_PAGES = 1
|
|
33
34
|
RETRY_PAGES_TIMEOUT_MS = 35 * 1000 # 35 seconds
|
|
@@ -142,7 +143,7 @@ class PowerbiClient(APIClient):
|
|
|
142
143
|
endpoint = self.endpoint_factory.metadata_scan_status(scan_id)
|
|
143
144
|
total_waiting_time_s = 0
|
|
144
145
|
|
|
145
|
-
while total_waiting_time_s <
|
|
146
|
+
while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
|
|
146
147
|
try:
|
|
147
148
|
result = self._get(endpoint)
|
|
148
149
|
except HTTPError as e:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from http import HTTPStatus
|
|
2
3
|
from typing import TYPE_CHECKING, Callable, Iterator
|
|
3
4
|
|
|
5
|
+
from ....utils import retry_request
|
|
4
6
|
from .endpoints import SigmaEndpointFactory
|
|
5
7
|
|
|
6
8
|
if TYPE_CHECKING:
|
|
@@ -8,6 +10,9 @@ if TYPE_CHECKING:
|
|
|
8
10
|
|
|
9
11
|
logger = logging.getLogger(__name__)
|
|
10
12
|
|
|
13
|
+
SIGMA_CONNECTION_PATH_MAX_RETRY = 1
|
|
14
|
+
SIGMA_CONNECTION_PATH_SLEEP_MS = 30_000 # 30 seconds
|
|
15
|
+
|
|
11
16
|
|
|
12
17
|
class SigmaSourcesTransformer:
|
|
13
18
|
"""Retrieves asset sources and enhances them with additional information."""
|
|
@@ -15,6 +20,17 @@ class SigmaSourcesTransformer:
|
|
|
15
20
|
def __init__(self, api_client: "SigmaClient"):
|
|
16
21
|
self.api_client = api_client
|
|
17
22
|
|
|
23
|
+
@retry_request(
|
|
24
|
+
status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
|
|
25
|
+
max_retries=SIGMA_CONNECTION_PATH_MAX_RETRY,
|
|
26
|
+
base_ms=SIGMA_CONNECTION_PATH_SLEEP_MS,
|
|
27
|
+
)
|
|
28
|
+
def _get_connection_path(self, table_id: str) -> dict:
|
|
29
|
+
"""Retrieves the connection path for a given table id"""
|
|
30
|
+
return self.api_client._get(
|
|
31
|
+
endpoint=SigmaEndpointFactory.connection_path(table_id)
|
|
32
|
+
)
|
|
33
|
+
|
|
18
34
|
def _map_table_id_to_connection_path(
|
|
19
35
|
self, all_sources: list
|
|
20
36
|
) -> dict[str, dict]:
|
|
@@ -29,9 +45,7 @@ class SigmaSourcesTransformer:
|
|
|
29
45
|
}
|
|
30
46
|
|
|
31
47
|
return {
|
|
32
|
-
table_id: self.
|
|
33
|
-
endpoint=SigmaEndpointFactory.connection_path(table_id)
|
|
34
|
-
)
|
|
48
|
+
table_id: self._get_connection_path(table_id)
|
|
35
49
|
for table_id in unique_table_ids
|
|
36
50
|
}
|
|
37
51
|
|
|
@@ -22,7 +22,7 @@ def iterate_all_data(
|
|
|
22
22
|
) -> Iterable[tuple[str, Union[list, dict]]]:
|
|
23
23
|
"""Iterate over the extracted data from Strategy"""
|
|
24
24
|
|
|
25
|
-
for asset in StrategyAsset:
|
|
25
|
+
for asset in StrategyAsset.mandatory:
|
|
26
26
|
logger.info(f"Extracting {asset.value.upper()} from REST API")
|
|
27
27
|
data = client.fetch(asset)
|
|
28
28
|
yield asset.name.lower(), list(deep_serialize(data))
|
|
@@ -46,12 +46,11 @@ class DatabricksClient:
|
|
|
46
46
|
|
|
47
47
|
@staticmethod
|
|
48
48
|
def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
|
|
49
|
+
"""Matches the table's owner email to an ID, or None if not found."""
|
|
49
50
|
table_owner_email = table.get("owner_email")
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
if not owner_external_id:
|
|
54
|
-
return table
|
|
51
|
+
owner_external_id = (
|
|
52
|
+
user_mapping.get(table_owner_email) if table_owner_email else None
|
|
53
|
+
)
|
|
55
54
|
return {**table, "owner_external_id": owner_external_id}
|
|
56
55
|
|
|
57
56
|
@staticmethod
|
|
@@ -36,5 +36,6 @@ def test_DatabricksClient__match_table_with_user():
|
|
|
36
36
|
assert table_with_owner == {**table, "owner_external_id": 3}
|
|
37
37
|
|
|
38
38
|
table_without_owner = {"id": 1, "owner_email": None}
|
|
39
|
+
expected = {"id": 1, "owner_email": None, "owner_external_id": None}
|
|
39
40
|
actual = client._match_table_with_user(table_without_owner, user_mapping)
|
|
40
|
-
assert actual ==
|
|
41
|
+
assert actual == expected
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Selects all column lineage events for the given day.
|
|
3
|
+
This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
|
|
4
|
+
|
|
5
|
+
Passing parameters is not always supported, so the query must be Python-formatted to set the date.
|
|
6
|
+
*/
|
|
7
|
+
WITH deduplicated_lineage AS (
|
|
8
|
+
SELECT *,
|
|
9
|
+
ROW_NUMBER() OVER (
|
|
10
|
+
PARTITION BY source_table_full_name, source_column_name, target_table_full_name, target_column_name
|
|
11
|
+
ORDER BY event_time DESC
|
|
12
|
+
) AS rank
|
|
13
|
+
FROM system.access.column_lineage
|
|
14
|
+
WHERE
|
|
15
|
+
TRUE
|
|
16
|
+
AND event_date = DATE('{day}')
|
|
17
|
+
AND source_table_full_name IS NOT NULL
|
|
18
|
+
AND source_column_name IS NOT NULL
|
|
19
|
+
AND target_table_full_name IS NOT NULL
|
|
20
|
+
AND target_column_name IS NOT NULL
|
|
21
|
+
AND CONCAT(source_table_full_name, '.', source_column_name) != CONCAT(target_table_full_name, '.', target_column_name)
|
|
22
|
+
)
|
|
23
|
+
SELECT *
|
|
24
|
+
FROM deduplicated_lineage
|
|
25
|
+
WHERE rank = 1
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Selects all table lineage events for the given day.
|
|
3
|
+
This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
|
|
4
|
+
|
|
5
|
+
Passing parameters is not always supported, so the query must be Python-formatted to set the date.
|
|
6
|
+
*/
|
|
7
|
+
WITH deduplicated_lineage AS (
|
|
8
|
+
SELECT *,
|
|
9
|
+
ROW_NUMBER() OVER (
|
|
10
|
+
PARTITION BY source_table_full_name, target_table_full_name
|
|
11
|
+
ORDER BY event_time DESC
|
|
12
|
+
) AS rank
|
|
13
|
+
FROM system.access.table_lineage
|
|
14
|
+
WHERE
|
|
15
|
+
TRUE
|
|
16
|
+
AND event_date = DATE('{day}')
|
|
17
|
+
AND source_table_full_name IS NOT NULL
|
|
18
|
+
AND target_table_full_name IS NOT NULL
|
|
19
|
+
AND source_table_full_name != target_table_full_name
|
|
20
|
+
)
|
|
21
|
+
SELECT *
|
|
22
|
+
FROM deduplicated_lineage
|
|
23
|
+
WHERE rank = 1
|
|
@@ -4,20 +4,25 @@ from datetime import date
|
|
|
4
4
|
|
|
5
5
|
from databricks import sql # type: ignore
|
|
6
6
|
|
|
7
|
+
from ...utils import load_file
|
|
7
8
|
from .credentials import DatabricksCredentials
|
|
8
9
|
from .enums import LineageEntity, TagEntity
|
|
9
10
|
from .format import TagMapping
|
|
10
|
-
from .lineage import valid_lineage
|
|
11
11
|
from .utils import build_path, tag_label
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
15
|
_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
17
|
+
_LINEAGE_SQL_PATHS = {
|
|
18
|
+
LineageEntity.COLUMN: "queries/column_lineage.sql",
|
|
19
|
+
LineageEntity.TABLE: "queries/table_lineage.sql",
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_lineage_query(lineage_entity: LineageEntity) -> str:
|
|
24
|
+
filename = _LINEAGE_SQL_PATHS[lineage_entity]
|
|
25
|
+
return load_file(filename, __file__)
|
|
21
26
|
|
|
22
27
|
|
|
23
28
|
class DatabricksSQLClient:
|
|
@@ -95,13 +100,11 @@ class DatabricksSQLClient:
|
|
|
95
100
|
Unfortunately, passing parameters is not always supported. We have to
|
|
96
101
|
format the query beforehand and pass it as plain text for execution.
|
|
97
102
|
"""
|
|
98
|
-
|
|
99
|
-
query =
|
|
100
|
-
|
|
101
|
-
day=day,
|
|
102
|
-
)
|
|
103
|
+
query_template = _load_lineage_query(lineage_entity)
|
|
104
|
+
query = query_template.format(day=day)
|
|
105
|
+
|
|
103
106
|
result = self.execute_sql(query)
|
|
104
107
|
data = []
|
|
105
108
|
for row in result:
|
|
106
109
|
data.append(row.asDict())
|
|
107
|
-
return
|
|
110
|
+
return data
|
|
@@ -52,7 +52,9 @@ def extract_all(**kwargs) -> None:
|
|
|
52
52
|
client = MSSQLClient(credentials=_credentials(kwargs))
|
|
53
53
|
|
|
54
54
|
databases = filter_items(
|
|
55
|
-
client.get_databases(),
|
|
55
|
+
items=client.get_databases(),
|
|
56
|
+
allowed=kwargs.get("db_allowed"),
|
|
57
|
+
blocked=kwargs.get("db_blocked"),
|
|
56
58
|
)
|
|
57
59
|
|
|
58
60
|
query_builder = MSSQLQueryBuilder(
|
|
@@ -91,9 +91,9 @@ columns AS (
|
|
|
91
91
|
LEFT JOIN column_ids AS i
|
|
92
92
|
ON
|
|
93
93
|
(
|
|
94
|
-
c.table_name = i.table_name
|
|
95
|
-
AND c.table_schema = i.schema_name
|
|
96
|
-
AND c.column_name = i.column_name
|
|
94
|
+
c.table_name COLLATE DATABASE_DEFAULT = i.table_name COLLATE DATABASE_DEFAULT
|
|
95
|
+
AND c.table_schema COLLATE DATABASE_DEFAULT = i.schema_name COLLATE DATABASE_DEFAULT
|
|
96
|
+
AND c.column_name COLLATE DATABASE_DEFAULT = i.column_name COLLATE DATABASE_DEFAULT
|
|
97
97
|
)
|
|
98
98
|
)
|
|
99
99
|
|
|
@@ -1,4 +1,9 @@
|
|
|
1
|
-
|
|
1
|
+
/*
|
|
2
|
+
Fetch database information
|
|
3
|
+
|
|
4
|
+
Collation is a set of rules that defines how text data is stored and compared, and it can differ between databases.
|
|
5
|
+
The "COLLATE DATABASE_DEFAULT" is to ensure that text is compared with the same collation.
|
|
6
|
+
*/
|
|
2
7
|
WITH ids AS (
|
|
3
8
|
SELECT DISTINCT
|
|
4
9
|
table_catalog,
|
|
@@ -19,4 +24,4 @@ INNER JOIN ids AS i
|
|
|
19
24
|
LEFT JOIN {database}.sys.sysusers AS u
|
|
20
25
|
ON s.principal_id = u.uid
|
|
21
26
|
LEFT JOIN {database}.sys.databases AS d
|
|
22
|
-
ON i.table_catalog = d.name
|
|
27
|
+
ON i.table_catalog COLLATE DATABASE_DEFAULT = d.name COLLATE DATABASE_DEFAULT
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: castor-extractor
|
|
3
|
-
Version: 0.24.
|
|
3
|
+
Version: 0.24.42
|
|
4
4
|
Summary: Extract your metadata assets.
|
|
5
5
|
Home-page: https://www.castordoc.com/
|
|
6
6
|
License: EULA
|
|
@@ -215,6 +215,24 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
|
|
|
215
215
|
|
|
216
216
|
# Changelog
|
|
217
217
|
|
|
218
|
+
## 0.24.42 - 2025-08-19
|
|
219
|
+
|
|
220
|
+
* Strategy: exclude COLUMNS from extraction
|
|
221
|
+
|
|
222
|
+
## 0.24.41 - 2025-08-19
|
|
223
|
+
|
|
224
|
+
* Sigma: retry on 429 errors when fetching connection paths
|
|
225
|
+
|
|
226
|
+
## 0.24.40 - 2025-08-18
|
|
227
|
+
|
|
228
|
+
* SQLServer: fix database allowlist/blocklist filtering
|
|
229
|
+
|
|
230
|
+
## 0.24.39 - 2025-08-18
|
|
231
|
+
|
|
232
|
+
* Databricks:
|
|
233
|
+
* Fix vanishing owner ID column for tables
|
|
234
|
+
* Deduplicate lineage with SQL to reduce memory use
|
|
235
|
+
|
|
218
236
|
## 0.24.38 - 2025-08-07
|
|
219
237
|
|
|
220
238
|
* Uploader: Support US and EU zones
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
CHANGELOG.md,sha256=
|
|
1
|
+
CHANGELOG.md,sha256=wBOldpLx6AkuwA9zq0ZeOo7J2kvO8OA7mboqb3449PI,19562
|
|
2
2
|
Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
|
|
3
3
|
DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
|
|
4
4
|
LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
|
|
@@ -238,7 +238,7 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
|
|
|
238
238
|
castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
|
|
239
239
|
castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
|
|
240
240
|
castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
|
|
241
|
-
castor_extractor/visualization/powerbi/client/client.py,sha256=
|
|
241
|
+
castor_extractor/visualization/powerbi/client/client.py,sha256=9PRckoGdjfhOjhf5yqWTuNdivXcOC2PMgvcx-3uCh3k,8166
|
|
242
242
|
castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
|
|
243
243
|
castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
|
|
244
244
|
castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
|
|
@@ -277,7 +277,7 @@ castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44j
|
|
|
277
277
|
castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
|
|
278
278
|
castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
|
|
279
279
|
castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
|
|
280
|
-
castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=
|
|
280
|
+
castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=n-5mZWSvzfTwpM5VP_bwlcxcaAwCKEEbpMCG_1KRVP4,3748
|
|
281
281
|
castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
|
|
282
282
|
castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
|
|
283
283
|
castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
|
|
@@ -286,7 +286,7 @@ castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJ
|
|
|
286
286
|
castor_extractor/visualization/strategy/client/client.py,sha256=6DJO0Fh67FXxmwY5h_X9cu5sEq3GhM19b9hwn_fvhSE,9460
|
|
287
287
|
castor_extractor/visualization/strategy/client/credentials.py,sha256=urFfNxWX1JG6wwFMYImufQzHa5g-sgjdlVGzi63owwg,1113
|
|
288
288
|
castor_extractor/visualization/strategy/client/properties.py,sha256=66oBm8Kz6HEQW_jNR5_fAI_O921R2F5yH2Ff3zjtJOk,4500
|
|
289
|
-
castor_extractor/visualization/strategy/extract.py,sha256=
|
|
289
|
+
castor_extractor/visualization/strategy/extract.py,sha256=9Ec48BwiA-5nykhcQiqkSiL_RB034_Ci-ck89YK5Nic,1176
|
|
290
290
|
castor_extractor/visualization/tableau/__init__.py,sha256=eFI_1hjdkxyUiAYiy3szwyuwn3yJ5C_KbpBU0ySJDcQ,138
|
|
291
291
|
castor_extractor/visualization/tableau/assets.py,sha256=HbCRd8VCj1WBEeqg9jwnygnT7xOFJ6PQD7Lq7sV-XR0,635
|
|
292
292
|
castor_extractor/visualization/tableau/client/__init__.py,sha256=P8RKFKOC63WkH5hdEytJOwHS9vzQ8GXreLfXZetmMP8,78
|
|
@@ -337,18 +337,18 @@ castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXk
|
|
|
337
337
|
castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
|
|
338
338
|
castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
|
|
339
339
|
castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
|
|
340
|
-
castor_extractor/warehouse/databricks/client.py,sha256=
|
|
341
|
-
castor_extractor/warehouse/databricks/client_test.py,sha256=
|
|
340
|
+
castor_extractor/warehouse/databricks/client.py,sha256=LzpeVQIOYi_QTfdOHbK6SB4SgxhZ7p9TNxh0Iwfz850,3307
|
|
341
|
+
castor_extractor/warehouse/databricks/client_test.py,sha256=dqEdEAt-6e8CtQ7M2L5vDYkn4JvOjqyqZSFEpQ55WRc,1432
|
|
342
342
|
castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
|
|
343
343
|
castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
|
|
344
344
|
castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
|
|
345
345
|
castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
|
|
346
346
|
castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
|
|
347
347
|
castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
|
|
348
|
-
castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
|
|
349
|
-
castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
|
|
350
348
|
castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
|
|
351
|
-
castor_extractor/warehouse/databricks/
|
|
349
|
+
castor_extractor/warehouse/databricks/queries/column_lineage.sql,sha256=Q8MAZ5N3fNcolTMtRRw2fIrbKgV4ax9StgJgtYMpxNQ,980
|
|
350
|
+
castor_extractor/warehouse/databricks/queries/table_lineage.sql,sha256=5k5jHj11SdGpfMqJEKJihAhd_ngO4kZOZJ8TCPihWDs,786
|
|
351
|
+
castor_extractor/warehouse/databricks/sql_client.py,sha256=oypv_2pomoleXUJJhS8CSKO_ucalQhS9_mcsnsb5wsc,3750
|
|
352
352
|
castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
|
|
353
353
|
castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
|
|
354
354
|
castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
|
|
@@ -422,17 +422,17 @@ castor_extractor/warehouse/snowflake/queries/view_ddl.sql,sha256=eWsci_50cxiYIv3
|
|
|
422
422
|
castor_extractor/warehouse/snowflake/query.py,sha256=C2LTdPwBzMQ_zMncg0Kq4_WkoY7K9as5tvxBDrIOlwI,1763
|
|
423
423
|
castor_extractor/warehouse/sqlserver/__init__.py,sha256=PdOuYznmvKAbfWAm8UdN47MfEsd9jqPi_dDi3WEo1KY,116
|
|
424
424
|
castor_extractor/warehouse/sqlserver/client.py,sha256=Bjfpw96IKAQfWPiU5SZYEDfetwfkqZrnKbQYoStcnZc,2007
|
|
425
|
-
castor_extractor/warehouse/sqlserver/extract.py,sha256
|
|
425
|
+
castor_extractor/warehouse/sqlserver/extract.py,sha256=GbOlSq8JR6HaJZunkfiRxaSt0pbgazQjF8GpgqWWIcU,2294
|
|
426
426
|
castor_extractor/warehouse/sqlserver/queries/.sqlfluff,sha256=yy0KQdz8I_67vnXyX8eeWwOWkxTXvHyVKSVwhURktd8,48
|
|
427
|
-
castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=
|
|
427
|
+
castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=eRILCgdygYRvtfSdxaswIiIYKW-PiJXW2qi3yHtrfns,2913
|
|
428
428
|
castor_extractor/warehouse/sqlserver/queries/database.sql,sha256=4dPeBCn85MEOXr1f-DPXxiI3RvvoE_1n8lsbTs26E0I,150
|
|
429
|
-
castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=
|
|
430
|
-
castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=
|
|
429
|
+
castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=Zp4G86FJ_Be8Zqvdlu7K8DqmsUL62kxbwaUk5asZ0V4,881
|
|
430
|
+
castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=YwFhHc6rGbszqQt7Izh7EngVwrrBoEZ9kniuWXNtGco,2837
|
|
431
431
|
castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
|
|
432
432
|
castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
|
|
433
433
|
castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
|
|
434
|
-
castor_extractor-0.24.
|
|
435
|
-
castor_extractor-0.24.
|
|
436
|
-
castor_extractor-0.24.
|
|
437
|
-
castor_extractor-0.24.
|
|
438
|
-
castor_extractor-0.24.
|
|
434
|
+
castor_extractor-0.24.42.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
|
|
435
|
+
castor_extractor-0.24.42.dist-info/METADATA,sha256=yzhHBGi8q9y-Au6LpFDemdYkmNQ2v0HNXZrlEaRO_60,27015
|
|
436
|
+
castor_extractor-0.24.42.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
437
|
+
castor_extractor-0.24.42.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
|
|
438
|
+
castor_extractor-0.24.42.dist-info/RECORD,,
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
from typing import Iterable, Optional
|
|
2
|
-
|
|
3
|
-
from .enums import LineageEntity
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class LineageProcessor:
|
|
7
|
-
"""
|
|
8
|
-
helper class that handles lineage deduplication and filtering
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, lineage_entity: LineageEntity):
|
|
12
|
-
self.lineage_entity = lineage_entity
|
|
13
|
-
|
|
14
|
-
self.lineage: dict[tuple[str, str], dict] = dict()
|
|
15
|
-
|
|
16
|
-
def _parent_path(self, link) -> Optional[str]:
|
|
17
|
-
if self.lineage_entity == LineageEntity.TABLE:
|
|
18
|
-
return link["source_table_full_name"]
|
|
19
|
-
|
|
20
|
-
source_table = link["source_table_full_name"]
|
|
21
|
-
source_column = link["source_column_name"]
|
|
22
|
-
if not (source_table and source_column):
|
|
23
|
-
return None
|
|
24
|
-
|
|
25
|
-
return f"{source_table}.{source_column}"
|
|
26
|
-
|
|
27
|
-
def _child_path(self, link) -> Optional[str]:
|
|
28
|
-
if self.lineage_entity == LineageEntity.TABLE:
|
|
29
|
-
return link["target_table_full_name"]
|
|
30
|
-
|
|
31
|
-
target_table = link["target_table_full_name"]
|
|
32
|
-
target_column = link["target_column_name"]
|
|
33
|
-
if not (target_table and target_column):
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
return f"{target_table}.{target_column}"
|
|
37
|
-
|
|
38
|
-
def add(self, link: dict) -> None:
|
|
39
|
-
"""
|
|
40
|
-
If the parent and child paths are valid, keeps the most recent lineage
|
|
41
|
-
link in the `self.lineage` map.
|
|
42
|
-
"""
|
|
43
|
-
parent = self._parent_path(link)
|
|
44
|
-
child = self._child_path(link)
|
|
45
|
-
timestamp = link["event_time"]
|
|
46
|
-
|
|
47
|
-
if not (parent and child and parent != child):
|
|
48
|
-
return
|
|
49
|
-
|
|
50
|
-
key = (parent, child)
|
|
51
|
-
if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
|
|
52
|
-
return
|
|
53
|
-
|
|
54
|
-
self.lineage[key] = link
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def valid_lineage(
|
|
58
|
-
lineage: Iterable[dict], lineage_entity: LineageEntity
|
|
59
|
-
) -> list[dict]:
|
|
60
|
-
"""
|
|
61
|
-
Filters out self-lineage or lineage with a missing source or target path,
|
|
62
|
-
then deduplicates by picking the link with the most recent event timestamp.
|
|
63
|
-
"""
|
|
64
|
-
deduplicated_lineage = LineageProcessor(lineage_entity)
|
|
65
|
-
|
|
66
|
-
for link in lineage:
|
|
67
|
-
deduplicated_lineage.add(link)
|
|
68
|
-
|
|
69
|
-
return list(deduplicated_lineage.lineage.values())
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
from .enums import LineageEntity
|
|
2
|
-
from .lineage import LineageProcessor, valid_lineage
|
|
3
|
-
|
|
4
|
-
_OLDER_DATE = "2025-01-01 00:00:01.0"
|
|
5
|
-
_CLOSER_DATE = "2025-01-01 02:02:02.0"
|
|
6
|
-
|
|
7
|
-
_TABLE_LINEAGES = [
|
|
8
|
-
{
|
|
9
|
-
"source_table_full_name": "a.b.source",
|
|
10
|
-
"target_table_full_name": "a.b.target",
|
|
11
|
-
"event_time": _CLOSER_DATE,
|
|
12
|
-
"other": "more recent stuff",
|
|
13
|
-
},
|
|
14
|
-
{
|
|
15
|
-
"source_table_full_name": "a.b.source",
|
|
16
|
-
"target_table_full_name": "a.b.target",
|
|
17
|
-
"event_time": _OLDER_DATE,
|
|
18
|
-
"other": "stuff that's too old",
|
|
19
|
-
},
|
|
20
|
-
{
|
|
21
|
-
"source_table_full_name": "no target",
|
|
22
|
-
"target_table_full_name": None,
|
|
23
|
-
"event_time": _CLOSER_DATE,
|
|
24
|
-
},
|
|
25
|
-
{
|
|
26
|
-
"source_table_full_name": None,
|
|
27
|
-
"target_table_full_name": "no source",
|
|
28
|
-
"event_time": _CLOSER_DATE,
|
|
29
|
-
},
|
|
30
|
-
]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
_COLUMN_LINEAGES = [
|
|
34
|
-
{
|
|
35
|
-
"source_table_full_name": "a.b.source",
|
|
36
|
-
"source_column_name": "src_col",
|
|
37
|
-
"target_table_full_name": "a.b.target",
|
|
38
|
-
"target_column_name": "trgt_col",
|
|
39
|
-
"event_time": _OLDER_DATE,
|
|
40
|
-
"other": "old stuff",
|
|
41
|
-
},
|
|
42
|
-
{
|
|
43
|
-
"source_table_full_name": "a.b.source",
|
|
44
|
-
"source_column_name": "src_col",
|
|
45
|
-
"target_table_full_name": "a.b.target",
|
|
46
|
-
"target_column_name": "trgt_col",
|
|
47
|
-
"event_time": _CLOSER_DATE,
|
|
48
|
-
"other": "newer stuff",
|
|
49
|
-
},
|
|
50
|
-
{
|
|
51
|
-
"source_table_full_name": "a.b.toto",
|
|
52
|
-
"source_column_name": "toto_col",
|
|
53
|
-
"target_table_full_name": "a.b.tata",
|
|
54
|
-
"target_column_name": "tata_col",
|
|
55
|
-
"event_time": _OLDER_DATE,
|
|
56
|
-
},
|
|
57
|
-
{
|
|
58
|
-
"source_table_full_name": "a.b.source",
|
|
59
|
-
"source_column_name": "a.b.source",
|
|
60
|
-
"target_table_full_name": None,
|
|
61
|
-
"target_column_name": None,
|
|
62
|
-
"event_time": _CLOSER_DATE,
|
|
63
|
-
},
|
|
64
|
-
]
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def test_valid_lineage():
|
|
68
|
-
table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
|
|
69
|
-
|
|
70
|
-
assert len(table_links) == 1
|
|
71
|
-
assert table_links[0]["source_table_full_name"] == "a.b.source"
|
|
72
|
-
assert table_links[0]["target_table_full_name"] == "a.b.target"
|
|
73
|
-
assert table_links[0]["event_time"] == _CLOSER_DATE
|
|
74
|
-
assert table_links[0]["other"] == "more recent stuff"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def test_LineageLinks_add():
|
|
78
|
-
deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
|
|
79
|
-
for link in _COLUMN_LINEAGES:
|
|
80
|
-
deduplicated_lineage.add(link)
|
|
81
|
-
|
|
82
|
-
lineage = deduplicated_lineage.lineage
|
|
83
|
-
assert len(lineage) == 2
|
|
84
|
-
assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
|
|
85
|
-
assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
|
|
86
|
-
assert (
|
|
87
|
-
lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
|
|
88
|
-
== "newer stuff"
|
|
89
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|