castor-extractor 0.24.38__py3-none-any.whl → 0.24.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.24.42 - 2025-08-19
4
+
5
+ * Strategy: exclude COLUMNS from extraction
6
+
7
+ ## 0.24.41 - 2025-08-19
8
+
9
+ * Sigma: retry on 429 errors when fetching connection paths
10
+
11
+ ## 0.24.40 - 2025-08-18
12
+
13
+ * SQLServer: fix database allowlist/blocklist filtering
14
+
15
+ ## 0.24.39 - 2025-08-18
16
+
17
+ * Databricks:
18
+ * Fix vanishing owner ID column for tables
19
+ * Deduplicate lineage with SQL to reduce memory use
20
+
3
21
  ## 0.24.38 - 2025-08-07
4
22
 
5
23
  * Uploader: Support US and EU zones
@@ -28,6 +28,7 @@ POWERBI_DEFAULT_TIMEOUT_S = 30
28
28
  METADATA_BATCH_SIZE = 100
29
29
  POWERBI_SCAN_STATUS_DONE = "Succeeded"
30
30
  POWERBI_SCAN_SLEEP_S = 1
31
+ POWERBI_SCAN_TIMEOUT_S = 60
31
32
 
32
33
  MAX_RETRY_PAGES = 1
33
34
  RETRY_PAGES_TIMEOUT_MS = 35 * 1000 # 35 seconds
@@ -142,7 +143,7 @@ class PowerbiClient(APIClient):
142
143
  endpoint = self.endpoint_factory.metadata_scan_status(scan_id)
143
144
  total_waiting_time_s = 0
144
145
 
145
- while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
146
+ while total_waiting_time_s < POWERBI_SCAN_TIMEOUT_S:
146
147
  try:
147
148
  result = self._get(endpoint)
148
149
  except HTTPError as e:
@@ -1,6 +1,8 @@
1
1
  import logging
2
+ from http import HTTPStatus
2
3
  from typing import TYPE_CHECKING, Callable, Iterator
3
4
 
5
+ from ....utils import retry_request
4
6
  from .endpoints import SigmaEndpointFactory
5
7
 
6
8
  if TYPE_CHECKING:
@@ -8,6 +10,9 @@ if TYPE_CHECKING:
8
10
 
9
11
  logger = logging.getLogger(__name__)
10
12
 
13
+ SIGMA_CONNECTION_PATH_MAX_RETRY = 1
14
+ SIGMA_CONNECTION_PATH_SLEEP_MS = 30_000 # 30 seconds
15
+
11
16
 
12
17
  class SigmaSourcesTransformer:
13
18
  """Retrieves asset sources and enhances them with additional information."""
@@ -15,6 +20,17 @@ class SigmaSourcesTransformer:
15
20
  def __init__(self, api_client: "SigmaClient"):
16
21
  self.api_client = api_client
17
22
 
23
+ @retry_request(
24
+ status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
25
+ max_retries=SIGMA_CONNECTION_PATH_MAX_RETRY,
26
+ base_ms=SIGMA_CONNECTION_PATH_SLEEP_MS,
27
+ )
28
+ def _get_connection_path(self, table_id: str) -> dict:
29
+ """Retrieves the connection path for a given table id"""
30
+ return self.api_client._get(
31
+ endpoint=SigmaEndpointFactory.connection_path(table_id)
32
+ )
33
+
18
34
  def _map_table_id_to_connection_path(
19
35
  self, all_sources: list
20
36
  ) -> dict[str, dict]:
@@ -29,9 +45,7 @@ class SigmaSourcesTransformer:
29
45
  }
30
46
 
31
47
  return {
32
- table_id: self.api_client._get(
33
- endpoint=SigmaEndpointFactory.connection_path(table_id)
34
- )
48
+ table_id: self._get_connection_path(table_id)
35
49
  for table_id in unique_table_ids
36
50
  }
37
51
 
@@ -22,7 +22,7 @@ def iterate_all_data(
22
22
  ) -> Iterable[tuple[str, Union[list, dict]]]:
23
23
  """Iterate over the extracted data from Strategy"""
24
24
 
25
- for asset in StrategyAsset:
25
+ for asset in StrategyAsset.mandatory:
26
26
  logger.info(f"Extracting {asset.value.upper()} from REST API")
27
27
  data = client.fetch(asset)
28
28
  yield asset.name.lower(), list(deep_serialize(data))
@@ -46,12 +46,11 @@ class DatabricksClient:
46
46
 
47
47
  @staticmethod
48
48
  def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
49
+ """Matches the table's owner email to an ID, or None if not found."""
49
50
  table_owner_email = table.get("owner_email")
50
- if not table_owner_email:
51
- return table
52
- owner_external_id = user_mapping.get(table_owner_email)
53
- if not owner_external_id:
54
- return table
51
+ owner_external_id = (
52
+ user_mapping.get(table_owner_email) if table_owner_email else None
53
+ )
55
54
  return {**table, "owner_external_id": owner_external_id}
56
55
 
57
56
  @staticmethod
@@ -36,5 +36,6 @@ def test_DatabricksClient__match_table_with_user():
36
36
  assert table_with_owner == {**table, "owner_external_id": 3}
37
37
 
38
38
  table_without_owner = {"id": 1, "owner_email": None}
39
+ expected = {"id": 1, "owner_email": None, "owner_external_id": None}
39
40
  actual = client._match_table_with_user(table_without_owner, user_mapping)
40
- assert actual == table_without_owner
41
+ assert actual == expected
@@ -0,0 +1,25 @@
1
+ /*
2
+ Selects all column lineage events for the given day.
3
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
4
+
5
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
6
+ */
7
+ WITH deduplicated_lineage AS (
8
+ SELECT *,
9
+ ROW_NUMBER() OVER (
10
+ PARTITION BY source_table_full_name, source_column_name, target_table_full_name, target_column_name
11
+ ORDER BY event_time DESC
12
+ ) AS rank
13
+ FROM system.access.column_lineage
14
+ WHERE
15
+ TRUE
16
+ AND event_date = DATE('{day}')
17
+ AND source_table_full_name IS NOT NULL
18
+ AND source_column_name IS NOT NULL
19
+ AND target_table_full_name IS NOT NULL
20
+ AND target_column_name IS NOT NULL
21
+ AND CONCAT(source_table_full_name, '.', source_column_name) != CONCAT(target_table_full_name, '.', target_column_name)
22
+ )
23
+ SELECT *
24
+ FROM deduplicated_lineage
25
+ WHERE rank = 1
@@ -0,0 +1,23 @@
1
+ /*
2
+ Selects all table lineage events for the given day.
3
+ This excludes self-lineage and deduplicates (parent, child) pairs to keep only the most recent lineage event.
4
+
5
+ Passing parameters is not always supported, so the query must be Python-formatted to set the date.
6
+ */
7
+ WITH deduplicated_lineage AS (
8
+ SELECT *,
9
+ ROW_NUMBER() OVER (
10
+ PARTITION BY source_table_full_name, target_table_full_name
11
+ ORDER BY event_time DESC
12
+ ) AS rank
13
+ FROM system.access.table_lineage
14
+ WHERE
15
+ TRUE
16
+ AND event_date = DATE('{day}')
17
+ AND source_table_full_name IS NOT NULL
18
+ AND target_table_full_name IS NOT NULL
19
+ AND source_table_full_name != target_table_full_name
20
+ )
21
+ SELECT *
22
+ FROM deduplicated_lineage
23
+ WHERE rank = 1
@@ -4,20 +4,25 @@ from datetime import date
4
4
 
5
5
  from databricks import sql # type: ignore
6
6
 
7
+ from ...utils import load_file
7
8
  from .credentials import DatabricksCredentials
8
9
  from .enums import LineageEntity, TagEntity
9
10
  from .format import TagMapping
10
- from .lineage import valid_lineage
11
11
  from .utils import build_path, tag_label
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
15
  _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
16
16
 
17
- _LINEAGE_SQL_TPL = """
18
- SELECT * FROM system.access.{table_name}
19
- WHERE event_date = DATE('{day}')
20
- """
17
+ _LINEAGE_SQL_PATHS = {
18
+ LineageEntity.COLUMN: "queries/column_lineage.sql",
19
+ LineageEntity.TABLE: "queries/table_lineage.sql",
20
+ }
21
+
22
+
23
+ def _load_lineage_query(lineage_entity: LineageEntity) -> str:
24
+ filename = _LINEAGE_SQL_PATHS[lineage_entity]
25
+ return load_file(filename, __file__)
21
26
 
22
27
 
23
28
  class DatabricksSQLClient:
@@ -95,13 +100,11 @@ class DatabricksSQLClient:
95
100
  Unfortunately, passing parameters is not always supported. We have to
96
101
  format the query beforehand and pass it as plain text for execution.
97
102
  """
98
- table_name = f"{lineage_entity.value.lower()}_lineage"
99
- query = _LINEAGE_SQL_TPL.format(
100
- table_name=table_name,
101
- day=day,
102
- )
103
+ query_template = _load_lineage_query(lineage_entity)
104
+ query = query_template.format(day=day)
105
+
103
106
  result = self.execute_sql(query)
104
107
  data = []
105
108
  for row in result:
106
109
  data.append(row.asDict())
107
- return valid_lineage(data, lineage_entity)
110
+ return data
@@ -52,7 +52,9 @@ def extract_all(**kwargs) -> None:
52
52
  client = MSSQLClient(credentials=_credentials(kwargs))
53
53
 
54
54
  databases = filter_items(
55
- client.get_databases(), kwargs.get("allowed"), kwargs.get("blocked")
55
+ items=client.get_databases(),
56
+ allowed=kwargs.get("db_allowed"),
57
+ blocked=kwargs.get("db_blocked"),
56
58
  )
57
59
 
58
60
  query_builder = MSSQLQueryBuilder(
@@ -91,9 +91,9 @@ columns AS (
91
91
  LEFT JOIN column_ids AS i
92
92
  ON
93
93
  (
94
- c.table_name = i.table_name
95
- AND c.table_schema = i.schema_name
96
- AND c.column_name = i.column_name
94
+ c.table_name COLLATE DATABASE_DEFAULT = i.table_name COLLATE DATABASE_DEFAULT
95
+ AND c.table_schema COLLATE DATABASE_DEFAULT = i.schema_name COLLATE DATABASE_DEFAULT
96
+ AND c.column_name COLLATE DATABASE_DEFAULT = i.column_name COLLATE DATABASE_DEFAULT
97
97
  )
98
98
  )
99
99
 
@@ -1,4 +1,9 @@
1
- -- Fetch database information
1
+ /*
2
+ Fetch database information
3
+
4
+ Collation is a set of rules that defines how text data is stored and compared, and it can differ between databases.
5
+ The "COLLATE DATABASE_DEFAULT" is to ensure that text is compared with the same collation.
6
+ */
2
7
  WITH ids AS (
3
8
  SELECT DISTINCT
4
9
  table_catalog,
@@ -19,4 +24,4 @@ INNER JOIN ids AS i
19
24
  LEFT JOIN {database}.sys.sysusers AS u
20
25
  ON s.principal_id = u.uid
21
26
  LEFT JOIN {database}.sys.databases AS d
22
- ON i.table_catalog = d.name
27
+ ON i.table_catalog COLLATE DATABASE_DEFAULT = d.name COLLATE DATABASE_DEFAULT
@@ -92,7 +92,7 @@ meta AS (
92
92
  FROM
93
93
  {database}.information_schema.tables AS t
94
94
  LEFT JOIN {database}.sys.databases AS db
95
- ON t.table_catalog = db.name
95
+ ON t.table_catalog COLLATE DATABASE_DEFAULT = db.name COLLATE DATABASE_DEFAULT
96
96
  )
97
97
 
98
98
  SELECT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.24.38
3
+ Version: 0.24.42
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -215,6 +215,24 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
215
215
 
216
216
  # Changelog
217
217
 
218
+ ## 0.24.42 - 2025-08-19
219
+
220
+ * Strategy: exclude COLUMNS from extraction
221
+
222
+ ## 0.24.41 - 2025-08-19
223
+
224
+ * Sigma: retry on 429 errors when fetching connection paths
225
+
226
+ ## 0.24.40 - 2025-08-18
227
+
228
+ * SQLServer: fix database allowlist/blocklist filtering
229
+
230
+ ## 0.24.39 - 2025-08-18
231
+
232
+ * Databricks:
233
+ * Fix vanishing owner ID column for tables
234
+ * Deduplicate lineage with SQL to reduce memory use
235
+
218
236
  ## 0.24.38 - 2025-08-07
219
237
 
220
238
  * Uploader: Support US and EU zones
@@ -1,4 +1,4 @@
1
- CHANGELOG.md,sha256=cdsC0cY-q3t1K8a-kXhK3OY6y-yrF8uICKb8OqJ3SJo,19185
1
+ CHANGELOG.md,sha256=wBOldpLx6AkuwA9zq0ZeOo7J2kvO8OA7mboqb3449PI,19562
2
2
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
3
3
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
4
4
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -238,7 +238,7 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
238
238
  castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
239
239
  castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
240
240
  castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
241
- castor_extractor/visualization/powerbi/client/client.py,sha256=CWCYmj2spYin74qq9T8v2ZJ5TcxBuEy5EjArhCVZjLM,8141
241
+ castor_extractor/visualization/powerbi/client/client.py,sha256=9PRckoGdjfhOjhf5yqWTuNdivXcOC2PMgvcx-3uCh3k,8166
242
242
  castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
243
243
  castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
244
244
  castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
@@ -277,7 +277,7 @@ castor_extractor/visualization/sigma/client/client_test.py,sha256=ae0ZOvKutCm44j
277
277
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
278
278
  castor_extractor/visualization/sigma/client/endpoints.py,sha256=i7KTKnl2Os6752CdtJl0vPSC_Z6JxmacodV_saOnce0,1662
279
279
  castor_extractor/visualization/sigma/client/pagination.py,sha256=2bFA7GiBUUasFtHJKA90516d283p7Pg50-4zw6Fwt8I,726
280
- castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=mRupzxjtjDqELIouHF0egBkgslDmn5Y4uqO_sbUGCNs,3244
280
+ castor_extractor/visualization/sigma/client/sources_transformer.py,sha256=n-5mZWSvzfTwpM5VP_bwlcxcaAwCKEEbpMCG_1KRVP4,3748
281
281
  castor_extractor/visualization/sigma/client/sources_transformer_test.py,sha256=06yUHXyv65amXLKXhix6K3kkVc1kpBqSjIYcxbyMI4Y,2766
282
282
  castor_extractor/visualization/sigma/extract.py,sha256=poTh70Xm2D6BwbdGApLkjXy6-t4iZnOoMB5DPfaTLEI,2929
283
283
  castor_extractor/visualization/strategy/__init__.py,sha256=HOMv4JxqF5ZmViWi-pDE-PSXJRLTdXal_jtpHG_rlR8,123
@@ -286,7 +286,7 @@ castor_extractor/visualization/strategy/client/__init__.py,sha256=XWP0yF5j6JefDJ
286
286
  castor_extractor/visualization/strategy/client/client.py,sha256=6DJO0Fh67FXxmwY5h_X9cu5sEq3GhM19b9hwn_fvhSE,9460
287
287
  castor_extractor/visualization/strategy/client/credentials.py,sha256=urFfNxWX1JG6wwFMYImufQzHa5g-sgjdlVGzi63owwg,1113
288
288
  castor_extractor/visualization/strategy/client/properties.py,sha256=66oBm8Kz6HEQW_jNR5_fAI_O921R2F5yH2Ff3zjtJOk,4500
289
- castor_extractor/visualization/strategy/extract.py,sha256=2fBuvS2xiOGXRpxXnZsE_C3en6t1-BlM5TbusjHyEkg,1166
289
+ castor_extractor/visualization/strategy/extract.py,sha256=9Ec48BwiA-5nykhcQiqkSiL_RB034_Ci-ck89YK5Nic,1176
290
290
  castor_extractor/visualization/tableau/__init__.py,sha256=eFI_1hjdkxyUiAYiy3szwyuwn3yJ5C_KbpBU0ySJDcQ,138
291
291
  castor_extractor/visualization/tableau/assets.py,sha256=HbCRd8VCj1WBEeqg9jwnygnT7xOFJ6PQD7Lq7sV-XR0,635
292
292
  castor_extractor/visualization/tableau/client/__init__.py,sha256=P8RKFKOC63WkH5hdEytJOwHS9vzQ8GXreLfXZetmMP8,78
@@ -337,18 +337,18 @@ castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXk
337
337
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
338
338
  castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
339
339
  castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
340
- castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
341
- castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
340
+ castor_extractor/warehouse/databricks/client.py,sha256=LzpeVQIOYi_QTfdOHbK6SB4SgxhZ7p9TNxh0Iwfz850,3307
341
+ castor_extractor/warehouse/databricks/client_test.py,sha256=dqEdEAt-6e8CtQ7M2L5vDYkn4JvOjqyqZSFEpQ55WRc,1432
342
342
  castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
343
343
  castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
344
344
  castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
345
345
  castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
346
346
  castor_extractor/warehouse/databricks/format.py,sha256=S3BOcwJubc1pyKr-li26uftUUfsjfrm5Qf4LqmElXVk,6736
347
347
  castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
348
- castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
349
- castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
350
348
  castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
351
- castor_extractor/warehouse/databricks/sql_client.py,sha256=BchHMNqHPtZsJWhj2XYq3QVVTj3XfKhzhhPTJng8vXo,3656
349
+ castor_extractor/warehouse/databricks/queries/column_lineage.sql,sha256=Q8MAZ5N3fNcolTMtRRw2fIrbKgV4ax9StgJgtYMpxNQ,980
350
+ castor_extractor/warehouse/databricks/queries/table_lineage.sql,sha256=5k5jHj11SdGpfMqJEKJihAhd_ngO4kZOZJ8TCPihWDs,786
351
+ castor_extractor/warehouse/databricks/sql_client.py,sha256=oypv_2pomoleXUJJhS8CSKO_ucalQhS9_mcsnsb5wsc,3750
352
352
  castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
353
353
  castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
354
354
  castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
@@ -422,17 +422,17 @@ castor_extractor/warehouse/snowflake/queries/view_ddl.sql,sha256=eWsci_50cxiYIv3
422
422
  castor_extractor/warehouse/snowflake/query.py,sha256=C2LTdPwBzMQ_zMncg0Kq4_WkoY7K9as5tvxBDrIOlwI,1763
423
423
  castor_extractor/warehouse/sqlserver/__init__.py,sha256=PdOuYznmvKAbfWAm8UdN47MfEsd9jqPi_dDi3WEo1KY,116
424
424
  castor_extractor/warehouse/sqlserver/client.py,sha256=Bjfpw96IKAQfWPiU5SZYEDfetwfkqZrnKbQYoStcnZc,2007
425
- castor_extractor/warehouse/sqlserver/extract.py,sha256=-LoHY5wAGJk4vutrO3N0_PaRqts7rkEn7pADRHzoxiI,2249
425
+ castor_extractor/warehouse/sqlserver/extract.py,sha256=GbOlSq8JR6HaJZunkfiRxaSt0pbgazQjF8GpgqWWIcU,2294
426
426
  castor_extractor/warehouse/sqlserver/queries/.sqlfluff,sha256=yy0KQdz8I_67vnXyX8eeWwOWkxTXvHyVKSVwhURktd8,48
427
- castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=_K5OS63N7fM7kGPudnnjJEnIyaxR1xE2hoZgnJ_A3p8,2763
427
+ castor_extractor/warehouse/sqlserver/queries/column.sql,sha256=eRILCgdygYRvtfSdxaswIiIYKW-PiJXW2qi3yHtrfns,2913
428
428
  castor_extractor/warehouse/sqlserver/queries/database.sql,sha256=4dPeBCn85MEOXr1f-DPXxiI3RvvoE_1n8lsbTs26E0I,150
429
- castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=UR3eTiYw7Iq5-GukelnNg_uq6haZ_dwg_SedZfOWUoA,619
430
- castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=4RgeSkHDWTWRyU2iLxaBR0KuSwIBvb3GbQGdkJYXbn0,2787
429
+ castor_extractor/warehouse/sqlserver/queries/schema.sql,sha256=Zp4G86FJ_Be8Zqvdlu7K8DqmsUL62kxbwaUk5asZ0V4,881
430
+ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=YwFhHc6rGbszqQt7Izh7EngVwrrBoEZ9kniuWXNtGco,2837
431
431
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
432
432
  castor_extractor/warehouse/sqlserver/query.py,sha256=7sW8cK3JzxPt6faTJ7e4lk9tE4fo_AeCymI-LqsSols,1276
433
433
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
434
- castor_extractor-0.24.38.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
435
- castor_extractor-0.24.38.dist-info/METADATA,sha256=iCWUVbgDFS721szJ8kUGMA58Va3Roq3WmyGinZgnHMw,26638
436
- castor_extractor-0.24.38.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
437
- castor_extractor-0.24.38.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
438
- castor_extractor-0.24.38.dist-info/RECORD,,
434
+ castor_extractor-0.24.42.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
435
+ castor_extractor-0.24.42.dist-info/METADATA,sha256=yzhHBGi8q9y-Au6LpFDemdYkmNQ2v0HNXZrlEaRO_60,27015
436
+ castor_extractor-0.24.42.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
437
+ castor_extractor-0.24.42.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
438
+ castor_extractor-0.24.42.dist-info/RECORD,,
@@ -1,69 +0,0 @@
1
- from typing import Iterable, Optional
2
-
3
- from .enums import LineageEntity
4
-
5
-
6
- class LineageProcessor:
7
- """
8
- helper class that handles lineage deduplication and filtering
9
- """
10
-
11
- def __init__(self, lineage_entity: LineageEntity):
12
- self.lineage_entity = lineage_entity
13
-
14
- self.lineage: dict[tuple[str, str], dict] = dict()
15
-
16
- def _parent_path(self, link) -> Optional[str]:
17
- if self.lineage_entity == LineageEntity.TABLE:
18
- return link["source_table_full_name"]
19
-
20
- source_table = link["source_table_full_name"]
21
- source_column = link["source_column_name"]
22
- if not (source_table and source_column):
23
- return None
24
-
25
- return f"{source_table}.{source_column}"
26
-
27
- def _child_path(self, link) -> Optional[str]:
28
- if self.lineage_entity == LineageEntity.TABLE:
29
- return link["target_table_full_name"]
30
-
31
- target_table = link["target_table_full_name"]
32
- target_column = link["target_column_name"]
33
- if not (target_table and target_column):
34
- return None
35
-
36
- return f"{target_table}.{target_column}"
37
-
38
- def add(self, link: dict) -> None:
39
- """
40
- If the parent and child paths are valid, keeps the most recent lineage
41
- link in the `self.lineage` map.
42
- """
43
- parent = self._parent_path(link)
44
- child = self._child_path(link)
45
- timestamp = link["event_time"]
46
-
47
- if not (parent and child and parent != child):
48
- return
49
-
50
- key = (parent, child)
51
- if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
52
- return
53
-
54
- self.lineage[key] = link
55
-
56
-
57
- def valid_lineage(
58
- lineage: Iterable[dict], lineage_entity: LineageEntity
59
- ) -> list[dict]:
60
- """
61
- Filters out self-lineage or lineage with a missing source or target path,
62
- then deduplicates by picking the link with the most recent event timestamp.
63
- """
64
- deduplicated_lineage = LineageProcessor(lineage_entity)
65
-
66
- for link in lineage:
67
- deduplicated_lineage.add(link)
68
-
69
- return list(deduplicated_lineage.lineage.values())
@@ -1,89 +0,0 @@
1
- from .enums import LineageEntity
2
- from .lineage import LineageProcessor, valid_lineage
3
-
4
- _OLDER_DATE = "2025-01-01 00:00:01.0"
5
- _CLOSER_DATE = "2025-01-01 02:02:02.0"
6
-
7
- _TABLE_LINEAGES = [
8
- {
9
- "source_table_full_name": "a.b.source",
10
- "target_table_full_name": "a.b.target",
11
- "event_time": _CLOSER_DATE,
12
- "other": "more recent stuff",
13
- },
14
- {
15
- "source_table_full_name": "a.b.source",
16
- "target_table_full_name": "a.b.target",
17
- "event_time": _OLDER_DATE,
18
- "other": "stuff that's too old",
19
- },
20
- {
21
- "source_table_full_name": "no target",
22
- "target_table_full_name": None,
23
- "event_time": _CLOSER_DATE,
24
- },
25
- {
26
- "source_table_full_name": None,
27
- "target_table_full_name": "no source",
28
- "event_time": _CLOSER_DATE,
29
- },
30
- ]
31
-
32
-
33
- _COLUMN_LINEAGES = [
34
- {
35
- "source_table_full_name": "a.b.source",
36
- "source_column_name": "src_col",
37
- "target_table_full_name": "a.b.target",
38
- "target_column_name": "trgt_col",
39
- "event_time": _OLDER_DATE,
40
- "other": "old stuff",
41
- },
42
- {
43
- "source_table_full_name": "a.b.source",
44
- "source_column_name": "src_col",
45
- "target_table_full_name": "a.b.target",
46
- "target_column_name": "trgt_col",
47
- "event_time": _CLOSER_DATE,
48
- "other": "newer stuff",
49
- },
50
- {
51
- "source_table_full_name": "a.b.toto",
52
- "source_column_name": "toto_col",
53
- "target_table_full_name": "a.b.tata",
54
- "target_column_name": "tata_col",
55
- "event_time": _OLDER_DATE,
56
- },
57
- {
58
- "source_table_full_name": "a.b.source",
59
- "source_column_name": "a.b.source",
60
- "target_table_full_name": None,
61
- "target_column_name": None,
62
- "event_time": _CLOSER_DATE,
63
- },
64
- ]
65
-
66
-
67
- def test_valid_lineage():
68
- table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
69
-
70
- assert len(table_links) == 1
71
- assert table_links[0]["source_table_full_name"] == "a.b.source"
72
- assert table_links[0]["target_table_full_name"] == "a.b.target"
73
- assert table_links[0]["event_time"] == _CLOSER_DATE
74
- assert table_links[0]["other"] == "more recent stuff"
75
-
76
-
77
- def test_LineageLinks_add():
78
- deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
79
- for link in _COLUMN_LINEAGES:
80
- deduplicated_lineage.add(link)
81
-
82
- lineage = deduplicated_lineage.lineage
83
- assert len(lineage) == 2
84
- assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
85
- assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
86
- assert (
87
- lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
88
- == "newer stuff"
89
- )