castor-extractor 0.22.1__py3-none-any.whl → 0.22.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

CHANGELOG.md CHANGED
@@ -1,6 +1,22 @@
 
  # Changelog
 
+ ## 0.22.5 - 2025-01-09
+
+ * Databricks: validate and deduplicate lineage links
+
+ ## 0.22.4 - 2025-01-08
+
+ * ThoughtSpot: extract answers
+
+ ## 0.22.3 - 2024-12-10
+
+ * Databricks: extract lineage from system tables
+
+ ## 0.22.2 - 2024-12-06
+
+ * Sigma: multithreading to retrieve lineage
+
  ## 0.22.1 - 2024-12-05
 
  * Salesforce: deduplicate tables
@@ -1,9 +1,11 @@
  from collections.abc import Iterator
+ from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from http import HTTPStatus
  from typing import Callable, Optional
 
  import requests
+ from pydantic import BaseModel
 
  from ....utils import (
  APIClient,
@@ -12,6 +14,7 @@ from ....utils import (
  build_url,
  fetch_all_pages,
  handle_response,
+ retry,
  )
  from ..assets import SigmaAsset
  from .credentials import SigmaCredentials
@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
  )
 
  _AUTH_TIMEOUT_S = 60
- _SIGMA_TIMEOUT = 120
+ _SIGMA_TIMEOUT_S = 300
 
  _SIGMA_HEADERS = {
  "Content-Type": _CONTENT_TYPE,
@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
  max_errors=_VOLUME_IGNORED,
  status_codes=_IGNORED_ERROR_CODES,
  )
+ _THREADS_LINEAGE = 10 # empirically found; hit the rate limit with 20 workers
+ _RETRY_NUMBER = 1
+ _RETRY_BASE_MS = 60_000
+
+
+ class LineageContext(BaseModel):
+ """all info needed to build the endpoint for lineage retrieval"""
+
+ workbook_id: str
+ element_id: str
+
+
+ class Lineage(BaseModel):
+ """holds response from lineage API and context used to retrieve it"""
+
+ lineage: dict
+ context: LineageContext
 
 
  class SigmaBearerAuth(BearerAuth):
@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
  host=credentials.host,
  auth=auth,
  headers=_SIGMA_HEADERS,
- timeout=_SIGMA_TIMEOUT,
+ timeout=_SIGMA_TIMEOUT_S,
  safe_mode=safe_mode or SIGMA_SAFE_MODE,
  )
 
@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
  page=page, workbook_id=workbook_id
  )
 
- def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+ @retry(
+ (ConnectionError,),
+ max_retries=_RETRY_NUMBER,
+ base_ms=_RETRY_BASE_MS,
+ log_exc_info=True,
+ )
+ def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+ """
+ return the lineage from API and other ids needed to characterize
+ lineage in castor
+ """
+ workbook_id = lineage_context.workbook_id
+ element_id = lineage_context.element_id
+ endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+ return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+ @staticmethod
+ def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+ """
+ Helper function to prepare context for lineage retrieval.
+ Elements without associated columns are skipped.
+ """
+ contexts: list[LineageContext] = []
  for element in elements:
- workbook_id = element["workbook_id"]
- element_id = element["elementId"]
- lineage = self._get(
- endpoint=SigmaEndpointFactory.lineage(workbook_id, element_id)
+ if element.get("columns") is None:
+ continue
+
+ context = LineageContext(
+ workbook_id=element["workbook_id"],
+ element_id=element["elementId"],
  )
+ contexts.append(context)
+ return contexts
+
+ def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+ lineage_context = self._lineage_context(elements)
+
+ with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+ results = executor.map(self._get_lineage, lineage_context)
+
+ for lineage in results:
  yield {
- **lineage,
- "workbook_id": workbook_id,
- "element_id": element_id,
+ **lineage.lineage,
+ "workbook_id": lineage.context.workbook_id,
+ "element_id": lineage.context.element_id,
  }
 
  def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:
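The Sigma hunks above replace the sequential per-element lineage calls with a thread pool capped at 10 workers plus a retry on ConnectionError. A minimal standalone sketch of the same fan-out pattern, using a hypothetical fetch function and a placeholder URL rather than the real Sigma endpoint factory:

    from concurrent.futures import ThreadPoolExecutor

    import requests

    _THREADS_LINEAGE = 10  # mirrors the cap chosen in the diff to stay under the rate limit


    def fetch_lineage(element: dict) -> dict:
        # hypothetical endpoint layout, for illustration only
        url = (
            f"https://api.example.com/v2/workbooks/{element['workbook_id']}"
            f"/lineage/elements/{element['element_id']}"
        )
        response = requests.get(url, timeout=300)
        response.raise_for_status()
        # keep the ids next to the payload, as the new _get_all_lineages does
        return {
            **response.json(),
            "workbook_id": element["workbook_id"],
            "element_id": element["element_id"],
        }


    def fetch_all_lineages(elements: list[dict]) -> list[dict]:
        # executor.map bounds concurrency and preserves the input order
        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
            return list(executor.map(fetch_lineage, elements))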
@@ -4,6 +4,8 @@ from ...types import ExternalAsset
  class ThoughtspotAsset(ExternalAsset):
  """Thoughtspot assets"""
 
+ ANSWERS = "answers"
+ ANSWER_USAGES = "answer_usages"
  LIVEBOARDS = "liveboards"
+ LIVEBOARD_USAGES = "liveboard_usages"
  LOGICAL_TABLES = "logical_tables"
- USAGES = "usages"
@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
  "Content-Type": "application/json",
  }
  _METADATA_BATCH_SIZE = 100
- _USAGE_LIVEBOARD_ID = "bea79810-145f-4ad0-a02c-4177a6e7d861"
+ # https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+ _OBJECT_USAGE_LIVEBOARD = "Object Usage"
+ _ANSWER_USAGE_VIZ = "Answer Usage, by User"
+ # https://docs.thoughtspot.com/cloud/latest/user-adoption
+ _USER_ADOPTION_LIVEBOARD = "User Adoption"
+ _LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
  # By default, no errors are ignored for the moment
  THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 
@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
  def _metadata_search(
  self,
  metadata_type: str,
+ identifier: Optional[str] = None,
  ) -> Iterator[dict]:
+ """
+ Yields assets of the given asset type, and optionally filters on a
+ specific identifier.
+ """
  offset = 0
+
  while True:
+ search_filters = {
+ "metadata": [{"type": metadata_type}],
+ "include_details": True,
+ "record_size": _METADATA_BATCH_SIZE,
+ "record_offset": offset,
+ }
+ if identifier:
+ search_filters["metadata"] = {
+ "identifier": identifier,
+ "type": metadata_type,
+ }
+
  metadata = self._post(
  ThoughtspotEndpointFactory.metadata_search(),
- data={
- "metadata": [{"type": metadata_type}],
- "include_details": True,
- "record_size": _METADATA_BATCH_SIZE,
- "record_offset": offset,
- },
+ data=search_filters,
  )
  yield from metadata
  if len(metadata) < _METADATA_BATCH_SIZE:
  break
  offset = offset + _METADATA_BATCH_SIZE
 
+ def _get_all_answers(self) -> Iterator[dict]:
+ yield from self._metadata_search(metadata_type="ANSWER")
+
  def _get_all_liveboards(self) -> Iterator[dict]:
  yield from self._metadata_search(metadata_type="LIVEBOARD")
 
@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
  def _get_all_tables(self) -> Iterator[dict]:
  yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
- def _get_liveboards_usages(self) -> Iterator[dict]:
+ def _get_usages(
+ self,
+ liveboard_name: str,
+ visualization_name: str,
+ ) -> Iterator[dict]:
+ """
+ Yields the data of a given visualization in the given liveboard.
+ ThoughtSpot maintains two system liveboards with stats about data usage,
+ which are useful to compute view counts and popularity.
+ """
+ usage_liveboard = next(
+ self._metadata_search(
+ metadata_type="LIVEBOARD", identifier=liveboard_name
+ )
+ )
+ liveboard_id = usage_liveboard["metadata_id"]
+
  data = self._post(
  endpoint=ThoughtspotEndpointFactory.liveboard(),
  headers={"Accept": "application/octet-stream"},
  data={
- "metadata_identifier": _USAGE_LIVEBOARD_ID,
+ "metadata_identifier": liveboard_id,
  "file_format": "CSV",
- "visualization_identifiers": [
- "Popular Liveboards Last 30 Days"
- ],
+ "visualization_identifiers": [visualization_name],
  },
  handler=lambda x: x.text,
  )
  yield from usage_liveboard_reader(data)
 
- def fetch(self, asset: ThoughtspotAsset):
+ def _get_answer_usages(self) -> Iterator[dict]:
+ return self._get_usages(
+ liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+ visualization_name=_ANSWER_USAGE_VIZ,
+ )
+
+ def _get_liveboards_usages(self) -> Iterator[dict]:
+ return self._get_usages(
+ liveboard_name=_USER_ADOPTION_LIVEBOARD,
+ visualization_name=_LIVEBOARD_USAGE_VIZ,
+ )
+
+ def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+ if asset == ThoughtspotAsset.ANSWERS:
+ yield from self._get_all_answers()
+
+ if asset == ThoughtspotAsset.ANSWER_USAGES:
+ yield from self._get_answer_usages()
+
  if asset == ThoughtspotAsset.LIVEBOARDS:
  yield from self._get_all_liveboards()
 
- if asset == ThoughtspotAsset.USAGES:
+ if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
  yield from self._get_liveboards_usages()
 
  if asset == ThoughtspotAsset.LOGICAL_TABLES:
@@ -1,13 +1,17 @@
  import csv
+ import re
  from collections.abc import Iterator
  from io import StringIO
 
+ _END_OF_GENERATED_TEXT = r'^""$'
+
 
  def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
  """
  Converts a CSV string into an iterator of dictionaries after
- ignoring the first 6 lines, using the 7th line as the header.
- First 6 lines looks like the following:
+ ignoring the generated text that precedes the actual CSV header row.
+ The generated block ends with a row containing only two double quotes.
+ Here is an example:
 
  "Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
@@ -15,11 +19,13 @@ def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
  "Pinboard NOT IN [mlm - availability pinboard,null]"
  "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
  "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+ ""
 
  """
  csv_file = StringIO(usage_liveboard_csv)
 
- for _ in range(7):
- next(csv_file)
+ line = next(csv_file)
+ while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
+ line = next(csv_file)
 
  yield from csv.DictReader(csv_file)
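The reader above no longer skips a fixed seven lines; it consumes the generated preamble until it reaches the row that contains only two double quotes, then hands the remaining lines to csv.DictReader. A self-contained sketch of that parsing approach (the sample string below is made up):

    import csv
    import re
    from io import StringIO

    _END_OF_GENERATED_TEXT = r'^""$'


    def read_usage_csv(raw: str) -> list[dict]:
        buffer = StringIO(raw)
        # consume the generated block; it ends with a line holding only ""
        line = next(buffer)
        while not re.match(_END_OF_GENERATED_TEXT, line.strip()):
            line = next(buffer)
        # whatever is left is a regular CSV with a header row
        return list(csv.DictReader(buffer))


    sample = (
        '"Data extract produced by Castor on 01/07/2025 16:07"\n'
        '""\n'
        '"Answer name","Count of object interactions"\n'
        '"toto","666"\n'
    )
    assert read_usage_csv(sample) == [
        {"Answer name": "toto", "Count of object interactions": "666"}
    ]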
@@ -2,7 +2,7 @@ from .utils import (
  usage_liveboard_reader,
  )
 
- VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+ VALID_CSV_1 = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
  "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
  "Pinboard NOT IN [mlm - availability pinboard,null]"
@@ -16,6 +16,13 @@ VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "September test","25","2"'''
 
 
+ VALID_CSV_2 = '''"Data extract produced by Castor on 01/07/2025 16:07"
+ "Filters applied on data :"
+ "Timestamp >= 20241208 00:00:00 < 20250107 00:00:00"
+ ""
+ "Answer name","User name","Number of unique users","Count of object interactions"
+ "toto","tata","1","666"'''
+
  # Invalid CSV input (missing data rows)
  INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
  "Filters applied on data :"
@@ -27,7 +34,7 @@ INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
 
 
  def test_usage_liveboard_reader():
- expected_output = [
+ expected_output_1 = [
  {
  "Pinboard": "Market Report",
  "Pinboard Views": "559",
@@ -49,9 +56,20 @@ def test_usage_liveboard_reader():
  "Unique Number of User": "2",
  },
  ]
+ expected_output_2 = [
+ {
+ "Answer name": "toto",
+ "User name": "tata",
+ "Number of unique users": "1",
+ "Count of object interactions": "666",
+ }
+ ]
+
+ result = list(usage_liveboard_reader(VALID_CSV_1))
+ assert result == expected_output_1
 
- result = list(usage_liveboard_reader(VALID_CSV))
- assert result == expected_output
+ result = list(usage_liveboard_reader(VALID_CSV_2))
+ assert result == expected_output_2
 
  result = list(usage_liveboard_reader(INVALID_CSV))
  assert result == [] # Expect an empty result since there is no data
@@ -1,8 +1,6 @@
  import logging
- from collections.abc import Iterator
  from functools import partial
- from http import HTTPStatus
- from typing import Optional
+ from typing import Iterator, Optional
 
  import requests
 
@@ -14,16 +12,14 @@ from ...utils import (
  fetch_all_pages,
  handle_response,
  retry,
- retry_request,
  safe_mode,
  )
  from ..abstract import TimeFilter
  from .credentials import DatabricksCredentials
  from .endpoints import DatabricksEndpointFactory
  from .format import DatabricksFormatter, TagMapping
- from .lineage import single_column_lineage_links, single_table_lineage_links
  from .pagination import DATABRICKS_PAGE_SIZE, DatabricksPagination
- from .types import TablesColumns, TimestampedLink
+ from .types import TablesColumns
  from .utils import hourly_time_filters
 
  logger = logging.getLogger(__name__)
@@ -132,60 +128,6 @@ class DatabricksAPIClient(APIClient):
  column_tags=column_tags,
  )
 
- @safe_mode(safe_lineage_params, lambda: [])
- @retry(
- exceptions=_RETRY_EXCEPTIONS,
- max_retries=_RETRY_ATTEMPTS,
- base_ms=_RETRY_BASE_MS,
- )
- @retry_request(
- status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
- max_retries=_RETRY_ATTEMPTS,
- )
- def get_single_column_lineage(
- self,
- names: tuple[str, str],
- ) -> list[TimestampedLink]:
- """
- Helper function used in get_lineage_links.
- Call data lineage API and return the content of the result
-
- eg table_path: broward_prd.bronze.account_adjustments
- FYI: Maximum rate of 10 requests per SECOND
- """
- table_path, column_name = names
- payload = {
- "table_name": table_path,
- "column_name": column_name,
- "include_entity_lineage": True,
- }
- content = self._get(
- DatabricksEndpointFactory.column_lineage(), params=payload
- )
- column_path = f"{table_path}.{column_name}"
- return single_column_lineage_links(column_path, content)
-
- @safe_mode(safe_lineage_params, lambda: [])
- @retry(
- exceptions=_RETRY_EXCEPTIONS,
- max_retries=_RETRY_ATTEMPTS,
- base_ms=_RETRY_BASE_MS,
- )
- def get_single_table_lineage(
- self, table_path: str
- ) -> list[TimestampedLink]:
- """
- Helper function used in get_lineage_links.
- Call data lineage API and return the content of the result
- eg table_path: broward_prd.bronze.account_adjustments
- FYI: Maximum rate of 50 requests per SECOND
- """
- payload = {"table_name": table_path, "include_entity_lineage": True}
- content = self._get(
- DatabricksEndpointFactory.table_lineage(), params=payload
- )
- return single_table_lineage_links(table_path, content)
-
  @safe_mode(safe_query_params, lambda: [])
  @retry(
  exceptions=_RETRY_EXCEPTIONS,
@@ -1,17 +1,14 @@
  import logging
- from concurrent.futures import ThreadPoolExecutor
  from typing import Optional
 
- from ...utils import (
- mapping_from_rows,
- )
+ from ...utils import mapping_from_rows
  from ..abstract import TimeFilter
  from .api_client import DatabricksAPIClient
  from .credentials import DatabricksCredentials
+ from .enums import TagEntity
  from .format import DatabricksFormatter
- from .lineage import deduplicate_lineage, paths_for_column_lineage
- from .sql_client import DatabricksSQLClient, TagEntity
- from .types import TablesColumns, TimestampedLink
+ from .sql_client import DatabricksSQLClient
+ from .types import TablesColumns
 
  logger = logging.getLogger(__name__)
 
@@ -95,46 +92,6 @@ class DatabricksClient:
  columns.extend(c_to_add)
  return tables, columns
 
- def table_lineage(self, tables: list[dict]) -> list[dict]:
- """
- Wrapper function that retrieves all table lineage
- """
- # retrieve table lineage
- with ThreadPoolExecutor(max_workers=_THREADS_TABLE_LINEAGE) as executor:
- table_paths = [
- ".".join([table["schema_id"], table["table_name"]])
- for table in tables
- ]
- results = executor.map(
- self.api_client.get_single_table_lineage, table_paths
- )
- lineages = [link for links in results for link in links]
- deduplicated = deduplicate_lineage(lineages)
- return self.formatter.format_lineage(deduplicated)
-
- def column_lineage(
- self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
- ) -> list[dict]:
- """
- Wrapper function that retrieves all column lineage
- we only try to retrieve column lineage if we found table lineage
- """
- candidate_paths = paths_for_column_lineage(
- tables, columns, table_lineage
- )
- # retrieve column lineage
- with ThreadPoolExecutor(
- max_workers=_THREADS_COLUMN_LINEAGE
- ) as executor:
- results = executor.map(
- self.api_client.get_single_column_lineage, candidate_paths
- )
- lineages: list[TimestampedLink] = [
- link for links in results for link in links
- ]
- deduplicated = deduplicate_lineage(lineages)
- return self.formatter.format_lineage(deduplicated)
-
  def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
  return self.api_client.queries(time_filter)
 
@@ -1,14 +1,4 @@
- from unittest.mock import Mock, patch
-
- from .client import (
- DatabricksClient,
- )
- from .test_constants import (
- CLOSER_DATE,
- MOCK_TABLES_FOR_TABLE_LINEAGE,
- OLDER_DATE,
- TABLE_LINEAGE_SIDE_EFFECT,
- )
+ from .client import DatabricksClient
 
 
  class MockDatabricksClient(DatabricksClient):
@@ -48,27 +38,3 @@ def test_DatabricksClient__match_table_with_user():
  table_without_owner = {"id": 1, "owner_email": None}
  actual = client._match_table_with_user(table_without_owner, user_mapping)
  assert actual == table_without_owner
-
-
- @patch(
- "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksAPIClient._get",
- side_effect=TABLE_LINEAGE_SIDE_EFFECT,
- )
- def test_DatabricksClient_table_lineage(mock_get):
- client = DatabricksClient(Mock())
-
- lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
- assert len(lineage) == 2
-
- expected_link_1 = {
- "parent_path": "dev.silver.pre_analytics",
- "child_path": "dev.silver.analytics",
- "timestamp": OLDER_DATE,
- }
- expected_link_2 = {
- "parent_path": "dev.bronze.analytics",
- "child_path": "dev.silver.analytics",
- "timestamp": CLOSER_DATE,
- }
- assert expected_link_1 in lineage
- assert expected_link_2 in lineage
@@ -1,24 +1,22 @@
  from dataclasses import field
- from typing import Optional
 
- from pydantic.dataclasses import dataclass
- from pydantic_settings import SettingsConfigDict
+ from pydantic_settings import BaseSettings, SettingsConfigDict
 
  DATABRICKS_ENV_PREFIX = "CASTOR_DATABRICKS_"
 
 
- @dataclass
- class DatabricksCredentials:
+ class DatabricksCredentials(BaseSettings):
  """
  Credentials needed by Databricks client
  Requires:
  - host
+ - http_path
  - token
  """
 
  host: str
+ http_path: str
  token: str = field(metadata={"sensitive": True})
- http_path: Optional[str] = field(default=None)
 
  model_config = SettingsConfigDict(
  env_prefix=DATABRICKS_ENV_PREFIX,
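DatabricksCredentials now subclasses BaseSettings and makes http_path required alongside host and token. With pydantic-settings, the env_prefix above means the values can be picked up straight from CASTOR_DATABRICKS_* environment variables; a rough sketch under that assumption, using a stand-in class name and dummy values:

    import os

    from pydantic_settings import BaseSettings, SettingsConfigDict


    class ExampleCredentials(BaseSettings):
        # stand-in for the class above, without the sensitive-field metadata
        host: str
        http_path: str
        token: str

        model_config = SettingsConfigDict(env_prefix="CASTOR_DATABRICKS_")


    # dummy values, for illustration only
    os.environ["CASTOR_DATABRICKS_HOST"] = "adb-123.azuredatabricks.net"
    os.environ["CASTOR_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/abc"
    os.environ["CASTOR_DATABRICKS_TOKEN"] = "dapi-dummy"

    credentials = ExampleCredentials()
    assert credentials.http_path == "/sql/1.0/warehouses/abc"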
@@ -0,0 +1,15 @@
+ from enum import Enum
+
+
+ class LineageEntity(Enum):
+ """Entities that can be linked in Databricks lineage"""
+
+ COLUMN = "COLUMN"
+ TABLE = "TABLE"
+
+
+ class TagEntity(Enum):
+ """Entities that can be tagged in Databricks"""
+
+ COLUMN = "COLUMN"
+ TABLE = "TABLE"
@@ -1,4 +1,5 @@
  import logging
+ from datetime import date
  from typing import Optional
 
  from ...utils import AbstractStorage, LocalStorage, write_summary
@@ -16,6 +17,7 @@ from ..abstract import (
  )
  from .client import DatabricksClient
  from .credentials import DatabricksCredentials
+ from .enums import LineageEntity
 
  DATABRICKS_ASSETS: SupportedAssets = {
  WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
@@ -32,6 +34,12 @@ OTimeFilter = Optional[TimeFilter]
  Paths = dict[str, str]
 
 
+ def _day(time_filter: OTimeFilter) -> date:
+ if not time_filter:
+ return TimeFilter.default().day
+ return time_filter.day
+
+
  class DatabricksExtractionProcessor:
  """Databricks' API-based extraction management"""
 
@@ -96,22 +104,18 @@ class DatabricksExtractionProcessor:
  logger.info(f"Extracted {len(columns)} columns to {location}")
  return catalog_locations
 
- def extract_lineage(self) -> Paths:
+ def extract_lineage(self, time_filter: OTimeFilter = None) -> Paths:
  if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
  return self._existing_group_paths(
  WarehouseAssetGroup.ADDITIONAL_LINEAGE
  )
  lineage_locations: dict[str, str] = dict()
 
- # extract catalog
- databases = self._client.databases()
- schemas = self._client.schemas(databases)
- users = self._client.users()
- tables, columns = self._client.tables_and_columns(schemas, users)
- logger.info("Extracted pre-requisite catalog. Next comes lineage")
+ day = _day(time_filter)
+ client = self._client.sql_client
 
  # extract table lineage
- table_lineage = self._client.table_lineage(tables)
+ table_lineage = client.get_lineage(LineageEntity.TABLE, day)
  table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
  location = self._storage.put(table_lineage_key, table_lineage)
  lineage_locations[table_lineage_key] = location
@@ -119,9 +123,7 @@ class DatabricksExtractionProcessor:
  logger.info(msg)
 
  # extract column lineage
- column_lineage = self._client.column_lineage(
- tables, columns, table_lineage
- )
+ column_lineage = client.get_lineage(LineageEntity.COLUMN, day)
  column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
  location = self._storage.put(column_lineage_key, column_lineage)
  lineage_locations[column_lineage_key] = location
@@ -1,141 +1,69 @@
- from typing import cast
+ from typing import Iterable, Optional
 
- from .types import Link, Ostr, OTimestampedLink, TimestampedLink
+ from .enums import LineageEntity
 
 
- class LineageLinks:
+ class LineageProcessor:
  """
  helper class that handles lineage deduplication and filtering
  """
 
- def __init__(self):
- self.lineage: dict[Link, Ostr] = dict()
+ def __init__(self, lineage_entity: LineageEntity):
+ self.lineage_entity = lineage_entity
 
- def add(self, timestamped_link: TimestampedLink) -> None:
- """
- keep the most recent lineage link, adding to `self.lineage`
- """
- parent, child, timestamp = timestamped_link
- link = (parent, child)
- if not self.lineage.get(link):
- self.lineage[link] = timestamp
- return
-
- if not timestamp:
- return
- # keep most recent link; cast for mypy
- recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
- self.lineage[link] = recent
+ self.lineage: dict[tuple[str, str], dict] = dict()
 
+ def _parent_path(self, link) -> Optional[str]:
+ if self.lineage_entity == LineageEntity.TABLE:
+ return link["source_table_full_name"]
 
- def _to_table_path(table: dict) -> Ostr:
- if table.get("name"):
- return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
- return None
+ source_table = link["source_table_full_name"]
+ source_column = link["source_column_name"]
+ if not (source_table and source_column):
+ return None
 
+ return f"{source_table}.{source_column}"
 
- def _to_column_path(column: dict) -> Ostr:
- if column.get("name"):
- return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
- return None
+ def _child_path(self, link) -> Optional[str]:
+ if self.lineage_entity == LineageEntity.TABLE:
+ return link["target_table_full_name"]
 
+ target_table = link["target_table_full_name"]
+ target_column = link["target_column_name"]
+ if not (target_table and target_column):
+ return None
 
- def _link(path_from: Ostr, path_to: Ostr, timestamp: Ostr) -> OTimestampedLink:
- """exclude missing path and self-lineage"""
- if (not path_from) or (not path_to):
- return None
- is_self_lineage = path_from.lower() == path_to.lower()
- if is_self_lineage:
- return None
- return path_from, path_to, timestamp
+ return f"{target_table}.{target_column}"
 
+ def add(self, link: dict) -> None:
+ """
+ If the parent and child paths are valid, keeps the most recent lineage
+ link in the `self.lineage` map.
+ """
+ parent = self._parent_path(link)
+ child = self._child_path(link)
+ timestamp = link["event_time"]
 
- def single_table_lineage_links(
- table_path: str, single_table_lineage: dict
- ) -> list[TimestampedLink]:
- """
- process databricks lineage API response for a given table
- returns a list of (parent, child, timestamp)
-
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
- we could also have `notebookInfos` or `fileInfo`
- """
- links: list[OTimestampedLink] = []
- # add parent:
- for link in single_table_lineage.get("upstreams", []):
- parent = link.get("tableInfo", {})
- parent_path = _to_table_path(parent)
- timestamp: Ostr = parent.get("lineage_timestamp")
- links.append(_link(parent_path, table_path, timestamp))
-
- # add children:
- for link in single_table_lineage.get("downstreams", []):
- child = link.get("tableInfo", {})
- child_path = _to_table_path(child)
- timestamp = child.get("lineage_timestamp")
- links.append(_link(table_path, child_path, timestamp))
-
- return list(filter(None, links))
-
-
- def single_column_lineage_links(
- column_path: str, single_column_lineage: dict
- ) -> list[TimestampedLink]:
- """
- process databricks lineage API response for a given table
- returns a list of (parent, child, timestamp)
-
- Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
- we could also have `notebookInfos` or `fileInfo`
- """
- links: list[OTimestampedLink] = []
- # add parent:
- for link in single_column_lineage.get("upstream_cols", []):
- parent_path = _to_column_path(link)
- timestamp: Ostr = link.get("lineage_timestamp")
- links.append(_link(parent_path, column_path, timestamp))
+ if not (parent and child and parent != child):
+ return
 
- # add children:
- for link in single_column_lineage.get("downstream_cols", []):
- child_path = _to_column_path(link)
- timestamp = link.get("lineage_timestamp")
- links.append(_link(column_path, child_path, timestamp))
+ key = (parent, child)
+ if key in self.lineage and self.lineage[key]["event_time"] > timestamp:
+ return
 
- return list(filter(None, links))
+ self.lineage[key] = link
 
 
- def paths_for_column_lineage(
- tables: list[dict], columns: list[dict], table_lineage: list[dict]
- ) -> list[tuple[str, str]]:
+ def valid_lineage(
+ lineage: Iterable[dict], lineage_entity: LineageEntity
+ ) -> list[dict]:
  """
- helper providing a list of candidate columns to look lineage for:
- we only look for column lineage where there is table lineage
+ Filters out self-lineage or lineage with a missing source or target path,
+ then deduplicates by picking the link with the most recent event timestamp.
  """
- # mapping between table id and its path db.schema.table
- # table["schema_id"] follows the pattern `db.schema`
- mapping = {
- table["id"]: ".".join([table["schema_id"], table["table_name"]])
- for table in tables
- }
-
- tables_with_lineage: set[str] = set()
- for t in table_lineage:
- tables_with_lineage.add(t["parent_path"])
- tables_with_lineage.add(t["child_path"])
-
- paths_to_return: list[tuple[str, str]] = []
- for column in columns:
- table_path = mapping[column["table_id"]]
- if table_path not in tables_with_lineage:
- continue
- column_ = (table_path, column["column_name"])
- paths_to_return.append(column_)
-
- return paths_to_return
-
-
- def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
- deduplicated_lineage = LineageLinks()
- for timestamped_link in lineages:
- deduplicated_lineage.add(timestamped_link)
- return deduplicated_lineage.lineage
+ deduplicated_lineage = LineageProcessor(lineage_entity)
+
+ for link in lineage:
+ deduplicated_lineage.add(link)
+
+ return list(deduplicated_lineage.lineage.values())
@@ -1,34 +1,89 @@
- from .lineage import LineageLinks
- from .test_constants import (
- CLOSER_DATE,
- OLDER_DATE,
- )
+ from .enums import LineageEntity
+ from .lineage import LineageProcessor, valid_lineage
+
+ _OLDER_DATE = "2025-01-01 00:00:01.0"
+ _CLOSER_DATE = "2025-01-01 02:02:02.0"
+
+ _TABLE_LINEAGES = [
+ {
+ "source_table_full_name": "a.b.source",
+ "target_table_full_name": "a.b.target",
+ "event_time": _CLOSER_DATE,
+ "other": "more recent stuff",
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "target_table_full_name": "a.b.target",
+ "event_time": _OLDER_DATE,
+ "other": "stuff that's too old",
+ },
+ {
+ "source_table_full_name": "no target",
+ "target_table_full_name": None,
+ "event_time": _CLOSER_DATE,
+ },
+ {
+ "source_table_full_name": None,
+ "target_table_full_name": "no source",
+ "event_time": _CLOSER_DATE,
+ },
+ ]
+
+
+ _COLUMN_LINEAGES = [
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "src_col",
+ "target_table_full_name": "a.b.target",
+ "target_column_name": "trgt_col",
+ "event_time": _OLDER_DATE,
+ "other": "old stuff",
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "src_col",
+ "target_table_full_name": "a.b.target",
+ "target_column_name": "trgt_col",
+ "event_time": _CLOSER_DATE,
+ "other": "newer stuff",
+ },
+ {
+ "source_table_full_name": "a.b.toto",
+ "source_column_name": "toto_col",
+ "target_table_full_name": "a.b.tata",
+ "target_column_name": "tata_col",
+ "event_time": _OLDER_DATE,
+ },
+ {
+ "source_table_full_name": "a.b.source",
+ "source_column_name": "a.b.source",
+ "target_table_full_name": None,
+ "target_column_name": None,
+ "event_time": _CLOSER_DATE,
+ },
+ ]
+
+
+ def test_valid_lineage():
+ table_links = valid_lineage(_TABLE_LINEAGES, LineageEntity.TABLE)
+
+ assert len(table_links) == 1
+ assert table_links[0]["source_table_full_name"] == "a.b.source"
+ assert table_links[0]["target_table_full_name"] == "a.b.target"
+ assert table_links[0]["event_time"] == _CLOSER_DATE
+ assert table_links[0]["other"] == "more recent stuff"
 
 
  def test_LineageLinks_add():
- links = LineageLinks()
- timestamped_link = ("parent", "child", None)
- expected_key = ("parent", "child")
-
- links.add(timestamped_link)
-
- assert expected_key in links.lineage
- assert links.lineage[expected_key] is None
-
- # we replace None by an actual timestamp
- timestamped_link = ("parent", "child", OLDER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == OLDER_DATE
-
- # we update with the more recent timestamp
- timestamped_link = ("parent", "child", CLOSER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == CLOSER_DATE
-
- # we keep the more recent timestamp
- timestamped_link = ("parent", "child", OLDER_DATE)
- links.add(timestamped_link)
- assert expected_key in links.lineage
- assert links.lineage[expected_key] == CLOSER_DATE
+ deduplicated_lineage = LineageProcessor(LineageEntity.COLUMN)
+ for link in _COLUMN_LINEAGES:
+ deduplicated_lineage.add(link)
+
+ lineage = deduplicated_lineage.lineage
+ assert len(lineage) == 2
+ assert ("a.b.source.src_col", "a.b.target.trgt_col") in lineage
+ assert ("a.b.toto.toto_col", "a.b.tata.tata_col") in lineage
+ assert (
+ lineage[("a.b.source.src_col", "a.b.target.trgt_col")]["other"]
+ == "newer stuff"
+ )
@@ -1,24 +1,24 @@
  import logging
  from collections import defaultdict
- from enum import Enum
+ from datetime import date
  from typing import Optional
 
  from databricks import sql # type: ignore
 
  from .credentials import DatabricksCredentials
+ from .enums import LineageEntity, TagEntity
  from .format import TagMapping
+ from .lineage import valid_lineage
  from .utils import build_path, tag_label
 
  logger = logging.getLogger(__name__)
 
  _INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
 
-
- class TagEntity(Enum):
- """Entities that can be tagged in Databricks"""
-
- COLUMN = "COLUMN"
- TABLE = "TABLE"
+ _LINEAGE_SQL_TPL = """
+ SELECT * FROM system.access.{table_name}
+ WHERE event_date = :day
+ """
 
 
  class DatabricksSQLClient:
@@ -71,7 +71,6 @@ class DatabricksSQLClient:
  https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
  """
  if not self._needs_extraction(entity):
- # extracting tags require additional credentials (http_path)
  return dict()
 
  table = f"{entity.value.lower()}_tags"
@@ -88,3 +87,19 @@ class DatabricksSQLClient:
  mapping[path].append(label)
 
  return mapping
+
+ def get_lineage(
+ self, lineage_entity: LineageEntity, day: date
+ ) -> list[dict]:
+ """
+ Fetch {TABLE|COLUMN} lineage of the given day, via system tables
+ https://docs.databricks.com/en/admin/system-tables/lineage.html
+ """
+ table_name = f"{lineage_entity.value.lower()}_lineage"
+ query = _LINEAGE_SQL_TPL.format(table_name=table_name)
+ params = {"day": day}
+ result = self.execute_sql(query, params)
+ data = []
+ for row in result:
+ data.append(row.asDict())
+ return valid_lineage(data, lineage_entity)
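get_lineage above formats the system table name from the entity (table_lineage or column_lineage under system.access), filters on a single event_date with a named :day parameter, and runs the rows through valid_lineage. A rough sketch of the same query issued directly with the databricks-sql-connector, assuming its native :name parameter binding; the hostname, http_path, token and date are placeholders:

    from datetime import date

    from databricks import sql  # databricks-sql-connector

    _LINEAGE_SQL = """
    SELECT * FROM system.access.table_lineage
    WHERE event_date = :day
    """

    # placeholder connection settings
    connection = sql.connect(
        server_hostname="adb-123.azuredatabricks.net",
        http_path="/sql/1.0/warehouses/abc",
        access_token="dapi-dummy",
    )
    cursor = connection.cursor()
    cursor.execute(_LINEAGE_SQL, {"day": date(2025, 1, 8)})
    rows = [row.asDict() for row in cursor.fetchall()]
    cursor.close()
    connection.close()
    # rows would then go through valid_lineage(rows, LineageEntity.TABLE)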
@@ -1,8 +1 @@
- from typing import Optional
-
- Link = tuple[str, str]
  TablesColumns = tuple[list[dict], list[dict]]
- Ostr = Optional[str]
- TimestampedLink = tuple[str, str, Ostr]
-
- OTimestampedLink = Optional[TimestampedLink]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.22.1
+ Version: 0.22.5
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
@@ -207,6 +207,22 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
 
  # Changelog
 
+ ## 0.22.5 - 2025-01-09
+
+ * Databricks: validate and deduplicate lineage links
+
+ ## 0.22.4 - 2025-01-08
+
+ * ThoughtSpot: extract answers
+
+ ## 0.22.3 - 2024-12-10
+
+ * Databricks: extract lineage from system tables
+
+ ## 0.22.2 - 2024-12-06
+
+ * Sigma: multithreading to retrieve lineage
+
  ## 0.22.1 - 2024-12-05
 
  * Salesforce: deduplicate tables
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=p1jUz1AWTVMfmt6dwNvWxUSloLrkhHoWRxpT2RU1Hcc,15058
+ CHANGELOG.md,sha256=JzTJEZxIMP9F_aePVfIvqLt0OuG0jYcDygsLyfTAV84,15335
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -252,7 +252,7 @@ castor_extractor/visualization/salesforce_reporting/extract.py,sha256=ScStilebLG
  castor_extractor/visualization/sigma/__init__.py,sha256=GINql4yJLtjfOJgjHaWNpE13cMtnKNytiFRomwav27Q,114
  castor_extractor/visualization/sigma/assets.py,sha256=JZ1Cpxnml8P3mIJoTUM57hvylB18ErECQXaP5FF63O4,268
  castor_extractor/visualization/sigma/client/__init__.py,sha256=YQv06FBBQHvBMFg_tN0nUcmUp2NCL2s-eFTXG8rXaBg,74
- castor_extractor/visualization/sigma/client/client.py,sha256=nT61lN2yRpKd6jeqwR0NVOAUVpA5KAQyHkEGTl7n00A,6283
+ castor_extractor/visualization/sigma/client/client.py,sha256=d9CpE7vRZAPGzck0jFn37LY_6E_Njz9D1sCnFVGJSWk,8006
  castor_extractor/visualization/sigma/client/credentials.py,sha256=XddAuQSmCKpxJ70TQgRnOj0vMPYVtiStk_lMMQ1AiNM,693
  castor_extractor/visualization/sigma/client/endpoints.py,sha256=DBFphbgoH78_MZUGM_bKBAq28Nl7LWSZ6VRsbxrxtDg,1162
  castor_extractor/visualization/sigma/client/pagination.py,sha256=kNEhNq08tTGbypyMjxs0w4uvDtQc_iaWpOZweaa_FsU,690
@@ -306,13 +306,13 @@ castor_extractor/visualization/tableau_revamp/client/rest_fields.py,sha256=3kvaq
  castor_extractor/visualization/tableau_revamp/constants.py,sha256=lHGB50FgVNO2nXeIhkvQKivD8ZFBIjDrflgD5cTXKJw,104
  castor_extractor/visualization/tableau_revamp/extract.py,sha256=HqnBypuNGx_xKk-68WEOy_ucD15LuRF4t2xXf0XKPE0,1370
  castor_extractor/visualization/thoughtspot/__init__.py,sha256=NhTGUk5Kdt54oCjHYoAt0cLBmVLys5lFYiRANL6wCmI,150
- castor_extractor/visualization/thoughtspot/assets.py,sha256=lPRvXk0PKybgLv1AcDVxg-ssf4XLTs0biRqLrqC2TzU,196
+ castor_extractor/visualization/thoughtspot/assets.py,sha256=SAQWPKaD2NTSDg7-GSkcRSSEkKSws0MJfOVcHkdeTSg,276
  castor_extractor/visualization/thoughtspot/client/__init__.py,sha256=svrE2rMxR-OXctjPeAHMEPePlfcra-9KDevTMcHunAA,86
- castor_extractor/visualization/thoughtspot/client/client.py,sha256=RHOaJjvlWcSdASXzvlgMbmsSU9oTIixPhH8g0NgyIbc,3719
+ castor_extractor/visualization/thoughtspot/client/client.py,sha256=mtwMCPI1-1tyZb1gSYYr-O2QZMTFQwNgillU6ycsOU4,5552
  castor_extractor/visualization/thoughtspot/client/credentials.py,sha256=fp4YHiZy-dstWiLr5c4kFU9SyPK5rd2nCeh8k5sVRpM,462
  castor_extractor/visualization/thoughtspot/client/endpoints.py,sha256=u3FRkmG6j5OIMEeXWZcgRObP8JeC4EutIJEeitNV44c,330
- castor_extractor/visualization/thoughtspot/client/utils.py,sha256=ua7-10HKpFHYRDBVGLJ5hIEfuUA7ryIH9tl0sBjl0MU,883
- castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256=-5ZaEYpQSrIp1-Sx-ViQOLPlv2LoOajEs2mE5YNi_tU,1887
+ castor_extractor/visualization/thoughtspot/client/utils.py,sha256=3LgbIWoG1e39VW8rYaV4ot_0EFipziwf3rFAZKxrlEY,1072
+ castor_extractor/visualization/thoughtspot/client/utils_test.py,sha256=2XysRU7a58KA2JgNwU2j4GPrN0rkN7Gvk8kQCJlYXVk,2469
  castor_extractor/visualization/thoughtspot/extract.py,sha256=mcXS0jGFpa50td98AVbbTqxchyI5wDCpB-v1o5iRc3g,1354
  castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
@@ -340,21 +340,21 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
  castor_extractor/warehouse/bigquery/query.py,sha256=FEekxlkrfAXzsT8Kj1AIqYd5mURB5MlZIkbFVXVqEhU,4762
  castor_extractor/warehouse/bigquery/types.py,sha256=rfKkKA13Et7TM4I0uVaXkLfuaBXkv51bNTp4AO0QSdw,57
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
- castor_extractor/warehouse/databricks/api_client.py,sha256=1E3t8uCi3b8xVXLCodwlH5y8FIGmu9otORvA7ZqcGKE,8283
+ castor_extractor/warehouse/databricks/api_client.py,sha256=kLcUGSgrfybZUrpt0tE7qe2OoSSN7IK4myyB7c0czOY,6260
  castor_extractor/warehouse/databricks/api_client_test.py,sha256=YTWC-X7L-XAfK5b39TUgTmR1ifv0QrY5tvLNoSbpmjg,466
- castor_extractor/warehouse/databricks/client.py,sha256=K3RafGL_UerFAGmRKK2Cp2IXzalQYqkneQFvgsYdOZY,4993
- castor_extractor/warehouse/databricks/client_test.py,sha256=UKr_D3M8mhqV1oL2_3y_6pEzAFLVE3FHDNZh4omFLK4,2286
- castor_extractor/warehouse/databricks/credentials.py,sha256=iphbVynVTQXMEbJy4QaT5fer-GpOi7QtbAlg8R7-Lj4,598
+ castor_extractor/warehouse/databricks/client.py,sha256=H6vcKfos7op5AKSQF9qduG4afx-GZgBdyGE7waS6__o,3292
+ castor_extractor/warehouse/databricks/client_test.py,sha256=hOuSPh45z6m9T1hjuqpOayby_q8bYdJVdq5qiwkiXrg,1370
+ castor_extractor/warehouse/databricks/credentials.py,sha256=ExtVcl2NpMXTx1Lg8vHQdzQtSEm2aqpg3D1BJrNAUjI,528
  castor_extractor/warehouse/databricks/endpoints.py,sha256=qPoL9CtPFJdwVuW9rJ37nmeMd-nChOBouEVYb4SlaUE,670
- castor_extractor/warehouse/databricks/extract.py,sha256=G_-78-vrvEyn8rcKXXDXlxjad4Ot-Ko4vnhvEcOzjJQ,7389
+ castor_extractor/warehouse/databricks/enums.py,sha256=3T6BbVvbWvfWkD23krsYT1x0kKh1qRzNPl6WpcXe300,274
+ castor_extractor/warehouse/databricks/extract.py,sha256=Z4VTEIf0QMiua0QGAlJdQ86kxmGAXekQ304aCKme6IY,7358
  castor_extractor/warehouse/databricks/format.py,sha256=FUBMrFFWSa_lX5PtixJCDR3eRYycqeMw0oKHt7AkA4o,6732
  castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
- castor_extractor/warehouse/databricks/lineage.py,sha256=RUCcKz19R0dJVab6JUSUbGx4L5Vyb4sVoTAwLbfgjxo,4700
- castor_extractor/warehouse/databricks/lineage_test.py,sha256=EejO4qKH_kJlJSrIap6GvkUi9E55RFvfiySKazAh0_A,1048
+ castor_extractor/warehouse/databricks/lineage.py,sha256=jwiRXrgqBAtzQt5EgErYrN8YRyviEEHmyrSbw8TSPq4,2105
+ castor_extractor/warehouse/databricks/lineage_test.py,sha256=PyBn1eAoxLm4Bz5M0F4zmaxFX2mXRTM_uug5OKbQPQs,2684
  castor_extractor/warehouse/databricks/pagination.py,sha256=sM1G0sN1pf1TPpI0Y3Oew378UGEKVkMRc2Mlu9tDjLo,545
- castor_extractor/warehouse/databricks/sql_client.py,sha256=KBP0rmMQBWw3jshDfv_NpFW8HqPxGfcBkS4d9T9aXvE,2977
- castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
- castor_extractor/warehouse/databricks/types.py,sha256=-qO5y-uI95B666iDhyNM0TL8WlwYC-3Q4xZuolh3PwE,205
+ castor_extractor/warehouse/databricks/sql_client.py,sha256=5isGsRL0MW1lu_E_xTyCvSj_rwaJ2nh-kPlhvTvDy_w,3566
+ castor_extractor/warehouse/databricks/types.py,sha256=-TFX4jS6_c3wQLOpJTKpLeGS21YIPjKDjISnzeUPdCc,46
  castor_extractor/warehouse/databricks/utils.py,sha256=5CKn6Me1Tus97H_qDEz_5tkhd4ARmwk2qiC3GndjyCc,1969
  castor_extractor/warehouse/databricks/utils_test.py,sha256=_guTuzRWRTZdDY7ils0X1K8jhI9T877MEtw3x_YDg9I,2415
  castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
@@ -436,8 +436,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
  castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
- castor_extractor-0.22.1.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.22.1.dist-info/METADATA,sha256=52H1eJe_L62yUSWkBJYLbRanXS6OdauukGW0RfeNiS4,22075
- castor_extractor-0.22.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- castor_extractor-0.22.1.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
- castor_extractor-0.22.1.dist-info/RECORD,,
+ castor_extractor-0.22.5.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.22.5.dist-info/METADATA,sha256=11A9xI9Bd6Uu1Na_AJngfTbkt-ECXjsabWNTppaZsOk,22352
+ castor_extractor-0.22.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ castor_extractor-0.22.5.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
+ castor_extractor-0.22.5.dist-info/RECORD,,
@@ -1,79 +0,0 @@
- OLDER_DATE = "2024-04-18 20:20:20.0"
- CLOSER_DATE = "2024-04-19 20:20:20.0"
-
- MOCK_TABLES_FOR_TABLE_LINEAGE = [
- {
- "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
- "schema_id": "dev.silver",
- "table_name": "analytics",
- },
- {
- "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
- "schema_id": "dev.silver",
- "table_name": "pre_analytics",
- },
- {
- "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
- "schema_id": "dev.bronze",
- "table_name": "analytics",
- },
- ]
-
- _RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
- "upstreams": [
- { # there could be other keys: jobInfos, notebookInfos, queryInfos
- "tableInfo": {
- "name": "pre_analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "bronze",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": CLOSER_DATE,
- }
- },
- ],
- "downstreams": [],
- }
- _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
- "upstreams": [],
- "downstreams": [
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- ],
- }
- _RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
- "upstreams": [],
- "downstreams": [
- {
- "tableInfo": {
- "name": "analytics",
- "catalog_name": "dev",
- "schema_name": "silver",
- "table_type": "PERSISTED_VIEW", # not used
- "lineage_timestamp": OLDER_DATE,
- }
- },
- ],
- }
-
- # should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
- TABLE_LINEAGE_SIDE_EFFECT: tuple = (
- _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
- _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
- _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
- )