castor-extractor 0.16.11__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)
  1. CHANGELOG.md +20 -0
  2. castor_extractor/commands/upload.py +4 -4
  3. castor_extractor/uploader/constant.py +8 -1
  4. castor_extractor/uploader/upload.py +63 -46
  5. castor_extractor/uploader/upload_test.py +4 -3
  6. castor_extractor/uploader/utils.py +10 -0
  7. castor_extractor/utils/client/api.py +8 -3
  8. castor_extractor/utils/retry.py +3 -1
  9. castor_extractor/visualization/tableau_revamp/client/client.py +5 -2
  10. castor_extractor/visualization/tableau_revamp/client/gql_queries.py +10 -1
  11. castor_extractor/warehouse/abstract/__init__.py +2 -0
  12. castor_extractor/warehouse/abstract/asset.py +14 -0
  13. castor_extractor/warehouse/databricks/client.py +239 -3
  14. castor_extractor/warehouse/databricks/client_test.py +61 -1
  15. castor_extractor/warehouse/databricks/extract.py +36 -0
  16. castor_extractor/warehouse/databricks/format.py +13 -0
  17. castor_extractor/warehouse/databricks/test_constants.py +79 -0
  18. castor_extractor/warehouse/databricks/types.py +6 -1
  19. castor_extractor/warehouse/snowflake/extract.py +2 -0
  20. castor_extractor/warehouse/snowflake/queries/function.sql +10 -0
  21. {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/METADATA +1 -1
  22. {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/RECORD +25 -23
  23. {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/LICENCE +0 -0
  24. {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/WHEEL +0 -0
  25. {castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/entry_points.txt +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
  # Changelog
 
+ ## 0.17.0 - 2024-06-10
+
+ * Uploader: redirect to the proxy, replace credentials with token
+
+ ## 0.16.15 - 2024-06-07
+
+ * Tableau: extract database_name for CustomSQLTables
+
+ ## 0.16.14 - 2024-06-06
+
+ * Snowflake: Extract SQL user defined function
+
+ ## 0.16.13 - 2024-06-05
+
+ * Tableau: extract database_name for tables
+
+ ## 0.16.12 - 2024-06-04
+
+ * Databricks: Extract lineage
+
  ## 0.16.11 - 2024-06-03
 
  * Tableau: add extra fields to optimise storage
castor_extractor/commands/upload.py CHANGED
@@ -13,10 +13,10 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
  def _args():
      parser = argparse.ArgumentParser()
      parser.add_argument(
-         "-c",
-         "--credentials",
+         "-k",
+         "--token",
          required=True,
-         help="""Path to credentials or credentials as string""",
+         help="""API token provided by Castor""",
      )
      parser.add_argument(
          "-s",
@@ -44,7 +44,7 @@ def _args():
      )
      parsed = parser.parse_args()
      return {
-         "credentials": parsed.credentials,
+         "token": parsed.token,
          "source_id": parsed.source_id,
          "file_path": parsed.file_path,
          "directory_path": parsed.directory_path,
castor_extractor/uploader/constant.py CHANGED
@@ -1,6 +1,13 @@
  from enum import Enum
 
- EXTRACTION_BUCKET = "extraction-storage"
+ from ..utils import RetryStrategy
+
+ # url of the gcs proxy
+ INGEST_URL = "https://ingest.castordoc.com"
+
+ RETRY_BASE_MS = 10_000
+ RETRY_JITTER_MS = 1_000
+ RETRY_STRATEGY = RetryStrategy.LINEAR
 
 
  class FileType(Enum):
castor_extractor/uploader/upload.py CHANGED
@@ -1,83 +1,100 @@
  #!/usr/bin/env python3
- import json
  import logging
  import ntpath
  from datetime import datetime
- from typing import Iterable, Optional, Union
+ from typing import Dict, Iterable, Optional, Tuple
  from uuid import UUID
 
- from google.cloud import storage # type: ignore
-
- from .constant import EXTRACTION_BUCKET, PATH_TEMPLATES, FileType
+ import requests
+
+ from ..utils.retry import retry
+ from .constant import (
+     INGEST_URL,
+     PATH_TEMPLATES,
+     RETRY_BASE_MS,
+     RETRY_JITTER_MS,
+     RETRY_STRATEGY,
+     FileType,
+ )
  from .env import get_blob_env
- from .utils import file_exist, iter_files
+ from .utils import iter_files
 
  logger = logging.getLogger(__name__)
 
+ _EXCEPTIONS = (
+     requests.exceptions.Timeout,
+     requests.exceptions.ConnectTimeout,
+ )
 
- def _client(credentials: Union[str, dict]) -> storage.Client:
-     """supports dict, string or path to the JSON file"""
-     if isinstance(credentials, dict):
-         return storage.Client.from_service_account_info(credentials)
-     if file_exist(credentials):
-         return storage.Client.from_service_account_json(credentials)
-     if isinstance(credentials, str):
-         credentials = json.loads(credentials)
-         return storage.Client.from_service_account_info(credentials)
-     raise ValueError("needs path or dict for credentials")
 
+ def _path_and_url(
+     source_id: UUID,
+     file_type: FileType,
+     file_path: str,
+ ) -> Tuple[str, str]:
 
- def _path(source_id: UUID, file_type: FileType, file_path: str) -> str:
      now = datetime.utcnow()
      timestamp = int(now.timestamp())
      filename = ntpath.basename(file_path)
-
      path_template = PATH_TEMPLATES[file_type]
-     return path_template.format(
+     path = path_template.format(
          timestamp=timestamp,
          source_id=source_id,
          filename=filename,
      )
 
+     url = f"{INGEST_URL}/{path}"
 
- def _get_blob(
-     credentials: Union[str, dict],
-     source_id: UUID,
-     file_path: str,
-     file_type: FileType,
- ) -> storage.Blob:
-     """get the target blob to upload to"""
-     client = _client(credentials)
-     path = _path(source_id, file_type, file_path)
+     return path, url
 
-     bucket = client.bucket(EXTRACTION_BUCKET)
-     return bucket.blob(path)
+
+ def _headers(token: str) -> Dict:
+     return {
+         "Authorization": f"Token {token}",
+         "Accept": "text/csv, application/json",
+     }
 
 
  def _upload(
-     credentials: Union[str, dict],
+     token: str,
      source_id: UUID,
      file_path: str,
      file_type: FileType,
  ) -> None:
      """
-     credentials: path to file or dict
-     source_id: id for the source
-     file_type: type of file to upload
-     file_path: path to the local file to upload
+     Upload the given file to Google Cloud Storage (GCS)
+     - Don't call GCS API directly
+     - Call the ingestion proxy which handles authorisation and uploading
      """
-     timeout, retries = get_blob_env()
+     path, url = _path_and_url(source_id, file_type, file_path)
+     headers = _headers(token)
+     timeout, max_retries = get_blob_env()
 
-     blob = _get_blob(credentials, source_id, file_path, file_type)
-     with open(file_path, "rb") as f:
-         blob.upload_from_file(f, timeout=timeout, num_retries=retries)
-     logger.info(
-         f"uploaded {file_path} as {file_type.value} to {blob.public_url}",
-     )
+     with open(file_path, "rb") as file_content:
+
+         @retry(
+             exceptions=_EXCEPTIONS,
+             max_retries=max_retries,
+             base_ms=RETRY_BASE_MS,
+             jitter_ms=RETRY_JITTER_MS,
+             strategy=RETRY_STRATEGY,
+         )
+         def _request_post():
+             response = requests.post(
+                 url=url,
+                 headers=headers,
+                 data=file_content,
+                 timeout=timeout,
+             )
+             response.raise_for_status()
+
+         _request_post()
+
+     logger.info(f"Uploaded {file_path} as {file_type.value} to {path}")
 
 
  def upload_manifest(
-     credentials: Union[str, dict],
+     token: str,
      source_id: UUID,
      file_path: str,
  ) -> None:
@@ -86,11 +103,11 @@ def upload_manifest(
      source_id: id for the source
      file_path: path to the local manifest to upload
      """
-     _upload(credentials, source_id, file_path, FileType.DBT)
+     _upload(token, source_id, file_path, FileType.DBT)
 
 
  def upload(
-     credentials: Union[str, dict],
+     token: str,
      source_id: UUID,
      file_type: FileType,
      file_path: Optional[str] = None,
@@ -113,4 +130,4 @@ def upload(
          raise ValueError(message)
 
      for file_ in files:
-         _upload(token, source_id, file_, file_type)
+         _upload(token, source_id, file_, file_type)
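Note: for reference, a minimal sketch of calling the token-based uploader from Python, assuming `upload` and `FileType` are importable from `castor_extractor.uploader` (the import path and all values below are illustrative placeholders, not taken from this release):

    from uuid import UUID

    from castor_extractor.uploader import FileType, upload  # assumed re-exports

    # placeholders: use the API token provided by Castor and your own source id
    token = "<api-token-provided-by-castor>"
    source_id = UUID("00000000-0000-0000-0000-000000000000")

    # POSTs the file to the ingestion proxy (INGEST_URL) with a `Token <...>`
    # Authorization header, retrying on timeouts per uploader/constant.py
    upload(
        token=token,
        source_id=source_id,
        file_type=FileType.VIZ,
        file_path="/tmp/extraction/visualization.csv",  # placeholder path
    )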
castor_extractor/uploader/upload_test.py CHANGED
@@ -1,7 +1,7 @@
  from uuid import UUID
 
- from .constant import FileType
- from .upload import _path
+ from .constant import INGEST_URL, FileType
+ from .upload import _path_and_url
 
 
  def test__path():
@@ -9,5 +9,6 @@ def test__path():
      file_type = FileType.VIZ
      file_path = "filename"
 
-     path = _path(source_id, file_type, file_path)
+     path, url = _path_and_url(source_id, file_type, file_path)
      assert path == f"visualization-{source_id}/{file_path}"
+     assert url == f"{INGEST_URL}/{path}"
castor_extractor/uploader/utils.py CHANGED
@@ -1,13 +1,23 @@
+ import logging
  import os
  from typing import Iterator
 
+ logger = logging.getLogger(__name__)
+
+ _ALLOWED_EXTENSION = (".json", ".csv")
+
 
  def iter_files(repository_path: str) -> Iterator[str]:
      """
      Given a repository path yield all files in that repository
+     Removes file whose extension is not allowed
      """
 
      for file in os.listdir(repository_path):
+         _, ext = os.path.splitext(file)
+         if ext not in _ALLOWED_EXTENSION:
+             logger.info(f"Forbidden file extension : skipping {file}")
+             continue
          file_path = os.path.join(repository_path, file)
 
          if os.path.isfile(file_path):
castor_extractor/utils/client/api.py CHANGED
@@ -5,7 +5,7 @@ import requests
 
  logger = logging.getLogger(__name__)
 
- DEFAULT_TIMEOUT_MS = 30_000
+ DEFAULT_TIMEOUT_S = 30
 
  # https://requests.readthedocs.io/en/latest/api/#requests.request
  HttpMethod = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
@@ -20,7 +20,7 @@ class APIClient:
      def __init__(self, host: str, token: Optional[str] = None):
          self._host = host
          self._token = token or ""
-         self._timeout = DEFAULT_TIMEOUT_MS
+         self._timeout = DEFAULT_TIMEOUT_S
 
      @staticmethod
      def build_url(host: str, path: str):
@@ -44,7 +44,12 @@ class APIClient:
      ) -> Any:
          logger.debug(f"Calling {method} on {url}")
          result = requests.request(
-             method, url, headers=self._headers(), params=params, json=data
+             method,
+             url,
+             headers=self._headers(),
+             params=params,
+             json=data,
+             timeout=self._timeout,
          )
          result.raise_for_status()
 
castor_extractor/utils/retry.py CHANGED
@@ -68,7 +68,8 @@ class Retry(BaseModel):
          self._retry_attempts += 1
          wait_ms = self.base() + self.jitter()
          wait_s = float(wait_ms) / MS_IN_SEC
-         logger.warning(f"Attempting a new call in {wait_s} seconds")
+         msg = f"Attempting a new call in {wait_s} seconds, {self._retry_attempts} attempt(s) / {self.max_retries} max retries"
+         logger.warning(msg)
          time.sleep(wait_s)
          return True
 
@@ -93,6 +94,7 @@ def retry(
          try:
              return None, callable(*args, **kwargs)
          except exceptions_ as err:
+             logger.warning(f"Exception within {callable.__name__}")
              return err, None
 
          def _func(*args, **kwargs) -> Any:
castor_extractor/visualization/tableau_revamp/client/client.py CHANGED
@@ -31,8 +31,11 @@ _TSC_ASSETS = (
  # increase the value when extraction is too slow
  # decrease the value when timeouts arise
  _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
-     # fields and columns are light but volumes are bigger
-     TableauRevampAsset.COLUMN: 200,
+     # for some clients, extraction of columns tend to hit the node limit
+     # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
+     # the workaround is to reduce pagination
+     TableauRevampAsset.COLUMN: 50,
+     # fields are light but volumes are bigger
      TableauRevampAsset.FIELD: 1000,
      TableauRevampAsset.TABLE: 50,
  }
castor_extractor/visualization/tableau_revamp/client/gql_queries.py CHANGED
@@ -63,12 +63,21 @@ downstreamWorkbooks { id }
  id
  name
  ... on DatabaseTable {
-     connectionType
      fullName
      schema
+     database {
+         connectionType
+         id
+         name
+     }
  }
  ... on CustomSQLTable {
      query
+     database {
+         connectionType
+         id
+         name
+     }
  }
  """
 
castor_extractor/warehouse/abstract/__init__.py CHANGED
@@ -1,6 +1,8 @@
  from .asset import (
+     ADDITIONAL_LINEAGE_ASSETS,
      CATALOG_ASSETS,
      EXTERNAL_LINEAGE_ASSETS,
+     FUNCTIONS_ASSETS,
      QUERIES_ASSETS,
      VIEWS_ASSETS,
      SupportedAssets,
castor_extractor/warehouse/abstract/asset.py CHANGED
@@ -7,6 +7,8 @@ from ...types import ExternalAsset, classproperty
  class WarehouseAsset(ExternalAsset):
      """Assets that can be extracted from warehouses"""
 
+     ADDITIONAL_COLUMN_LINEAGE = "additional_column_lineage"
+     ADDITIONAL_TABLE_LINEAGE = "additional_table_lineage"
      COLUMN = "column"
      COLUMN_LINEAGE = "column_lineage" # specific to snowflake
      DATABASE = "database"
@@ -19,22 +21,28 @@ class WarehouseAsset(ExternalAsset):
      ROLE = "role"
      SCHEMA = "schema"
      TABLE = "table"
+     FUNCTION = "function"
      USER = "user"
      VIEW_DDL = "view_ddl"
 
      @classproperty
      def optional(cls) -> Set["WarehouseAsset"]:
          return {
+             WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+             WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
              WarehouseAsset.EXTERNAL_COLUMN_LINEAGE,
              WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
+             WarehouseAsset.FUNCTION,
          }
 
 
  class WarehouseAssetGroup(Enum):
      """Groups of assets that can be extracted together"""
 
+     ADDITIONAL_LINEAGE = "additional_lineage"
      CATALOG = "catalog"
      EXTERNAL_LINEAGE = "external_lineage"
+     FUNCTION = "function"
      QUERY = "query"
      ROLE = "role"
      SNOWFLAKE_LINEAGE = "snowflake_lineage"
@@ -53,6 +61,7 @@ CATALOG_ASSETS = (
  )
 
  # shared by technologies supporting queries
+ FUNCTIONS_ASSETS = (WarehouseAsset.FUNCTION,)
  QUERIES_ASSETS = (WarehouseAsset.QUERY,)
  VIEWS_ASSETS = (WarehouseAsset.VIEW_DDL,)
 
@@ -61,6 +70,11 @@ EXTERNAL_LINEAGE_ASSETS = (
      WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
  )
 
+ ADDITIONAL_LINEAGE_ASSETS = (
+     WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
+     WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
+ )
+
  NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}
 
 
castor_extractor/warehouse/databricks/client.py CHANGED
@@ -1,18 +1,38 @@
  import logging
+ from concurrent.futures import ThreadPoolExecutor
  from datetime import date
  from functools import partial
- from typing import Any, Dict, List, Optional, Set
+ from typing import Any, Dict, List, Optional, Set, Tuple, cast
 
- from ...utils import at_midnight, date_after, mapping_from_rows
+ import requests
+
+ from ...utils import (
+     SafeMode,
+     at_midnight,
+     date_after,
+     mapping_from_rows,
+     retry,
+     safe_mode,
+ )
  from ...utils.client.api import APIClient
  from ...utils.pager import PagerOnToken
  from ..abstract.time_filter import TimeFilter
  from .credentials import DatabricksCredentials
  from .format import DatabricksFormatter
- from .types import TablesColumns
+ from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink
 
  logger = logging.getLogger(__name__)
 
+ _MAX_NUMBER_OF_LINEAGE_ERRORS = 1000
+ _MAX_THREADS = 10
+ _RETRY_ATTEMPTS = 3
+ _RETRY_BASE_MS = 1000
+ _RETRY_EXCEPTIONS = [
+     requests.exceptions.ConnectTimeout,
+ ]
+
+ safe_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
+
 
  def _day_to_epoch_ms(day: date) -> int:
      return int(at_midnight(day).timestamp() * 1000)
@@ -22,6 +42,30 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
      return int(at_midnight(day).timestamp() * 1000) + (hour * 3600 * 1000)
 
 
+ class LineageLinks:
+     """
+     helper class that handles lineage deduplication and filtering
+     """
+
+     def __init__(self):
+         self.lineage: Dict[Link, Ostr] = dict()
+
+     def add(self, timestamped_link: TimestampedLink) -> None:
+         """
+         keep the most recent lineage link, adding to `self.lineage`
+         """
+         parent, child, timestamp = timestamped_link
+         link = (parent, child)
+         if not self.lineage.get(link):
+             self.lineage[link] = timestamp
+         else:
+             if not timestamp:
+                 return
+             # keep most recent link; cast for mypy
+             recent = max(cast(str, self.lineage[link]), cast(str, timestamp))
+             self.lineage[link] = recent
+
+
  class DatabricksClient(APIClient):
      """Databricks Client"""
 
@@ -123,6 +167,198 @@ class DatabricksClient(APIClient):
          columns.extend(c_to_add)
          return tables, columns
 
+     @staticmethod
+     def _to_table_path(table: dict) -> Ostr:
+         if table.get("name"):
+             return f"{table['catalog_name']}.{table['schema_name']}.{table['name']}"
+         return None
+
+     @staticmethod
+     def _to_column_path(column: dict) -> Ostr:
+         if column.get("name"):
+             return f"{column['catalog_name']}.{column['schema_name']}.{column['table_name']}.{column['name']}"
+         return None
+
+     def _link(
+         self, path_from: Ostr, path_to: Ostr, timestamp: Ostr
+     ) -> OTimestampedLink:
+         """exclude missing path and self-lineage"""
+         if (not path_from) or (not path_to):
+             return None
+         is_self_lineage = path_from.lower() == path_to.lower()
+         if is_self_lineage:
+             return None
+         return (path_from, path_to, timestamp)
+
+     def _single_table_lineage_links(
+         self, table_path: str, single_table_lineage: dict
+     ) -> List[TimestampedLink]:
+         """
+         process databricks lineage API response for a given table
+         returns a list of (parent, child, timestamp)
+
+         Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+         we could also have `notebookInfos` or `fileInfo`
+         """
+         links: List[OTimestampedLink] = []
+         # add parent:
+         for link in single_table_lineage.get("upstreams", []):
+             parent = link.get("tableInfo", {})
+             parent_path = self._to_table_path(parent)
+             timestamp: Ostr = parent.get("lineage_timestamp")
+             links.append(self._link(parent_path, table_path, timestamp))
+
+         # add children:
+         for link in single_table_lineage.get("downstreams", []):
+             child = link.get("tableInfo", {})
+             child_path = self._to_table_path(child)
+             timestamp = child.get("lineage_timestamp")
+             links.append(self._link(table_path, child_path, timestamp))
+
+         return list(filter(None, links))
+
+     @safe_mode(safe_params, lambda: [])
+     @retry(
+         exceptions=_RETRY_EXCEPTIONS,
+         max_retries=_RETRY_ATTEMPTS,
+         base_ms=_RETRY_BASE_MS,
+     )
+     def get_single_table_lineage(
+         self, table_path: str
+     ) -> List[TimestampedLink]:
+         """
+         Helper function used in get_lineage_links.
+         Call data lineage API and return the content of the result
+         eg table_path: broward_prd.bronze.account_adjustments
+         FYI: Maximum rate of 50 requests per SECOND
+         """
+         path = "api/2.0/lineage-tracking/table-lineage"
+         payload = {"table_name": table_path, "include_entity_lineage": True}
+         content = self.get(path=path, payload=payload)
+         return self._single_table_lineage_links(table_path, content)
+
+     def _deduplicate_lineage(self, lineages: List[TimestampedLink]) -> dict:
+         deduplicated_lineage = LineageLinks()
+         for timestamped_link in lineages:
+             deduplicated_lineage.add(timestamped_link)
+         return deduplicated_lineage.lineage
+
+     def table_lineage(self, tables: List[dict]) -> List[dict]:
+         """
+         Wrapper function that retrieves all table lineage
+         """
+         # retrieve table lineage
+         with ThreadPoolExecutor(max_workers=_MAX_THREADS) as executor:
+             table_paths = [
+                 ".".join([table["schema_id"], table["table_name"]])
+                 for table in tables
+             ]
+             results = executor.map(self.get_single_table_lineage, table_paths)
+         lineages = [link for links in results for link in links]
+         deduplicated = self._deduplicate_lineage(lineages)
+         return self.formatter.format_lineage(deduplicated)
+
+     @staticmethod
+     def _paths_for_column_lineage(
+         tables: List[dict], columns: List[dict], table_lineage: List[dict]
+     ) -> List[Tuple[str, str]]:
+         """
+         helper providing a list of candidate columns to look lineage for:
+         we only look for column lineage where there is table lineage
+         """
+         # mapping between table id and its path db.schema.table
+         # table["schema_id"] follows the pattern `db.schema`
+         mapping = {
+             table["id"]: ".".join([table["schema_id"], table["table_name"]])
+             for table in tables
+         }
+
+         tables_with_lineage: Set[str] = set()
+         for t in table_lineage:
+             tables_with_lineage.add(t["parent_path"])
+             tables_with_lineage.add(t["child_path"])
+
+         paths_to_return: List[Tuple[str, str]] = []
+         for column in columns:
+             table_path = mapping[column["table_id"]]
+             if table_path not in tables_with_lineage:
+                 continue
+             column_ = (table_path, column["column_name"])
+             paths_to_return.append(column_)
+
+         return paths_to_return
+
+     def _single_column_lineage_links(
+         self, column_path: str, single_column_lineage: dict
+     ) -> List[TimestampedLink]:
+         """
+         process databricks lineage API response for a given table
+         returns a list of (parent, child, timestamp)
+
+         Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
+         we could also have `notebookInfos` or `fileInfo`
+         """
+         links: List[OTimestampedLink] = []
+         # add parent:
+         for link in single_column_lineage.get("upstream_cols", []):
+             parent_path = self._to_column_path(link)
+             timestamp: Ostr = link.get("lineage_timestamp")
+             links.append(self._link(parent_path, column_path, timestamp))
+
+         # add children:
+         for link in single_column_lineage.get("downstream_cols", []):
+             child_path = self._to_column_path(link)
+             timestamp = link.get("lineage_timestamp")
+             links.append(self._link(column_path, child_path, timestamp))
+
+         return list(filter(None, links))
+
+     @safe_mode(safe_params, lambda: [])
+     @retry(
+         exceptions=_RETRY_EXCEPTIONS,
+         max_retries=_RETRY_ATTEMPTS,
+         base_ms=_RETRY_BASE_MS,
+     )
+     def get_single_column_lineage(
+         self,
+         names: Tuple[str, str],
+     ) -> List[TimestampedLink]:
+         """
+         Helper function used in get_lineage_links.
+         Call data lineage API and return the content of the result
+
+         eg table_path: broward_prd.bronze.account_adjustments
+         FYI: Maximum rate of 10 requests per SECOND
+         """
+         table_path, column_name = names
+         api_path = "api/2.0/lineage-tracking/column-lineage"
+         payload = {
+             "table_name": table_path,
+             "column_name": column_name,
+             "include_entity_lineage": True,
+         }
+         content = self.get(path=api_path, payload=payload)
+         column_path = f"{table_path}.{column_name}"
+         return self._single_column_lineage_links(column_path, content)
+
+     def column_lineage(
+         self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
+     ) -> List[dict]:
+         """
+         Wrapper function that retrieves all column lineage
+         we only try to retrieve column lineage if we found table lineage
+         """
+         candidate_paths = self._paths_for_column_lineage(
+             tables, columns, table_lineage
+         )
+         lineages: List[TimestampedLink] = [
+             link
+             for paths in candidate_paths
+             for link in self.get_single_column_lineage(paths)
+         ]
+         deduplicated = self._deduplicate_lineage(lineages)
+         return self.formatter.format_lineage(deduplicated)
+
      @staticmethod
      def _time_filter(time_filter: Optional[TimeFilter]) -> dict:
          """time filter to retrieve Databricks' queries"""
castor_extractor/warehouse/databricks/client_test.py CHANGED
@@ -1,9 +1,16 @@
  from datetime import date
+ from unittest.mock import Mock, patch
 
  from freezegun import freeze_time
 
  from ..abstract.time_filter import TimeFilter
- from .client import DatabricksClient, _day_hour_to_epoch_ms
+ from .client import DatabricksClient, LineageLinks, _day_hour_to_epoch_ms
+ from .test_constants import (
+     CLOSER_DATE,
+     MOCK_TABLES_FOR_TABLE_LINEAGE,
+     OLDER_DATE,
+     TABLE_LINEAGE_SIDE_EFFECT,
+ )
 
 
  def test__day_hour_to_epoch_ms():
@@ -97,3 +104,56 @@ def test_DatabricksClient__match_table_with_user():
      table_without_owner = {"id": 1, "owner_email": None}
      actual = client._match_table_with_user(table_without_owner, user_mapping)
      assert actual == table_without_owner
+
+
+ @patch(
+     "source.packages.extractor.castor_extractor.warehouse.databricks.client.DatabricksClient.get",
+     side_effect=TABLE_LINEAGE_SIDE_EFFECT,
+ )
+ def test_DatabricksClient_table_lineage(mock_get):
+     client = DatabricksClient(Mock())
+
+     lineage = client.table_lineage(MOCK_TABLES_FOR_TABLE_LINEAGE)
+     assert len(lineage) == 2
+
+     expected_link_1 = {
+         "parent_path": "dev.silver.pre_analytics",
+         "child_path": "dev.silver.analytics",
+         "timestamp": OLDER_DATE,
+     }
+     expected_link_2 = {
+         "parent_path": "dev.bronze.analytics",
+         "child_path": "dev.silver.analytics",
+         "timestamp": CLOSER_DATE,
+     }
+     assert expected_link_1 in lineage
+     assert expected_link_2 in lineage
+
+
+ def test_LineageLinks_add():
+     links = LineageLinks()
+     timestamped_link = ("parent", "child", None)
+     expected_key = ("parent", "child")
+
+     links.add(timestamped_link)
+
+     assert expected_key in links.lineage
+     assert links.lineage[expected_key] is None
+
+     # we replace None by an actual timestamp
+     timestamped_link = ("parent", "child", OLDER_DATE)
+     links.add(timestamped_link)
+     assert expected_key in links.lineage
+     assert links.lineage[expected_key] == OLDER_DATE
+
+     # we update with the more recent timestamp
+     timestamped_link = ("parent", "child", CLOSER_DATE)
+     links.add(timestamped_link)
+     assert expected_key in links.lineage
+     assert links.lineage[expected_key] == CLOSER_DATE
+
+     # we keep the more recent timestamp
+     timestamped_link = ("parent", "child", OLDER_DATE)
+     links.add(timestamped_link)
+     assert expected_key in links.lineage
+     assert links.lineage[expected_key] == CLOSER_DATE
castor_extractor/warehouse/databricks/extract.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Optional
 
  from ...utils import AbstractStorage, LocalStorage, write_summary
  from ..abstract import (
+     ADDITIONAL_LINEAGE_ASSETS,
      CATALOG_ASSETS,
      EXTERNAL_LINEAGE_ASSETS,
      QUERIES_ASSETS,
@@ -17,6 +18,7 @@ from .client import DatabricksClient
  from .credentials import to_credentials
 
  DATABRICKS_ASSETS: SupportedAssets = {
+     WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,
      WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
      WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
      WarehouseAssetGroup.ROLE: (WarehouseAsset.USER,),
@@ -94,6 +96,39 @@ class DatabricksExtractionProcessor:
          logger.info(f"Extracted {len(columns)} columns to {location}")
          return catalog_locations
 
+     def extract_lineage(self) -> Paths:
+         if self._should_not_reextract(WarehouseAssetGroup.ADDITIONAL_LINEAGE):
+             return self._existing_group_paths(
+                 WarehouseAssetGroup.ADDITIONAL_LINEAGE
+             )
+         lineage_locations: Dict[str, str] = dict()
+
+         # extract catalog
+         databases = self._client.databases()
+         schemas = self._client.schemas(databases)
+         users = self._client.users()
+         tables, columns = self._client.tables_and_columns(schemas, users)
+         logger.info("Extracted pre-requisite catalog. Next comes lineage")
+
+         # extract table lineage
+         table_lineage = self._client.table_lineage(tables)
+         table_lineage_key = WarehouseAsset.ADDITIONAL_TABLE_LINEAGE.value
+         location = self._storage.put(table_lineage_key, table_lineage)
+         lineage_locations[table_lineage_key] = location
+         msg = f"Extracted {len(table_lineage)} table lineage to {location}"
+         logger.info(msg)
+
+         # extract column lineage
+         column_lineage = self._client.column_lineage(
+             tables, columns, table_lineage
+         )
+         column_lineage_key = WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE.value
+         location = self._storage.put(column_lineage_key, column_lineage)
+         lineage_locations[column_lineage_key] = location
+         msg = f"Extracted {len(column_lineage)} column lineage to {location}"
+         logger.info(msg)
+         return lineage_locations
+
      def extract_query(self, time_filter: OTimeFilter = None) -> Paths:
          """extract yesterday's queries and return their location"""
          if self._should_not_reextract(WarehouseAssetGroup.QUERY):
@@ -149,6 +184,7 @@ def extract_all(**kwargs) -> None:
      )
 
      extractor.extract_catalog()
+     extractor.extract_lineage()
      extractor.extract_query()
      extractor.extract_role()
      extractor.extract_view_ddl()
castor_extractor/warehouse/databricks/format.py CHANGED
@@ -95,6 +95,19 @@ class DatabricksFormatter:
 
          return tables, columns
 
+     @staticmethod
+     def format_lineage(timestamps: dict) -> List[dict]:
+         lineage: List[dict] = []
+         for link, timestamp in timestamps.items():
+             parent_path, child_path = link
+             link_ = {
+                 "parent_path": parent_path,
+                 "child_path": child_path,
+                 "timestamp": timestamp,
+             }
+             lineage.append(link_)
+         return lineage
+
      @staticmethod
      def format_query(raw_queries: List[dict]) -> List[dict]:
          queries = []
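Note: a minimal sketch of the data shapes involved, based on the diff above: `LineageLinks.lineage` maps a `(parent, child)` tuple to the most recent timestamp (or None), and `DatabricksFormatter.format_lineage` flattens that mapping into rows. The table paths below are illustrative, taken from the test constants in this release:

    # deduplicated mapping produced by LineageLinks / _deduplicate_lineage
    timestamps = {
        ("dev.silver.pre_analytics", "dev.silver.analytics"): "2024-04-18 20:20:20.0",
        ("dev.bronze.analytics", "dev.silver.analytics"): None,
    }

    # DatabricksFormatter.format_lineage(timestamps) then yields:
    # [
    #     {"parent_path": "dev.silver.pre_analytics",
    #      "child_path": "dev.silver.analytics",
    #      "timestamp": "2024-04-18 20:20:20.0"},
    #     {"parent_path": "dev.bronze.analytics",
    #      "child_path": "dev.silver.analytics",
    #      "timestamp": None},
    # ]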
castor_extractor/warehouse/databricks/test_constants.py ADDED
@@ -0,0 +1,79 @@
+ OLDER_DATE = "2024-04-18 20:20:20.0"
+ CLOSER_DATE = "2024-04-19 20:20:20.0"
+
+ MOCK_TABLES_FOR_TABLE_LINEAGE = [
+     {
+         "id": "f51ba2ca-8cc3-4de6-8f8b-730359e8f40f",
+         "schema_id": "dev.silver",
+         "table_name": "analytics",
+     },
+     {
+         "id": "4e140bdc-a67c-4b68-8a07-c684657d8b44",
+         "schema_id": "dev.silver",
+         "table_name": "pre_analytics",
+     },
+     {
+         "id": "7d403198-55ea-4a40-9995-6ee2f4c79dfa",
+         "schema_id": "dev.bronze",
+         "table_name": "analytics",
+     },
+ ]
+
+ _RAW_LINEAGE_DEV_SILVER_ANALYTICS = {
+     "upstreams": [
+         {  # there could be other keys: jobInfos, notebookInfos, queryInfos
+             "tableInfo": {
+                 "name": "pre_analytics",
+                 "catalog_name": "dev",
+                 "schema_name": "silver",
+                 "table_type": "PERSISTED_VIEW",  # not used
+                 "lineage_timestamp": OLDER_DATE,
+             }
+         },
+         {
+             "tableInfo": {
+                 "name": "analytics",
+                 "catalog_name": "dev",
+                 "schema_name": "bronze",
+                 "table_type": "PERSISTED_VIEW",  # not used
+                 "lineage_timestamp": CLOSER_DATE,
+             }
+         },
+     ],
+     "downstreams": [],
+ }
+ _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS = {
+     "upstreams": [],
+     "downstreams": [
+         {
+             "tableInfo": {
+                 "name": "analytics",
+                 "catalog_name": "dev",
+                 "schema_name": "silver",
+                 "table_type": "PERSISTED_VIEW",  # not used
+                 "lineage_timestamp": OLDER_DATE,
+             }
+         },
+     ],
+ }
+ _RAW_LINEAGE_DEV_BRONZE_ANALYTICS = {
+     "upstreams": [],
+     "downstreams": [
+         {
+             "tableInfo": {
+                 "name": "analytics",
+                 "catalog_name": "dev",
+                 "schema_name": "silver",
+                 "table_type": "PERSISTED_VIEW",  # not used
+                 "lineage_timestamp": OLDER_DATE,
+             }
+         },
+     ],
+ }
+
+ # should be in the same order as MOCK_TABLES_FOR_TABLE_LINEAGE
+ TABLE_LINEAGE_SIDE_EFFECT: tuple = (
+     _RAW_LINEAGE_DEV_SILVER_ANALYTICS,
+     _RAW_LINEAGE_DEV_SILVER_PRE_ANALYTICS,
+     _RAW_LINEAGE_DEV_BRONZE_ANALYTICS,
+ )
castor_extractor/warehouse/databricks/types.py CHANGED
@@ -1,3 +1,8 @@
- from typing import List, Tuple
+ from typing import List, Optional, Tuple
 
+ Link = Tuple[str, str]
  TablesColumns = Tuple[List[dict], List[dict]]
+ Ostr = Optional[str]
+ TimestampedLink = Tuple[str, str, Ostr]
+
+ OTimestampedLink = Optional[TimestampedLink]
castor_extractor/warehouse/snowflake/extract.py CHANGED
@@ -4,6 +4,7 @@ from ...utils import LocalStorage, from_env, write_summary
  from ..abstract import (
      CATALOG_ASSETS,
      EXTERNAL_LINEAGE_ASSETS,
+     FUNCTIONS_ASSETS,
      QUERIES_ASSETS,
      VIEWS_ASSETS,
      SQLExtractionProcessor,
@@ -20,6 +21,7 @@ logger = logging.getLogger(__name__)
 
  SNOWFLAKE_ASSETS: SupportedAssets = {
      WarehouseAssetGroup.CATALOG: CATALOG_ASSETS,
+     WarehouseAssetGroup.FUNCTION: FUNCTIONS_ASSETS,
      WarehouseAssetGroup.QUERY: QUERIES_ASSETS,
      WarehouseAssetGroup.VIEW_DDL: VIEWS_ASSETS,
      WarehouseAssetGroup.ROLE: (
castor_extractor/warehouse/snowflake/queries/function.sql ADDED
@@ -0,0 +1,10 @@
+ SELECT
+     f.function_name AS name,
+     CONCAT(f.function_catalog, '.', f.function_schema, '.', f.function_name) AS path,
+     f.argument_signature AS signature,
+     f.function_definition AS definition
+ FROM snowflake.account_usage.functions f
+ WHERE TRUE
+     AND f.function_catalog NOT IN ('SNOWFLAKE', 'UTIL_DB')
+     AND f.function_language = 'SQL'
+     AND deleted IS NULL
{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.16.11
+ Version: 0.17.0
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA
{castor_extractor-0.16.11.dist-info → castor_extractor-0.17.0.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=CuRENmJ6p4IM6b8vrmt6QI8uN8mX4a-FI_hJ4cQkPps,10588
+ CHANGELOG.md,sha256=EVZ9vhIVN7HLn5PYkRyBWyT3hk72Nt3i1SghwSipfR4,10957
  Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
  README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578
@@ -23,7 +23,7 @@ castor_extractor/commands/extract_snowflake.py,sha256=vYiruxRoo--GeMemOGsSE1w9kc
  castor_extractor/commands/extract_sqlserver.py,sha256=lwhbcNChaXHZgMgSOch3faVr7WJw-sDU6GHl3lzBt_0,1141
  castor_extractor/commands/extract_tableau.py,sha256=u-6UCd-kfXwyhNWYxZusqtgTTYkf4gAJS1vRIYWsAVU,1415
  castor_extractor/commands/file_check.py,sha256=PkXFK-kPoG8PpvBP-PCvVdreqwYw6Y1fTh2QzDxACsM,2684
- castor_extractor/commands/upload.py,sha256=tWN9hUn1aKJrGEmNHI_qjSciFiAoL9x7YolbIsYbg8Y,1956
+ castor_extractor/commands/upload.py,sha256=tAyHfIDOdUpD0yMJe2f64nXpaVnIbgYXi4bYx2nNvNU,1922
  castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
  castor_extractor/file_checker/column.py,sha256=fMchy5v-Sd-0xuYS0V9mob7wnljslzWLhQGqrKGybdk,3097
  castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -38,16 +38,16 @@ castor_extractor/file_checker/templates/generic_warehouse.py,sha256=zvnWnYB8FNvh
  castor_extractor/logger.py,sha256=ovf1mBEKwbJBskBXoqHbcAomBrp58mUwSrCWtEMlYPM,1197
  castor_extractor/types.py,sha256=-QgiOaq--nXUsYLy_oESDrYbRMxs353-YiQnG1blJvU,1303
  castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1htq3CURQg,75
- castor_extractor/uploader/constant.py,sha256=hEJlWYx0dyBzgo59XUBKCYIKEODpIc2DyzwAZIiNO8g,718
+ castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
  castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
  castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
- castor_extractor/uploader/upload.py,sha256=5Aj3UOx8cpSVvzjYRz7S6nLk249IqUiCia70utU_970,3363
- castor_extractor/uploader/upload_test.py,sha256=BfGjAYEEDBmEcUS6_b3SlKyiQNR1iRf6-qmADDirTJI,328
- castor_extractor/uploader/utils.py,sha256=NCe0tkB28BVhqzOaDhDjaSfODjjcPWB17X6chnvyCWs,478
+ castor_extractor/uploader/upload.py,sha256=bTWD1_-hmJ6q1qcEosjZ96wsBtWDnWoCt692NYX_Nko,3228
+ castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
+ castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
  castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
  castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
  castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
- castor_extractor/utils/client/api.py,sha256=tHa7eC11sS_eOCXhlnvUa2haRfOLENmjKgjB09Ijt0s,1664
+ castor_extractor/utils/client/api.py,sha256=z1o4fteWx1HxNTqCYihl9sGkIgSQTbd8lW_B9Y2wyeQ,1742
  castor_extractor/utils/client/api_test.py,sha256=NSMdXg1FLc37erqHp2FZsIsogWVv6lFSs7rDXHikr-E,542
  castor_extractor/utils/client/postgres.py,sha256=n6ulaT222WWPY0_6qAZ0MHF0m91HtI9mMqL71nyygo0,866
  castor_extractor/utils/client/query.py,sha256=O6D5EjD1KmBlwa786Uw4D4kzxx97_HH50xIIeSWt0B8,205
@@ -80,7 +80,7 @@ castor_extractor/utils/pager/pager_on_id_test.py,sha256=CfAXhXaAmCXnm0oflj8_82An
  castor_extractor/utils/pager/pager_on_token.py,sha256=G442SKl4BXJFMPbYIIgCk5M8wl7V3jMg3K1WUUkl0I0,1579
  castor_extractor/utils/pager/pager_on_token_test.py,sha256=w2GCUGKR3cD5lfmtFAsNvExtzxkYdBR0pusBrGKFQ08,2548
  castor_extractor/utils/pager/pager_test.py,sha256=QPBVShSXhkiYZUfnAMs43xnys6CD8pAhL3Jhj-Ov2Xc,1705
- castor_extractor/utils/retry.py,sha256=vYdJMiM-Nr82H1MuD7_KZdqbFz98ffQGqJ4Owbr6mpY,3252
+ castor_extractor/utils/retry.py,sha256=OsUS3qysHCkgWge8BgBwyuvoWcJ6pR_RQmQDcHlors4,3410
  castor_extractor/utils/retry_test.py,sha256=nsMttlmyKygVcffX3Hay8U2S1BspkGPiCmzIXPpLKyk,2230
  castor_extractor/utils/safe.py,sha256=jpfIimwdBSVUvU2DPFrhqpKC_DSYwxQqd08MlIkSODY,1967
  castor_extractor/utils/safe_test.py,sha256=IHN1Z761tYMFslYC-2HAfkXmFPh4LYSqNLs4QZwykjk,2160
@@ -244,16 +244,16 @@ castor_extractor/visualization/tableau/usage.py,sha256=LlFwlbEr-EnYUJjKZha99CRCR
  castor_extractor/visualization/tableau_revamp/__init__.py,sha256=a3DGjQhaz17gBqW-E84TAgupKbqLC40y5Ajo1yn-ot4,156
  castor_extractor/visualization/tableau_revamp/assets.py,sha256=owlwaI2E4UKk1YhkaHgaAXx6gu3Op6EqZ7bjp0tHI6s,351
  castor_extractor/visualization/tableau_revamp/client/__init__.py,sha256=wmS9uLtUiqNYVloi0-DgD8d2qzu3RVZEAtWiaDp6G_M,90
- castor_extractor/visualization/tableau_revamp/client/client.py,sha256=T7v84dnT97sFqVdzJdk1aOZ7S6U9u6d-j3KBqVj91eY,9532
+ castor_extractor/visualization/tableau_revamp/client/client.py,sha256=RSoHDfz79ma0YJRGpiCihnwLGmoxLzphYrxRVyvByHI,9742
  castor_extractor/visualization/tableau_revamp/client/credentials.py,sha256=fHG32egq6ll2U4BNazalMof_plzfCMQjrN9WOs6kezk,3014
  castor_extractor/visualization/tableau_revamp/client/errors.py,sha256=dTe1shqmWmAXpDpCz-E24m8dGYjt6rvIGV9qQb4jnvI,150
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256=VP6xXi1mWKDGVnkWPLstLHqc3T4GVSnywyyoT6BJkFY,2153
+ castor_extractor/visualization/tableau_revamp/client/gql_queries.py,sha256=-V3ToD5Gi7nmfVB2OxTOZw8dcOiF7_ciSWjjW2UdvvI,2270
  castor_extractor/visualization/tableau_revamp/client/tsc_fields.py,sha256=WsDliPCo-XsQ7wN-j0gpW9bdxCHvgH-aePywiltzfbU,688
  castor_extractor/visualization/tableau_revamp/constants.py,sha256=PcdudAogQhi3e-knalhgliMKjy5ahN0em_-7XSLrnxM,87
  castor_extractor/visualization/tableau_revamp/extract.py,sha256=2SLUxp5okM4AcEJJ61ZgcC2ikfZZl9MH17CEXMXmgl0,1450
  castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- castor_extractor/warehouse/abstract/__init__.py,sha256=QNwFRsLpH6aqVpl37qzklLr62iA85Yx6nZAivHDhpyk,366
- castor_extractor/warehouse/abstract/asset.py,sha256=qe5ugm7fnkvjbzdELRAeywbuKH4OLq2YHlXdjepehxE,2159
+ castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
+ castor_extractor/warehouse/abstract/asset.py,sha256=Qs7T2Iw7KHgWVT2aAoBfCQ8tB143cUZY-DRUSkpgvGU,2689
  castor_extractor/warehouse/abstract/asset_test.py,sha256=_kd4ybNlWSAdSdEgJKC-jhJTa1nMRa9i8RO3YbqKLM4,758
  castor_extractor/warehouse/abstract/extract.py,sha256=fVBhdE-yMI_g6RBYZcr7q-ZVW7jK7WVkO_GO_KfkRqg,2908
  castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6EFrrlW43w-LiXKo,2632
@@ -277,13 +277,14 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
  castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
  castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
  castor_extractor/warehouse/databricks/__init__.py,sha256=bTvDxjGQGM2J3hOnVhfNmFP1y8DK0tySiD_EXe5_xWE,200
- castor_extractor/warehouse/databricks/client.py,sha256=FsHlpHZ9JTG92Rf_8Z7277o9HBaAD0CKxSEHiujOgXg,8271
- castor_extractor/warehouse/databricks/client_test.py,sha256=Y-LBveZFRVaaL49Lo2MwbcJReBcYLNRdHtR_w7xWNWQ,3381
+ castor_extractor/warehouse/databricks/client.py,sha256=oHR_htE25p5tiAAFZKbF48efo7tqIENW4dAGA7yEqHg,16895
+ castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
  castor_extractor/warehouse/databricks/credentials.py,sha256=PpGv5_GP320UQjV_gvaxSpOw58AmqSznmjGhGfe6bdU,655
- castor_extractor/warehouse/databricks/extract.py,sha256=-vJhAIxSu1lD_xGl-GXZYTmc5BGu0aXM3l-U0UghREM,5773
- castor_extractor/warehouse/databricks/format.py,sha256=Nd5L89yWhpIl0OEMV7WK1H3JYUa9WGPC0c-NUOT_uXM,5101
+ castor_extractor/warehouse/databricks/extract.py,sha256=VX-3uo5dZucenrg-wnPur3CxOgpC5H7Ds92TO7OTAjc,7379
+ castor_extractor/warehouse/databricks/format.py,sha256=2bRy2fa45NW3uk030rmyba4n2Em-NnyZPBurUslEbcw,5522
  castor_extractor/warehouse/databricks/format_test.py,sha256=iPmdJof43fBYL1Sa_fBrCWDQHCHgm7IWCZag1kWkj9E,1970
- castor_extractor/warehouse/databricks/types.py,sha256=T2SyLy9pY_olLtstdC77moPxIiikVsuQLMxh92YMJQo,78
+ castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
+ castor_extractor/warehouse/databricks/types.py,sha256=hD6gC8oiT3QSWEvbtgUOGK_lLzzz36sEauB3lS_wxlE,218
  castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
  castor_extractor/warehouse/mysql/client.py,sha256=IwoJvbmE5VZkMCP9yHf6ta3_AQPEuBPrZZ3meefbcJs,974
  castor_extractor/warehouse/mysql/client_test.py,sha256=wRTv-3c5chy_HKj-buasNiYOOCIfynYqbabM4Hxdh5E,1052
@@ -334,11 +335,12 @@ castor_extractor/warehouse/snowflake/client.py,sha256=XT0QLVNff_586SDuMe40iu8FCw
  castor_extractor/warehouse/snowflake/client_test.py,sha256=ihWtOOAQfh8pu5JTr_EWfqefKOVIaJXznACURzaU1Qs,1432
  castor_extractor/warehouse/snowflake/credentials.py,sha256=wbUdbx9jVSHzg2kNDhMFuDstbVTyZOcGAwnSeGeFIqs,875
  castor_extractor/warehouse/snowflake/credentials_test.py,sha256=Lkc-DHXOvr50KrqAW4nt_x0IA0Mu_CsBVu6ATnzQB6I,673
- castor_extractor/warehouse/snowflake/extract.py,sha256=x-qCz51wAsPyeP91-nuGqT1Q-AH-5iXGUKCiIV6tlFY,2882
+ castor_extractor/warehouse/snowflake/extract.py,sha256=fcze0VBe9OOAFSr25T9L6CY506Vm_xDEvvy8NWuLW1s,2956
  castor_extractor/warehouse/snowflake/queries/.sqlfluff,sha256=vttrwcr64JVIuvc7WIg9C54cbOkjg_VjXNR7YnTGOPE,31
  castor_extractor/warehouse/snowflake/queries/column.sql,sha256=pAW2UNnut0a483OY2rjOXCdCtQg0g254g61Bt51CIB4,1803
  castor_extractor/warehouse/snowflake/queries/column_lineage.sql,sha256=YKBiZ6zySSNcXLDXwm31EjGIIkkkZc0-S6hI1SRM80o,1179
  castor_extractor/warehouse/snowflake/queries/database.sql,sha256=ifZXoKUXtsrGOxml6AcNhA4yybIyatH5va7bcp-lgCU,483
+ castor_extractor/warehouse/snowflake/queries/function.sql,sha256=8LRh0ybhd-RldJ8UZspWUm3yv52evq11O2uqIO4KqeQ,372
  castor_extractor/warehouse/snowflake/queries/grant_to_role.sql,sha256=O7AJ1LzoXGDFmiVvQ8EMJ5x8FSAnaxRPdmRyAlEmkUM,272
  castor_extractor/warehouse/snowflake/queries/grant_to_user.sql,sha256=7AalVajU5vRRpIiys1igSwmDXirbwpMTvJr2ihSz2NE,143
  castor_extractor/warehouse/snowflake/queries/query.sql,sha256=-OYcWUvdPBkpOfezkZaW7hrOdDz3JyoqjNdRm_88Rsk,1779
@@ -368,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
  castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
  castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
  castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
- castor_extractor-0.16.11.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.16.11.dist-info/METADATA,sha256=sVbdD6MsgGVPxckw8tREx_xeajevgThiIkuU2IFYBaM,6583
- castor_extractor-0.16.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- castor_extractor-0.16.11.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
- castor_extractor-0.16.11.dist-info/RECORD,,
+ castor_extractor-0.17.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.17.0.dist-info/METADATA,sha256=mPiUyxCqXFifcPbhcOPFsnkPAV4OcWXoYzGeUKlbkoo,6582
+ castor_extractor-0.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ castor_extractor-0.17.0.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
+ castor_extractor-0.17.0.dist-info/RECORD,,