castor-extractor 0.17.0__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +12 -0
- castor_extractor/uploader/upload.py +1 -1
- castor_extractor/utils/client/api.py +7 -2
- castor_extractor/warehouse/databricks/__init__.py +1 -1
- castor_extractor/warehouse/databricks/client.py +110 -5
- castor_extractor/warehouse/databricks/credentials.py +9 -10
- castor_extractor/warehouse/databricks/extract.py +2 -2
- castor_extractor/warehouse/databricks/format.py +54 -9
- castor_extractor/warehouse/databricks/format_test.py +77 -18
- castor_extractor/warehouse/databricks/utils.py +27 -0
- castor_extractor/warehouse/databricks/utils_test.py +25 -0
- {castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/METADATA +6 -1
- {castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/RECORD +16 -14
- {castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/LICENCE +0 -0
- {castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/WHEEL +0 -0
- {castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED

@@ -1,5 +1,17 @@
 # Changelog
 
+## 0.17.3 - 2024-06-24
+
+* Databricks: extract tags for tables and column
+
+## 0.17.2 - 2024-06-14
+
+* Uploader: support multipart
+
+## 0.17.1 - 2024-06-12
+
+* Databricks: extract table source links
+
 ## 0.17.0 - 2024-06-10
 
 * Uploader: redirect to the proxy, replace credentials with token
castor_extractor/utils/client/api.py
CHANGED

@@ -58,7 +58,12 @@ class APIClient:
 
         return result.json()
 
-    def get(self, path: str, payload: Optional[dict] = None) -> dict:
+    def get(
+        self,
+        path: str,
+        payload: Optional[dict] = None,
+        processor: Optional[Callable] = None,
+    ) -> dict:
         """path: REST API operation path, such as /api/2.0/clusters/get"""
         url = self.build_url(self._host, path)
-        return self._call(url=url, data=payload)
+        return self._call(url=url, data=payload, processor=processor)
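The new optional `processor` argument lets callers reshape the raw HTTP response before `get` returns, instead of always receiving plain JSON. A minimal sketch of the calling convention, assuming `_call` (not shown in this diff) simply hands the `requests.Response` to the processor; the helper name and the commented usage are illustrative only:

```python
from typing import Tuple

import requests


def json_and_workspace_id(response: requests.Response) -> Tuple[dict, str]:
    # Mirrors _process_table_response in the Databricks client below:
    # return the JSON body plus a value read from the response headers.
    return response.json(), response.headers["X-Databricks-Org-Id"]


# Hypothetical usage, assuming `client` is an APIClient instance:
# content, workspace_id = client.get(
#     path="api/2.1/unity-catalog/tables",
#     payload={"catalog_name": "foo", "schema_name": "bar"},
#     processor=json_and_workspace_id,
# )
```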
castor_extractor/warehouse/databricks/client.py
CHANGED

@@ -1,10 +1,14 @@
 import logging
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from datetime import date
+from enum import Enum
 from functools import partial
 from typing import Any, Dict, List, Optional, Set, Tuple, cast
 
 import requests
+from databricks import sql  # type: ignore
+from requests import Response
 
 from ...utils import (
     SafeMode,

@@ -18,8 +22,9 @@ from ...utils.client.api import APIClient
 from ...utils.pager import PagerOnToken
 from ..abstract.time_filter import TimeFilter
 from .credentials import DatabricksCredentials
-from .format import DatabricksFormatter
+from .format import DatabricksFormatter, TagMapping
 from .types import Link, Ostr, OTimestampedLink, TablesColumns, TimestampedLink
+from .utils import build_path, tag_label
 
 logger = logging.getLogger(__name__)
 

@@ -30,10 +35,20 @@ _RETRY_BASE_MS = 1000
 _RETRY_EXCEPTIONS = [
     requests.exceptions.ConnectTimeout,
 ]
+_WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
+
+_INFORMATION_SCHEMA_SQL = "SELECT * FROM system.information_schema"
 
 safe_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
 
 
+class TagEntity(Enum):
+    """Entities that can be tagged in Databricks"""
+
+    COLUMN = "COLUMN"
+    TABLE = "TABLE"
+
+
 def _day_to_epoch_ms(day: date) -> int:
     return int(at_midnight(day).timestamp() * 1000)
 

@@ -74,12 +89,38 @@ class DatabricksClient(APIClient):
         credentials: DatabricksCredentials,
         db_allowed: Optional[Set[str]] = None,
         db_blocked: Optional[Set[str]] = None,
+        has_table_tags: bool = False,
+        has_column_tags: bool = False,
     ):
         super().__init__(host=credentials.host, token=credentials.token)
+        self._http_path = credentials.http_path
         self._db_allowed = db_allowed
         self._db_blocked = db_blocked
+        self._has_table_tags = has_table_tags
+        self._has_column_tags = has_column_tags
         self.formatter = DatabricksFormatter()
 
+    def execute_sql(
+        self,
+        query: str,
+        params: Optional[dict] = None,
+    ):
+        """
+        Execute a SQL query on Databricks system tables and return the results.
+        https://docs.databricks.com/en/dev-tools/python-sql-connector.html
+
+        /!\ credentials.http_path is required in order to run SQL queries
+        """
+        assert self._http_path, "HTTP_PATH is required to run SQL queries"
+        with sql.connect(
+            server_hostname=self._host,
+            http_path=self._http_path,
+            access_token=self._token,
+        ) as connection:
+            with connection.cursor() as cursor:
+                cursor.execute(query, params)
+                return cursor.fetchall()
+
     @staticmethod
     def name() -> str:
         return "Databricks"
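The new `execute_sql` method runs queries against Databricks system tables through the databricks-sql-connector package (hence the new `databricks` extra and the `http_path` credential). A standalone sketch of the same connector flow, with placeholder connection values:

```python
from databricks import sql  # provided by databricks-sql-connector

# Placeholder values; in the client these come from DatabricksCredentials.
with sql.connect(
    server_hostname="dbc-example.cloud.databricks.com",
    http_path="/sql/1.0/warehouses/abc123",  # required to run SQL queries
    access_token="dapi-...",
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM system.information_schema.table_tags")
        rows = cursor.fetchall()  # list of Row objects
```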
@@ -120,15 +161,38 @@ class DatabricksClient(APIClient):
             for schema in self._schemas_of_database(database)
         ]
 
-    def _tables_columns_of_schema(self, schema: dict) -> TablesColumns:
+    @staticmethod
+    def _process_table_response(response: Response) -> Tuple[dict, str]:
+        """
+        Returns both the JSON content and the Workspace ID, which is found
+        in the response's headers.
+        """
+        return response.json(), response.headers[_WORKSPACE_ID_HEADER]
+
+    def _tables_columns_of_schema(
+        self,
+        schema: dict,
+        table_tags: TagMapping,
+        column_tags: TagMapping,
+    ) -> TablesColumns:
         path = "api/2.1/unity-catalog/tables"
         payload = {
             "catalog_name": schema["database_id"],
             "schema_name": schema["schema_name"],
         }
-        content = self.get(path=path, payload=payload)
+        content, workspace_id = self.get(
+            path=path,
+            payload=payload,
+            processor=self._process_table_response,
+        )
+        host = self.build_url(self._host, path="")
         return self.formatter.format_table_column(
-            content.get("tables", []),
+            raw_tables=content.get("tables", []),
+            schema=schema,
+            host=host,
+            workspace_id=workspace_id,
+            table_tags=table_tags,
+            column_tags=column_tags,
         )
 
     @staticmethod
@@ -141,6 +205,40 @@ class DatabricksClient(APIClient):
             return table
         return {**table, "owner_external_id": owner_external_id}
 
+    def _needs_extraction(self, entity: TagEntity) -> bool:
+        if entity == TagEntity.TABLE:
+            return self._has_table_tags
+        if entity == TagEntity.COLUMN:
+            return self._has_column_tags
+        raise AssertionError(f"Entity not supported: {entity}")
+
+    def _get_tags_mapping(self, entity: TagEntity) -> TagMapping:
+        """
+        Fetch tags of the given entity and build a mapping:
+        { path: list[tags] }
+
+        https://docs.databricks.com/en/sql/language-manual/information-schema/table_tags.html
+        https://docs.databricks.com/en/sql/language-manual/information-schema/column_tags.html
+        """
+        if not self._needs_extraction(entity):
+            # extracting tags require additional credentials (http_path)
+            return dict()
+
+        table = f"{entity.value.lower()}_tags"
+        query = f"{_INFORMATION_SCHEMA_SQL}.{table}"
+        result = self.execute_sql(query)
+        mapping = defaultdict(list)
+        for row in result:
+            dict_row = row.asDict()
+            keys = ["catalog_name", "schema_name", "table_name"]
+            if entity == TagEntity.COLUMN:
+                keys.append("column_name")
+            path = build_path(dict_row, keys)
+            label = tag_label(dict_row)
+            mapping[path].append(label)
+
+        return mapping
+
     @staticmethod
     def _get_user_mapping(users: List[dict]) -> dict:
         return {
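`_get_tags_mapping` folds one label per tag row into a dotted asset path, producing the `TagMapping` (`{path: [labels]}`) consumed by the formatter. A sketch of that grouping with made-up rows; plain dicts stand in for the connector's `Row` objects, which the client converts with `row.asDict()`:

```python
from collections import defaultdict

# Made-up rows shaped like system.information_schema.table_tags.
rows = [
    {"catalog_name": "foo", "schema_name": "bar", "table_name": "baz",
     "tag_name": "pii", "tag_value": ""},
    {"catalog_name": "foo", "schema_name": "bar", "table_name": "baz",
     "tag_name": "tier", "tag_value": "gold"},
]

mapping = defaultdict(list)
for row in rows:
    path = ".".join(row[k] for k in ("catalog_name", "schema_name", "table_name"))
    # tag_label: bare name when the value is empty, "name:value" otherwise.
    label = row["tag_name"] if not row["tag_value"] else f"{row['tag_name']}:{row['tag_value']}"
    mapping[path].append(label)

assert mapping == {"foo.bar.baz": ["pii", "tier:gold"]}
```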
@@ -157,8 +255,15 @@ class DatabricksClient(APIClient):
         tables: List[dict] = []
         columns: List[dict] = []
         user_mapping = self._get_user_mapping(users)
+        table_tags = self._get_tags_mapping(TagEntity.TABLE)
+        column_tags = self._get_tags_mapping(TagEntity.COLUMN)
         for schema in schemas:
-            t_to_add, c_to_add = self._tables_columns_of_schema(schema)
+
+            t_to_add, c_to_add = self._tables_columns_of_schema(
+                schema=schema,
+                table_tags=table_tags,
+                column_tags=column_tags,
+            )
             t_with_owner = [
                 self._match_table_with_user(table, user_mapping)
                 for table in t_to_add
castor_extractor/warehouse/databricks/credentials.py
CHANGED

@@ -1,11 +1,10 @@
 from dataclasses import field
+from typing import Optional
 
 from pydantic.dataclasses import dataclass
+from pydantic_settings import SettingsConfigDict
 
-
-
-_HOST = "CASTOR_DATABRICKS_HOST"
-_TOKEN = "CASTOR_DATABRICKS_TOKEN"  # noqa: S105
+DATABRICKS_ENV_PREFIX = "CASTOR_DATABRICKS_"
 
 
 @dataclass

@@ -19,10 +18,10 @@ class DatabricksCredentials:
 
     host: str
     token: str = field(metadata={"sensitive": True})
+    http_path: Optional[str] = field(default=None)
 
-
-
-
-
-        return DatabricksCredentials(host=host, token=token)
+    model_config = SettingsConfigDict(
+        env_prefix=DATABRICKS_ENV_PREFIX,
+        extra="ignore",
+        populate_by_name=True,
+    )
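With `env_prefix` set to `CASTOR_DATABRICKS_`, the credential fields map onto prefixed environment variables; the removed `_HOST`/`_TOKEN` constants confirm that naming for host and token. A sketch with placeholder values, assuming the settings are populated from the environment:

```python
import os

# Placeholder values only.
os.environ["CASTOR_DATABRICKS_HOST"] = "dbc-example.cloud.databricks.com"
os.environ["CASTOR_DATABRICKS_TOKEN"] = "dapi-..."
# Optional field, but needed for tag extraction via execute_sql:
os.environ["CASTOR_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/abc123"
```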
castor_extractor/warehouse/databricks/extract.py
CHANGED

@@ -15,7 +15,7 @@ from ..abstract import (
     common_args,
 )
 from .client import DatabricksClient
-from .credentials import
+from .credentials import DatabricksCredentials
 
 DATABRICKS_ASSETS: SupportedAssets = {
     WarehouseAssetGroup.ADDITIONAL_LINEAGE: ADDITIONAL_LINEAGE_ASSETS,

@@ -170,7 +170,7 @@ def extract_all(**kwargs) -> None:
     output_directory, skip_existing = common_args(kwargs)
 
     client = DatabricksClient(
-        credentials=
+        credentials=DatabricksCredentials(**kwargs),
         db_allowed=kwargs.get("db_allowed"),
         db_blocked=kwargs.get("db_blocked"),
     )
castor_extractor/warehouse/databricks/format.py
CHANGED

@@ -1,14 +1,19 @@
 import logging
 from datetime import datetime
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from .types import TablesColumns
+from .utils import build_path
 
 logger = logging.getLogger(__name__)
 
 EXCLUDED_DATABASES = {"system"}
 EXCLUDED_SCHEMAS = {"information_schema", "default"}
 
+TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"
+
+TagMapping = Dict[str, List[str]]
+
 
 def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
     """return time in ms as datetime or None"""
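`TABLE_URL_TPL` assembles the table source link from the workspace host and the workspace ID returned in the API response headers. A quick check of the template, reusing the placeholder values from `format_test.py` below:

```python
TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"

url = TABLE_URL_TPL.format(
    host="https://some.cloud.databricks.net/",  # host already ends with "/"
    catalog_name="foo",
    schema_name="bar",
    table_name="baz",
    workspace_id="123456",
)
assert url == "https://some.cloud.databricks.net/explore/data/foo/bar/baz?o=123456"
```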
@@ -17,26 +22,61 @@ def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
     return datetime.fromtimestamp(time_ms / 1000.0)
 
 
-def _table_payload(schema: dict, table: dict) -> dict:
+def _table_payload(
+    schema: dict,
+    table: dict,
+    host: str,
+    workspace_id: str,
+    tags: TagMapping,
+) -> dict:
+    """
+    Prepares the table payload. This also includes a source link which is built
+    here using the host and workspace_id.
+    """
+    url = TABLE_URL_TPL.format(
+        host=host,
+        catalog_name=table["catalog_name"],
+        schema_name=table["schema_name"],
+        table_name=table["name"],
+        workspace_id=workspace_id,
+    )
+
+    keys = ["catalog_name", "schema_name", "name"]
+    path = build_path(table, keys)
+
     return {
         "description": table.get("comment"),
         "id": table["table_id"],
         "owner_email": table.get("owner"),
         "schema_id": f"{schema['id']}",
         "table_name": table["name"],
-        "tags": [],
+        "tags": tags.get(path, []),
         "type": table.get("table_type"),
+        "url": url,
     }
 
 
-def _column_payload(table: dict, column: dict) -> dict:
+def _column_path(table: dict, column: dict) -> str:
+    keys = ["catalog_name", "schema_name", "name"]
+    table_path = build_path(table, keys)
+    column_name = column["name"]
+    return f"{table_path}.{column_name}"
+
+
+def _column_payload(
+    table: dict,
+    column: dict,
+    tags: TagMapping,
+) -> dict:
+    path = _column_path(table, column)
     return {
         "column_name": column["name"],
         "data_type": column["type_name"],
         "description": column.get("comment"),
-        "id": f"`{table['
+        "id": f"`{table['table_id']}`.`{column['name']}`",
         "ordinal_position": column["position"],
-        "table_id": table["
+        "table_id": table["table_id"],
+        "tags": tags.get(path, []),
     }
 
@@ -78,19 +118,24 @@ class DatabricksFormatter:
 
     @staticmethod
     def format_table_column(
-        raw_tables: List[dict],
+        raw_tables: List[dict],
+        schema: dict,
+        host: str,
+        workspace_id: str,
+        table_tags: TagMapping,
+        column_tags: TagMapping,
     ) -> TablesColumns:
         tables = []
         columns = []
         if not raw_tables:
             return [], []
         for table in raw_tables:
-            t = _table_payload(schema, table)
+            t = _table_payload(schema, table, host, workspace_id, table_tags)
             tables.append(t)
             if not table.get("columns"):
                 continue
             for column in table["columns"]:
-                c = _column_payload(table, column)
+                c = _column_payload(table, column, column_tags)
                 columns.append(c)
 
         return tables, columns
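End to end, `format_table_column` now threads the host, workspace ID and both tag mappings down to the per-table and per-column payloads. A hedged sketch with minimal made-up inputs (real ones come from the Unity Catalog tables endpoint):

```python
from castor_extractor.warehouse.databricks.format import DatabricksFormatter

raw_tables = [{
    "catalog_name": "foo",
    "schema_name": "bar",
    "name": "baz",
    "table_id": "t-1",
    "columns": [{"name": "Uid", "type_name": "STRING", "position": 0}],
}]

tables, columns = DatabricksFormatter.format_table_column(
    raw_tables=raw_tables,
    schema={"id": "id123", "database_id": "foo", "schema_name": "bar"},
    host="https://some.cloud.databricks.net/",
    workspace_id="123456",
    table_tags={"foo.bar.baz": ["tier:gold"]},  # keyed by dotted path
    column_tags={},
)
assert tables[0]["tags"] == ["tier:gold"] and len(columns) == 1
```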
castor_extractor/warehouse/databricks/format_test.py
CHANGED

@@ -1,6 +1,12 @@
 from datetime import datetime
 
-from .format import
+from .format import (
+    DatabricksFormatter,
+    _column_path,
+    _column_payload,
+    _table_payload,
+    _to_datetime_or_none,
+)
 
 
 def test__to_datetime_or_none():

@@ -23,42 +29,95 @@ def test_DatabricksFormatter__primary():
     assert DatabricksFormatter._primary([]) is None
 
 
+def test__table_payload():
+    schema = {"id": "id123"}
+
+    table = {
+        "name": "baz",
+        "catalog_name": "foo",
+        "schema_name": "bar",
+        "table_type": "MANAGED",
+        "owner": "pot@ato.com",
+        "table_id": "732pot5e-8ato-4c27-b701-9fa51febc192",
+    }
+    host = "https://some.cloud.databricks.net/"
+    workspace_id = "123456"
+
+    tags = {
+        "foo.bar.baz": ["riri", "fifi"],
+        "dummy.path": ["loulou"],
+    }
+
+    payload = _table_payload(schema, table, host, workspace_id, tags)
+
+    expected = {
+        "description": None,
+        "id": "732pot5e-8ato-4c27-b701-9fa51febc192",
+        "owner_email": "pot@ato.com",
+        "schema_id": "id123",
+        "table_name": "baz",
+        "tags": ["riri", "fifi"],
+        "type": "MANAGED",
+        "url": "https://some.cloud.databricks.net/explore/data/foo/bar/baz?o=123456",
+    }
+    assert payload == expected
+
+
 def test__column_payload():
     table = {
-        "
-        "
-        "
-        "
-        "
-        "
+        "catalog_name": "foo",
+        "name": "baz",
+        "owner": "pot@ato.com",
+        "schema_name": "bar",
+        "table_id": "732pot5e-8ato-4c27-b701-9fa51febc192",
+        "table_type": "MANAGED",
     }
     column = {
+        "comment": "some description",
         "name": "Uid",
-        "
-        "type_name": "STRING",
+        "nullable": True,
         "position": 0,
+        "type_json": '{"name":"Uid","type":"string","nullable":true,"metadata":{}}',
+        "type_name": "STRING",
         "type_precision": 0,
         "type_scale": 0,
-        "
-        "nullable": True,
-        "comment": "some description",
+        "type_text": "string",
     }
-    payload = _column_payload(table, column)
+    tags = {
+        "foo.bar.baz.Uid": ["riri", "fifi"],
+        "dummy.path": ["loulou"],
+    }
+    payload = _column_payload(table, column, tags)
 
     expected = {
-        "id": "`18175cd5-9b9b-4d78-9d28-caaa12c21ce0`.`Uid`",
         "column_name": "Uid",
-        "table_id": "18175cd5-9b9b-4d78-9d28-caaa12c21ce0",
-        "description": "some description",
         "data_type": "STRING",
+        "description": "some description",
+        "id": "`732pot5e-8ato-4c27-b701-9fa51febc192`.`Uid`",
         "ordinal_position": 0,
+        "table_id": "732pot5e-8ato-4c27-b701-9fa51febc192",
+        "tags": ["riri", "fifi"],
     }
     assert payload == expected
 
     # case where there are spaces in the name
     column["name"] = "column name with spaces"
-    payload = _column_payload(table, column)
+    payload = _column_payload(table, column, tags)
     expected_id = (
-        "`
+        "`732pot5e-8ato-4c27-b701-9fa51febc192`.`column name with spaces`"
     )
     assert payload["id"] == expected_id
+
+
+def test__column_path():
+    table = {
+        "catalog_name": "Jo",
+        "schema_name": "William",
+        "name": "Jack",
+    }
+    column = {
+        "name": "Averell",
+    }
+
+    expected = "Jo.William.Jack.Averell"
+    assert _column_path(table=table, column=column) == expected
castor_extractor/warehouse/databricks/utils.py
ADDED

@@ -0,0 +1,27 @@
+from typing import Dict, List
+
+
+def build_path(
+    row: Dict,
+    keys: List[str],
+) -> str:
+    """
+    format an asset's path:
+    - picks the given keys from dict
+    - join keys with a dot "."
+    """
+    key_values = [row[key] for key in keys]
+    return ".".join(key_values)
+
+
+def tag_label(row: Dict) -> str:
+    """
+    format the tag's label:
+    - {key:value} when the value is not empty
+    - {key} otherwise
+    """
+    tag_name = row["tag_name"]
+    tag_value = row["tag_value"]
+    if not tag_value:
+        return tag_name
+    return f"{tag_name}:{tag_value}"
castor_extractor/warehouse/databricks/utils_test.py
ADDED

@@ -0,0 +1,25 @@
+from .utils import build_path, tag_label
+
+
+def test_build_path():
+    row = {
+        "bigflo": "oli",
+        "laurel": "hardy",
+        "dupond": "dupont",
+    }
+    keys = ["laurel", "dupond"]
+    assert build_path(row, keys) == "hardy.dupont"
+
+
+def test_tag_label():
+    row = {
+        "tag_name": "marketplace",
+        "tag_value": "",
+    }
+    assert tag_label(row) == "marketplace"
+
+    row = {
+        "tag_name": "fi",
+        "tag_value": "fou",
+    }
+    assert tag_label(row) == "fi:fou"
{castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.17.0
+Version: 0.17.3
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA

@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: all
 Provides-Extra: bigquery
+Provides-Extra: databricks
 Provides-Extra: dbt
 Provides-Extra: looker
 Provides-Extra: metabase

@@ -29,6 +30,7 @@ Provides-Extra: snowflake
 Provides-Extra: sqlserver
 Provides-Extra: tableau
 Requires-Dist: cryptography (>=41.0.5) ; extra == "snowflake"
+Requires-Dist: databricks-sql-connector (>=3.2.0,<4.0.0) ; extra == "databricks" or extra == "all"
 Requires-Dist: google-api-core (>=2.1.1,<3.0.0)
 Requires-Dist: google-auth (>=2,<3)
 Requires-Dist: google-cloud-core (>=2.1.0,<3.0.0)

@@ -39,6 +41,9 @@ Requires-Dist: looker-sdk (>=23.0.0) ; extra == "looker" or extra == "all"
 Requires-Dist: msal (>=1.20.0,<2.0.0) ; extra == "powerbi" or extra == "all"
 Requires-Dist: numpy (<1.25) ; python_version >= "3.8" and python_version < "3.9"
 Requires-Dist: numpy (>=1.26,<2) ; python_version >= "3.12" and python_version < "3.13"
+Requires-Dist: pandas (>=2,<2.2.0) ; python_version >= "3.9" and python_full_version <= "3.11.0"
+Requires-Dist: pandas (>=2.0,<2.1) ; python_version >= "3.8" and python_version < "3.9"
+Requires-Dist: pandas (>=2.1,<2.2.0) ; python_version >= "3.12" and python_version < "3.13"
 Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0) ; extra == "metabase" or extra == "postgres" or extra == "redshift" or extra == "all"
 Requires-Dist: pycryptodome (>=3.0.0,<4.0.0) ; extra == "metabase" or extra == "all"
 Requires-Dist: pydantic (>=2.6,<3.0)
{castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=dxvJYXKwACP7txJdP-1Ug0G6tj34Vsd8TkEn1uuhLgs,11152
 Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
 README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578

@@ -41,13 +41,13 @@ castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1h
 castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
 castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
 castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
-castor_extractor/uploader/upload.py,sha256=
+castor_extractor/uploader/upload.py,sha256=W1TGqO8_PtFdR661qNlh6v-LOIRvoJoda65-5OujFXs,3239
 castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
 castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
 castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
 castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
 castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
-castor_extractor/utils/client/api.py,sha256=
+castor_extractor/utils/client/api.py,sha256=AGDj2JH__Q_x7RQdodoVazGvjGQJ9TzNqs-XEX6Hrms,1840
 castor_extractor/utils/client/api_test.py,sha256=NSMdXg1FLc37erqHp2FZsIsogWVv6lFSs7rDXHikr-E,542
 castor_extractor/utils/client/postgres.py,sha256=n6ulaT222WWPY0_6qAZ0MHF0m91HtI9mMqL71nyygo0,866
 castor_extractor/utils/client/query.py,sha256=O6D5EjD1KmBlwa786Uw4D4kzxx97_HH50xIIeSWt0B8,205

@@ -276,15 +276,17 @@ castor_extractor/warehouse/bigquery/queries/user.sql,sha256=l-fkNGWJVdZwVhbFZL23
 castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTwcgNSBDD0ZXPgRjlxJjrZDSEH2MU,326
 castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
 castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
-castor_extractor/warehouse/databricks/__init__.py,sha256=
-castor_extractor/warehouse/databricks/client.py,sha256=
+castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
+castor_extractor/warehouse/databricks/client.py,sha256=pnYb6nl9U33nH6UukYP5piWGYF-m1SP2TYiWiUozM_4,20552
 castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
-castor_extractor/warehouse/databricks/credentials.py,sha256=
-castor_extractor/warehouse/databricks/extract.py,sha256=
-castor_extractor/warehouse/databricks/format.py,sha256=
-castor_extractor/warehouse/databricks/format_test.py,sha256=
+castor_extractor/warehouse/databricks/credentials.py,sha256=iphbVynVTQXMEbJy4QaT5fer-GpOi7QtbAlg8R7-Lj4,598
+castor_extractor/warehouse/databricks/extract.py,sha256=VYygE06f7ngYWVlRa48O6drLIZF-_4IBJdyXTYfxZQU,7395
+castor_extractor/warehouse/databricks/format.py,sha256=p252NFzQN1uZdsu5wpP-bMHK0rBBVzallX3-o92Mvh4,6744
+castor_extractor/warehouse/databricks/format_test.py,sha256=ls0IcOElqp_qecAzNbK0zdca7Pms4seCHimbw8NAoAI,3322
 castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
 castor_extractor/warehouse/databricks/types.py,sha256=hD6gC8oiT3QSWEvbtgUOGK_lLzzz36sEauB3lS_wxlE,218
+castor_extractor/warehouse/databricks/utils.py,sha256=RWRViqLaj2K0in5T5F6OLp7HCm554BCh3zi4CJqOEt8,576
+castor_extractor/warehouse/databricks/utils_test.py,sha256=5Qrd_tLNLWrDHX2uQyVUf0vqXJzD44uQGGxDBOkwvUU,503
 castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
 castor_extractor/warehouse/mysql/client.py,sha256=IwoJvbmE5VZkMCP9yHf6ta3_AQPEuBPrZZ3meefbcJs,974
 castor_extractor/warehouse/mysql/client_test.py,sha256=wRTv-3c5chy_HKj-buasNiYOOCIfynYqbabM4Hxdh5E,1052

@@ -370,8 +372,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
 castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
 castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
 castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
-castor_extractor-0.17.
-castor_extractor-0.17.
-castor_extractor-0.17.
-castor_extractor-0.17.
-castor_extractor-0.17.
+castor_extractor-0.17.3.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.17.3.dist-info/METADATA,sha256=v_xgS6DS7FC1kVzma9Z69XiMOjDn6BHckEJJ-rJ5TZI,6985
+castor_extractor-0.17.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+castor_extractor-0.17.3.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
+castor_extractor-0.17.3.dist-info/RECORD,,

{castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/LICENCE
File without changes

{castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/WHEEL
File without changes

{castor_extractor-0.17.0.dist-info → castor_extractor-0.17.3.dist-info}/entry_points.txt
File without changes