castor-extractor 0.16.6__py3-none-any.whl → 0.16.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- CHANGELOG.md +21 -0
- castor_extractor/utils/__init__.py +2 -1
- castor_extractor/utils/collection.py +32 -0
- castor_extractor/utils/collection_test.py +60 -0
- castor_extractor/utils/time.py +9 -1
- castor_extractor/utils/time_test.py +8 -1
- castor_extractor/visualization/domo/client/client.py +28 -43
- castor_extractor/visualization/domo/client/client_test.py +1 -23
- castor_extractor/visualization/domo/client/endpoints.py +13 -6
- castor_extractor/visualization/domo/client/pagination.py +4 -0
- castor_extractor/visualization/looker/api/client.py +21 -17
- castor_extractor/visualization/looker/api/sdk.py +10 -58
- castor_extractor/visualization/looker/api/utils.py +1 -1
- castor_extractor/visualization/looker/extract.py +2 -1
- castor_extractor/visualization/looker/multithreading.py +1 -1
- castor_extractor/visualization/tableau_revamp/client/client.py +79 -13
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +23 -16
- castor_extractor/visualization/tableau_revamp/client/tsc_fields.py +4 -0
- castor_extractor/warehouse/databricks/client.py +12 -5
- castor_extractor/warehouse/databricks/client_test.py +22 -3
- castor_extractor/warehouse/databricks/format.py +5 -1
- castor_extractor/warehouse/salesforce/client.py +8 -6
- castor_extractor/warehouse/salesforce/extract.py +2 -2
- castor_extractor/warehouse/salesforce/format.py +34 -7
- castor_extractor/warehouse/salesforce/format_test.py +49 -1
- {castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/METADATA +6 -3
- {castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/RECORD +30 -29
- {castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/LICENCE +0 -0
- {castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/WHEEL +0 -0
- {castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/entry_points.txt +0 -0
castor_extractor/visualization/tableau_revamp/client/client.py

```diff
@@ -2,6 +2,7 @@ import logging
 from typing import Dict, Iterator, List, Optional
 
 import tableauserverclient as TSC  # type: ignore
+from tableauserverclient import Pager
 
 from ....utils import SerializedAsset
 from ..assets import TableauRevampAsset
@@ -12,7 +13,7 @@ from ..constants import (
 )
 from .credentials import TableauRevampCredentials
 from .errors import TableauApiError
-from .gql_queries import GQL_QUERIES, QUERY_TEMPLATE
+from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
 from .tsc_fields import TSC_FIELDS
 
 logger = logging.getLogger(__name__)
@@ -27,13 +28,18 @@ _TSC_ASSETS = (
     TableauRevampAsset.USAGE,
 )
 
+# increase the value when extraction is too slow
+# decrease the value when timeouts arise
 _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
+    # fields and columns are light but volumes are bigger
+    TableauRevampAsset.COLUMN: 200,
     TableauRevampAsset.FIELD: 1000,
+    TableauRevampAsset.TABLE: 50,
 }
 
 
 def _pick_fields(
-    data:
+    data: Pager,
     asset: TableauRevampAsset,
 ) -> SerializedAsset:
     fields = TSC_FIELDS[asset]
@@ -44,7 +50,7 @@ def _pick_fields(
     return [_pick(row) for row in data]
 
 
-def _enrich_with_tsc(
+def _enrich_datasources_with_tsc(
     datasources: SerializedAsset,
     tsc_datasources: SerializedAsset,
 ) -> SerializedAsset:
@@ -69,6 +75,32 @@ def _enrich_with_tsc(
     return datasources
 
 
+def _enrich_workbooks_with_tsc(
+    workbooks: SerializedAsset,
+    tsc_workbooks: SerializedAsset,
+) -> SerializedAsset:
+    """
+    Enrich workbooks with fields coming from TableauServerClient:
+    - project_luid
+    """
+    mapping = {row["id"]: row for row in tsc_workbooks}
+
+    for workbook in workbooks:
+        luid = workbook["luid"]
+        tsc_workbook = mapping.get(luid)
+        if not tsc_workbook:
+            # it happens that a workbook is in Metadata API but not in TSC
+            # in this case, we push the workbook with default project
+            logger.warning(f"Workbook {luid} was not found in TSC")
+            workbook["projectLuid"] = None
+            continue
+
+        workbook["projectLuid"] = tsc_workbook["project_id"]
+
+    return workbooks
+
+
 def gql_query_scroll(
     server,
     query: str,
@@ -176,29 +208,32 @@ class TableauRevampClient:
         asset: TableauRevampAsset,
     ) -> SerializedAsset:
 
-        if asset == TableauRevampAsset.
-            data = TSC.Pager(self._server.
+        if asset == TableauRevampAsset.DATASOURCE:
+            data = TSC.Pager(self._server.datasources)
 
         elif asset == TableauRevampAsset.PROJECT:
             data = TSC.Pager(self._server.projects)
 
-        elif asset == TableauRevampAsset.DATASOURCE:
-            data = TSC.Pager(self._server.datasources)
-
         elif asset == TableauRevampAsset.USAGE:
             data = TSC.Pager(self._server.views, usage=True)
 
+        elif asset == TableauRevampAsset.USER:
+            data = TSC.Pager(self._server.users)
+
+        elif asset == TableauRevampAsset.WORKBOOK:
+            data = TSC.Pager(self._server.workbooks)
+
         else:
            raise AssertionError(f"Fetching from TSC not supported for {asset}")
 
         return _pick_fields(data, asset)
 
-    def
+    def _run_graphql_query(
         self,
-
+        resource: str,
+        fields: str,
+        page_size: int = DEFAULT_PAGE_SIZE,
     ) -> SerializedAsset:
-        resource, fields = GQL_QUERIES[asset]
-        page_size = _CUSTOM_PAGE_SIZE.get(asset) or DEFAULT_PAGE_SIZE
         query = QUERY_TEMPLATE.format(
             resource=resource,
             fields=fields,
@@ -207,13 +242,40 @@ class TableauRevampClient:
         result_pages = gql_query_scroll(self._server, query, resource)
         return [asset for page in result_pages for asset in page]
 
+    def _fetch_fields(self) -> SerializedAsset:
+        result: SerializedAsset = []
+        page_size = _CUSTOM_PAGE_SIZE[TableauRevampAsset.FIELD]
+        for resource, fields in FIELDS_QUERIES:
+            current = self._run_graphql_query(resource, fields, page_size)
+            result.extend(current)
+        return result
+
+    def _fetch_from_metadata_api(
+        self,
+        asset: TableauRevampAsset,
+    ) -> SerializedAsset:
+        if asset == TableauRevampAsset.FIELD:
+            return self._fetch_fields()
+
+        page_size = _CUSTOM_PAGE_SIZE.get(asset) or DEFAULT_PAGE_SIZE
+        resource, fields = GQL_QUERIES[asset]
+        return self._run_graphql_query(resource, fields, page_size)
+
     def _fetch_datasources(self) -> SerializedAsset:
         asset = TableauRevampAsset.DATASOURCE
 
         datasources = self._fetch_from_metadata_api(asset)
         datasource_projects = self._fetch_from_tsc(asset)
 
-        return
+        return _enrich_datasources_with_tsc(datasources, datasource_projects)
+
+    def _fetch_workbooks(self) -> SerializedAsset:
+        asset = TableauRevampAsset.WORKBOOK
+
+        workbooks = self._fetch_from_metadata_api(asset)
+        workbook_projects = self._fetch_from_tsc(asset)
+
+        return _enrich_workbooks_with_tsc(workbooks, workbook_projects)
 
     def fetch(
         self,
@@ -226,6 +288,10 @@ class TableauRevampClient:
             # both APIs are required to extract datasources
            return self._fetch_datasources()
 
+        if asset == TableauRevampAsset.WORKBOOK:
+            # both APIs are required to extract workbooks
+            return self._fetch_workbooks()
+
         if asset in _TSC_ASSETS:
             # some assets can only be extracted via TSC
             return self._fetch_from_tsc(asset)
```
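The workbook change mirrors the existing datasource flow: the Metadata API provides the workbook payloads, TSC supplies the `project_id`, and the two are joined on the workbook luid. A standalone sketch of that join, runnable with mock rows (the function and variable names here are illustrative, not part of the package):

```python
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)


def enrich_workbooks_with_tsc(workbooks, tsc_workbooks):
    # same join as _enrich_workbooks_with_tsc above: match on luid,
    # copy the TSC project_id into projectLuid
    mapping = {row["id"]: row for row in tsc_workbooks}
    for workbook in workbooks:
        luid = workbook["luid"]
        tsc_workbook = mapping.get(luid)
        if not tsc_workbook:
            # present in the Metadata API but not in TSC: default project
            logger.warning(f"Workbook {luid} was not found in TSC")
            workbook["projectLuid"] = None
            continue
        workbook["projectLuid"] = tsc_workbook["project_id"]
    return workbooks


workbooks = [{"luid": "wb-1"}, {"luid": "wb-2"}]
tsc_rows = [{"id": "wb-1", "project_id": "proj-9"}]
print(enrich_workbooks_with_tsc(workbooks, tsc_rows))
# [{'luid': 'wb-1', 'projectLuid': 'proj-9'}, {'luid': 'wb-2', 'projectLuid': None}]
```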
castor_extractor/visualization/tableau_revamp/client/gql_queries.py

```diff
@@ -18,7 +18,11 @@ QUERY_TEMPLATE = """
 
 _COLUMNS_QUERY = """
 downstreamDashboards { id }
-downstreamFields {
+downstreamFields {
+  id
+  __typename
+  datasource { id }
+}
 downstreamWorkbooks { id }
 id
 name
@@ -37,12 +41,10 @@ workbook { id }
 
 _DATASOURCES_QUERY = """
 __typename
-createdAt
 downstreamDashboards { id }
 downstreamWorkbooks { id }
 id
 name
-updatedAt
 ... on PublishedDatasource {
 description
 luid
@@ -64,7 +66,6 @@ name
 connectionType
 fullName
 schema
-tableType
 }
 ... on CustomSQLTable {
 query
@@ -80,7 +81,6 @@ id
 luid
 name
 owner { luid }
-projectLuid
 site { name }
 tags { name }
 updatedAt
@@ -96,16 +96,17 @@ downstreamWorkbooks { id }
 folderName
 id
 name
-
-
-
-
-
-
-
-
-
-}
+dataType
+role
+"""
+
+
+_FIELDS_QUERY_WITH_COLUMNS = f"""
+{_FIELDS_QUERY}
+columns {{
+  name
+  table {{ name }}
+}}
 """
 
 _SHEETS_QUERY = """
@@ -124,8 +125,14 @@ GQL_QUERIES: Dict[TableauRevampAsset, Tuple[str, str]] = {
     TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
     TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
     TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),
-    TableauRevampAsset.FIELD: ("fields", _FIELDS_QUERY),
     TableauRevampAsset.SHEET: ("sheets", _SHEETS_QUERY),
     TableauRevampAsset.TABLE: ("tables", _TABLES_QUERY),
     TableauRevampAsset.WORKBOOK: ("workbooks", _WORKBOOKS_QUERY),
 }
+
+FIELDS_QUERIES = (
+    ("binFields", _FIELDS_QUERY),
+    ("calculatedFields", _FIELDS_QUERY),
+    ("columnFields", _FIELDS_QUERY_WITH_COLUMNS),
+    ("groupFields", _FIELDS_QUERY),
+)
```
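Instead of a single `fields` query, field extraction now runs four GraphQL queries, one per field subtype, and only `columnFields` pays for the extra `columns` block. A sketch of how `_fetch_fields` consumes `FIELDS_QUERIES`; the real `QUERY_TEMPLATE` body is not shown in this diff, so the template and field list below are simplified stand-ins:

```python
# simplified stand-in for the real QUERY_TEMPLATE (not shown in the diff)
QUERY_TEMPLATE = """
query {{
  {resource}Connection(first: {page_size}) {{
    nodes {{ {fields} }}
  }}
}}
"""

_FIELDS_QUERY = "id name"  # stand-in for the real field list
_FIELDS_QUERY_WITH_COLUMNS = f"{_FIELDS_QUERY} columns {{ name }}"

FIELDS_QUERIES = (
    ("binFields", _FIELDS_QUERY),
    ("calculatedFields", _FIELDS_QUERY),
    ("columnFields", _FIELDS_QUERY_WITH_COLUMNS),
    ("groupFields", _FIELDS_QUERY),
)

# one query per field subtype, as in _fetch_fields
for resource, fields in FIELDS_QUERIES:
    print(QUERY_TEMPLATE.format(resource=resource, fields=fields, page_size=1000))
```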
castor_extractor/warehouse/databricks/client.py

```diff
@@ -3,7 +3,7 @@ from datetime import date
 from functools import partial
 from typing import Any, Dict, List, Optional, Set
 
-from ...utils import at_midnight, date_after
+from ...utils import at_midnight, date_after, mapping_from_rows
 from ...utils.client.api import APIClient
 from ...utils.pager import PagerOnToken
 from ..abstract.time_filter import TimeFilter
@@ -88,15 +88,22 @@ class DatabricksClient(APIClient):
     )
 
     @staticmethod
-    def _match_table_with_user(table: dict,
+    def _match_table_with_user(table: dict, user_mapping: dict) -> dict:
         table_owner_email = table.get("owner_email")
         if not table_owner_email:
             return table
-        owner_external_id =
+        owner_external_id = user_mapping.get(table_owner_email)
         if not owner_external_id:
             return table
         return {**table, "owner_external_id": owner_external_id}
 
+    @staticmethod
+    def _get_user_mapping(users: List[dict]) -> dict:
+        return {
+            **mapping_from_rows(users, "email", "id"),
+            **mapping_from_rows(users, "user_name", "id"),
+        }
+
     def tables_and_columns(
         self, schemas: List[dict], users: List[dict]
     ) -> TablesColumns:
@@ -105,11 +112,11 @@ class DatabricksClient(APIClient):
         """
         tables: List[dict] = []
         columns: List[dict] = []
-
+        user_mapping = self._get_user_mapping(users)
         for schema in schemas:
             t_to_add, c_to_add = self._tables_columns_of_schema(schema)
             t_with_owner = [
-                self._match_table_with_user(table,
+                self._match_table_with_user(table, user_mapping)
                 for table in t_to_add
             ]
             tables.extend(t_with_owner)
```
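`mapping_from_rows` comes from the new `castor_extractor/utils/collection.py` (see the file list above); its implementation is not part of this diff. The sketch below is an assumed version that matches the behaviour exercised by the tests that follow, where rows with a falsy key or a falsy value are skipped:

```python
from typing import Dict, List


def mapping_from_rows(rows: List[dict], key: str, value: str) -> Dict:
    # assumed behaviour: skip rows whose key or value is empty/None
    return {
        row[key]: row[value]
        for row in rows
        if row.get(key) and row.get(value)
    }


users = [
    {"id": "both", "email": "hello@world.com", "user_name": "hello world"},
    {"id": "", "email": "no@id.com", "user_name": "no id"},
]
print(mapping_from_rows(users, "email", "id"))  # {'hello@world.com': 'both'}
```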
castor_extractor/warehouse/databricks/client_test.py

```diff
@@ -66,15 +66,34 @@ def test_DatabricksClient__keep_catalog():
     assert not client._keep_catalog("something_unknown")
 
 
+def test_DatabricksClient__get_user_mapping():
+    client = MockDatabricksClient()
+    users = [
+        {"id": "both", "email": "hello@world.com", "user_name": "hello world"},
+        {"id": "no_email", "email": "", "user_name": "no email"},
+        {"id": "no_name", "email": "no@name.fr", "user_name": ""},
+        {"id": "no_both", "email": "", "user_name": ""},
+        {"id": "", "email": "no@id.com", "user_name": "no id"},
+    ]
+    expected = {
+        "hello@world.com": "both",
+        "hello world": "both",
+        "no@name.fr": "no_name",
+        "no email": "no_email",
+    }
+    mapping = client._get_user_mapping(users)
+    assert mapping == expected
+
+
 def test_DatabricksClient__match_table_with_user():
     client = MockDatabricksClient()
-
+    user_mapping = {"bob@castordoc.com": 3}
 
     table = {"id": 1, "owner_email": "bob@castordoc.com"}
-    table_with_owner = client._match_table_with_user(table,
+    table_with_owner = client._match_table_with_user(table, user_mapping)
 
     assert table_with_owner == {**table, "owner_external_id": 3}
 
     table_without_owner = {"id": 1, "owner_email": None}
-    actual = client._match_table_with_user(table_without_owner,
+    actual = client._match_table_with_user(table_without_owner, user_mapping)
     assert actual == table_without_owner
```
castor_extractor/warehouse/databricks/format.py

```diff
@@ -127,13 +127,17 @@ class DatabricksFormatter:
             return email["value"]
         return emails[0]["value"]
 
+    def _email(self, user: dict) -> Optional[str]:
+        emails = user.get("emails")
+        return self._primary(emails) if emails else None
+
     def format_user(self, raw_users: List[dict]) -> List[dict]:
         users = []
         for user in raw_users:
             users.append(
                 {
                     "id": user["id"],
-                    "email": self.
+                    "email": self._email(user),
                     "first_name": None,
                     "last_name": user.get("displayName") or user["userName"],
                     "user_name": user["userName"],
```
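`_email` is a small guard: Databricks user payloads (SCIM-style) do not always carry an `emails` list, and the helper returns `None` in that case instead of indexing into a missing key. A self-contained sketch; `_primary` below is a stand-in, since only its last two lines are visible in the hunk above:

```python
from typing import List, Optional


def _primary(emails: List[dict]) -> str:
    # stand-in consistent with the visible tail of DatabricksFormatter._primary
    for email in emails:
        if email.get("primary"):
            return email["value"]
    return emails[0]["value"]


def _email(user: dict) -> Optional[str]:
    emails = user.get("emails")
    return _primary(emails) if emails else None


print(_email({"emails": [{"value": "a@b.com", "primary": True}]}))  # a@b.com
print(_email({"userName": "no-email-user"}))  # None
```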
castor_extractor/warehouse/salesforce/client.py

```diff
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterator, List
+from typing import Dict, Iterator, List, Tuple
 
 from tqdm import tqdm  # type: ignore
 
@@ -96,17 +96,19 @@ class SalesforceClient(SalesforceBaseClient):
         """
         sobjects = self.fetch_sobjects()
         logger.info(f"Extracted {len(sobjects)} sobjects")
-        return self.formatter.tables(sobjects)
+        return list(self.formatter.tables(sobjects))
 
     def columns(
-        self, sobject_names: List[str], show_progress: bool = True
+        self, sobject_names: List[Tuple[str, str]], show_progress: bool = True
     ) -> List[dict]:
         """
         Get salesforce sobject fields as columns
         show_progress: optionally deactivate the tqdm progress bar
         """
         sobject_fields: Dict[str, List[dict]] = dict()
-        for
-
-
+        for api_name, table_name in tqdm(
+            sobject_names, disable=not show_progress
+        ):
+            fields = self.fetch_fields(api_name)
+            sobject_fields[table_name] = fields
         return self.formatter.columns(sobject_fields)
```
castor_extractor/warehouse/salesforce/extract.py

```diff
@@ -72,8 +72,8 @@ class SalesforceExtractionProcessor:
         catalog_locations[WarehouseAsset.TABLE.value] = location
         logger.info(f"Extracted {len(tables)} tables to {location}")
 
-
-        columns = self._client.columns(
+        sobject_names = [(t["api_name"], t["table_name"]) for t in tables]
+        columns = self._client.columns(sobject_names, show_progress)
         location = self._storage.put(WarehouseAsset.COLUMN.value, columns)
         catalog_locations[WarehouseAsset.COLUMN.value] = location
         logger.info(f"Extracted {len(columns)} columns to {location}")
```
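`columns` now receives `(api_name, table_name)` pairs: fields are fetched by the Salesforce `QualifiedApiName` but stored under the display name used for table payloads, so columns attach to the right table even after labels are deduplicated. A minimal sketch of that handshake, with `fetch_fields` stubbed:

```python
from typing import Dict, List, Tuple


def fetch_fields(api_name: str) -> List[dict]:
    # stub standing in for SalesforceClient.fetch_fields
    return [{"name": f"{api_name}__field"}]


def columns(sobject_names: List[Tuple[str, str]]) -> Dict[str, List[dict]]:
    sobject_fields: Dict[str, List[dict]] = dict()
    for api_name, table_name in sobject_names:
        # fetch by API name, key by display name
        sobject_fields[table_name] = fetch_fields(api_name)
    return sobject_fields


tables = [
    {"api_name": "a_one", "table_name": "a (a_one)"},
    {"api_name": "b", "table_name": "b"},
]
sobject_names = [(t["api_name"], t["table_name"]) for t in tables]
print(columns(sobject_names))
# {'a (a_one)': [{'name': 'a_one__field'}], 'b': [{'name': 'b__field'}]}
```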
castor_extractor/warehouse/salesforce/format.py

```diff
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, Iterator, List
 
 from .constants import SCHEMA_NAME
 
@@ -35,17 +35,35 @@ def _to_column_payload(field: dict, position: int, table_name: str) -> dict:
     }
 
 
-def _to_table_payload(
+def _to_table_payload(sobject: dict, table_name: str) -> dict:
     return {
-        "id":
+        "id": table_name,
+        "api_name": sobject["QualifiedApiName"],
+        "label": sobject["Label"],
         "schema_id": SCHEMA_NAME,
-        "table_name":
+        "table_name": table_name,
         "description": "",
         "tags": [],
         "type": "TABLE",
     }
 
 
+def _merge_label_and_api_name(sobject: dict) -> dict:
+    label = sobject["Label"]
+    api_name = sobject["QualifiedApiName"]
+    table_name = f"{label} ({api_name})"
+    return _to_table_payload(sobject, table_name)
+
+
+def _by_label(sobjects: List[dict]) -> Dict[str, List[dict]]:
+    by_label: Dict[str, List[dict]] = dict()
+    for sobject in sobjects:
+        label = sobject["Label"]
+        similar_sobjects = by_label.setdefault(label, [])
+        similar_sobjects.append(sobject)
+    return by_label
+
+
 class SalesforceFormatter:
     """
     Helper functions that format the response in the format to be exported as
@@ -53,9 +71,18 @@ class SalesforceFormatter:
     """
 
     @staticmethod
-    def tables(sobjects: List[dict]) ->
-        """
-
+    def tables(sobjects: List[dict]) -> Iterator[dict]:
+        """
+        formats the raw list of sobjects to tables
+        if two tables share the same label, then we add the api name as well
+        """
+        by_label = _by_label(sobjects)
+        for label, similars in by_label.items():
+            if len(similars) > 1:
+                yield from [_merge_label_and_api_name(s) for s in similars]
+            else:
+                sobject = similars[0]  # unique sobject on label
+                yield _to_table_payload(sobject, label)
 
     @staticmethod
     def columns(sobject_fields: Dict[str, List[dict]]) -> List[dict]:
```
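Note that `tables` is now a generator (`Iterator[dict]`), which is why `SalesforceClient.tables` wraps it in `list(...)` above: a generator yields its items once and is then exhausted. A tiny illustration:

```python
def numbers():
    yield from (1, 2, 3)


gen = numbers()
print(list(gen))  # [1, 2, 3]
print(list(gen))  # [] -- exhausted; hence list(...) in SalesforceClient.tables
```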
castor_extractor/warehouse/salesforce/format_test.py

```diff
@@ -1,4 +1,21 @@
-from
+from typing import Dict, Tuple
+
+from .format import (
+    SCHEMA_NAME,
+    SalesforceFormatter,
+    _by_label,
+    _field_description,
+    _merge_label_and_api_name,
+)
+
+
+def _example_sobjects() -> Tuple[Dict[str, str], ...]:
+    """Returns 4 sobjects with 2 sharing the same label"""
+    a = {"Label": "a", "QualifiedApiName": "a_one"}
+    b = {"Label": "b", "QualifiedApiName": "b"}
+    c = {"Label": "c", "QualifiedApiName": "c"}
+    a_prime = {"Label": "a", "QualifiedApiName": "a_two"}
+    return a, b, c, a_prime
 
 
 def test__field_description():
@@ -30,3 +47,34 @@ def test__field_description():
         "- Data Sensitivity Level: bam"
     )
     assert description == expected
+
+
+def test__merge_label_and_api_name():
+    sobject = {"Label": "foo", "QualifiedApiName": "bar"}
+    payload = _merge_label_and_api_name(sobject)
+    expected_name = "foo (bar)"
+    assert payload == {
+        "id": expected_name,
+        "api_name": "bar",
+        "label": "foo",
+        "schema_id": SCHEMA_NAME,
+        "table_name": expected_name,
+        "description": "",
+        "tags": [],
+        "type": "TABLE",
+    }
+
+
+def test__by_label():
+    a, b, c, a_prime = _example_sobjects()
+    sobjects = [a, b, c, a_prime]
+    by_label = _by_label(sobjects)
+    assert by_label == {"a": [a, a_prime], "b": [b], "c": [c]}
+
+
+def test_salesforce_formatter_tables():
+    sobjects = [*_example_sobjects()]
+    tables = SalesforceFormatter.tables(sobjects)
+    expected_names = {"a (a_one)", "a (a_two)", "b", "c"}
+    payload_names = {t["table_name"] for t in tables}
+    assert payload_names == expected_names
```
{castor_extractor-0.16.6.dist-info → castor_extractor-0.16.11.dist-info}/METADATA

```diff
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.16.6
+Version: 0.16.11
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
 Author: Castor
 Author-email: support@castordoc.com
-Requires-Python: >=3.8,<3.12
+Requires-Python: >=3.8,<3.13
 Classifier: License :: Other/Proprietary License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: all
 Provides-Extra: bigquery
 Provides-Extra: dbt
@@ -34,8 +35,10 @@ Requires-Dist: google-cloud-core (>=2.1.0,<3.0.0)
 Requires-Dist: google-cloud-storage (>=2,<3)
 Requires-Dist: google-resumable-media (>=2.0.3,<3.0.0)
 Requires-Dist: googleapis-common-protos (>=1.53.0,<2.0.0)
-Requires-Dist: looker-sdk (>=
+Requires-Dist: looker-sdk (>=23.0.0) ; extra == "looker" or extra == "all"
 Requires-Dist: msal (>=1.20.0,<2.0.0) ; extra == "powerbi" or extra == "all"
+Requires-Dist: numpy (<1.25) ; python_version >= "3.8" and python_version < "3.9"
+Requires-Dist: numpy (>=1.26,<2) ; python_version >= "3.12" and python_version < "3.13"
 Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0) ; extra == "metabase" or extra == "postgres" or extra == "redshift" or extra == "all"
 Requires-Dist: pycryptodome (>=3.0.0,<4.0.0) ; extra == "metabase" or extra == "all"
 Requires-Dist: pydantic (>=2.6,<3.0)
```