castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of castor-extractor might be problematic.
- CHANGELOG.md +4 -0
- castor_extractor/commands/__init__.py +0 -3
- castor_extractor/commands/file_check.py +1 -2
- castor_extractor/file_checker/column.py +5 -5
- castor_extractor/file_checker/file.py +7 -7
- castor_extractor/file_checker/file_test.py +2 -2
- castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
- castor_extractor/knowledge/confluence/client/client.py +2 -1
- castor_extractor/knowledge/confluence/extract.py +3 -2
- castor_extractor/knowledge/notion/client/client.py +3 -2
- castor_extractor/knowledge/notion/extract.py +3 -2
- castor_extractor/quality/soda/client/client.py +2 -1
- castor_extractor/quality/soda/client/pagination.py +1 -3
- castor_extractor/types.py +3 -3
- castor_extractor/uploader/env.py +2 -2
- castor_extractor/uploader/upload.py +4 -3
- castor_extractor/uploader/utils.py +1 -1
- castor_extractor/utils/client/abstract.py +2 -1
- castor_extractor/utils/client/api/auth.py +2 -2
- castor_extractor/utils/client/api/auth_test.py +2 -2
- castor_extractor/utils/client/api/client.py +3 -3
- castor_extractor/utils/client/api/pagination.py +3 -2
- castor_extractor/utils/client/api/safe_request.py +5 -5
- castor_extractor/utils/collection.py +7 -11
- castor_extractor/utils/dbt/client.py +3 -3
- castor_extractor/utils/dbt/client_test.py +2 -2
- castor_extractor/utils/deprecate.py +1 -2
- castor_extractor/utils/files.py +5 -5
- castor_extractor/utils/formatter.py +5 -4
- castor_extractor/utils/json_stream_write.py +2 -1
- castor_extractor/utils/object.py +2 -1
- castor_extractor/utils/pager/pager.py +2 -4
- castor_extractor/utils/pager/pager_on_id.py +2 -1
- castor_extractor/utils/pager/pager_on_id_test.py +5 -5
- castor_extractor/utils/pager/pager_test.py +3 -3
- castor_extractor/utils/retry.py +4 -3
- castor_extractor/utils/retry_test.py +2 -3
- castor_extractor/utils/safe.py +3 -3
- castor_extractor/utils/salesforce/client.py +2 -1
- castor_extractor/utils/salesforce/credentials.py +1 -3
- castor_extractor/utils/store.py +2 -1
- castor_extractor/utils/string.py +2 -2
- castor_extractor/utils/string_test.py +1 -3
- castor_extractor/utils/type.py +3 -2
- castor_extractor/utils/validation.py +4 -4
- castor_extractor/utils/write.py +2 -2
- castor_extractor/visualization/domo/client/client.py +8 -7
- castor_extractor/visualization/domo/client/credentials.py +2 -2
- castor_extractor/visualization/domo/client/endpoints.py +2 -2
- castor_extractor/visualization/domo/extract.py +3 -2
- castor_extractor/visualization/looker/api/client.py +17 -16
- castor_extractor/visualization/looker/api/utils.py +2 -2
- castor_extractor/visualization/looker/assets.py +1 -3
- castor_extractor/visualization/looker/extract.py +4 -3
- castor_extractor/visualization/looker/fields.py +3 -3
- castor_extractor/visualization/looker/multithreading.py +3 -3
- castor_extractor/visualization/metabase/assets.py +1 -3
- castor_extractor/visualization/metabase/client/api/client.py +8 -7
- castor_extractor/visualization/metabase/extract.py +3 -2
- castor_extractor/visualization/metabase/types.py +1 -3
- castor_extractor/visualization/mode/client/client.py +6 -6
- castor_extractor/visualization/mode/extract.py +2 -2
- castor_extractor/visualization/powerbi/assets.py +1 -3
- castor_extractor/visualization/powerbi/client/client.py +12 -11
- castor_extractor/visualization/powerbi/client/credentials.py +3 -3
- castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
- castor_extractor/visualization/powerbi/extract.py +3 -2
- castor_extractor/visualization/qlik/assets.py +1 -3
- castor_extractor/visualization/qlik/client/constants.py +1 -3
- castor_extractor/visualization/qlik/client/engine/error.py +1 -3
- castor_extractor/visualization/qlik/client/master.py +3 -3
- castor_extractor/visualization/qlik/client/rest.py +12 -12
- castor_extractor/visualization/qlik/extract.py +4 -3
- castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
- castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
- castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
- castor_extractor/visualization/sigma/client/client.py +9 -8
- castor_extractor/visualization/sigma/client/credentials.py +1 -3
- castor_extractor/visualization/sigma/extract.py +3 -2
- castor_extractor/visualization/tableau/assets.py +1 -2
- castor_extractor/visualization/tableau/client/client.py +1 -2
- castor_extractor/visualization/tableau/client/client_utils.py +3 -2
- castor_extractor/visualization/tableau/client/credentials.py +3 -3
- castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
- castor_extractor/visualization/tableau/extract.py +2 -2
- castor_extractor/visualization/tableau/gql_fields.py +3 -3
- castor_extractor/visualization/tableau/tsc_fields.py +1 -2
- castor_extractor/visualization/tableau/types.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
- castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
- castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
- castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
- castor_extractor/visualization/tableau_revamp/extract.py +2 -2
- castor_extractor/visualization/thoughtspot/client/client.py +3 -2
- castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
- castor_extractor/visualization/thoughtspot/extract.py +3 -2
- castor_extractor/warehouse/abstract/asset.py +4 -5
- castor_extractor/warehouse/abstract/extract.py +4 -3
- castor_extractor/warehouse/abstract/query.py +4 -4
- castor_extractor/warehouse/bigquery/client.py +8 -8
- castor_extractor/warehouse/bigquery/extract.py +1 -1
- castor_extractor/warehouse/bigquery/query.py +2 -2
- castor_extractor/warehouse/bigquery/types.py +2 -4
- castor_extractor/warehouse/databricks/api_client.py +15 -14
- castor_extractor/warehouse/databricks/client.py +16 -16
- castor_extractor/warehouse/databricks/extract.py +4 -4
- castor_extractor/warehouse/databricks/format.py +12 -12
- castor_extractor/warehouse/databricks/lineage.py +11 -11
- castor_extractor/warehouse/databricks/pagination.py +2 -2
- castor_extractor/warehouse/databricks/types.py +4 -4
- castor_extractor/warehouse/databricks/utils.py +5 -4
- castor_extractor/warehouse/mysql/query.py +2 -2
- castor_extractor/warehouse/postgres/query.py +2 -2
- castor_extractor/warehouse/redshift/client.py +1 -1
- castor_extractor/warehouse/redshift/query.py +2 -2
- castor_extractor/warehouse/salesforce/client.py +8 -8
- castor_extractor/warehouse/salesforce/extract.py +3 -4
- castor_extractor/warehouse/salesforce/format.py +8 -7
- castor_extractor/warehouse/salesforce/format_test.py +2 -4
- castor_extractor/warehouse/snowflake/query.py +5 -5
- castor_extractor/warehouse/sqlserver/client.py +1 -1
- castor_extractor/warehouse/sqlserver/query.py +2 -2
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/METADATA +7 -6
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/RECORD +128 -128
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/WHEEL +0 -0
- {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/entry_points.txt +0 -0
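
Nearly every hunk below makes the same mechanical change: since 0.22.0 drops Python 3.8 (see the changelog at the bottom of METADATA), the `Dict`, `List`, `Tuple` and `Set` aliases from `typing` can be replaced by the builtin generics standardized in PEP 585 (Python 3.9+), and `Iterable`/`Iterator` move to `collections.abc`. A minimal sketch of the before/after pattern — `Paths` and `TimestampedLink` are aliases taken from the hunks below, while `format_rows` is a made-up function for illustration:

    # Python 3.8 style: generic aliases must be imported from typing
    from typing import Dict, List, Optional, Tuple

    PathsOld = Dict[str, str]
    TimestampedLinkOld = Tuple[str, str, Optional[str]]

    def format_rows_old(rows: List[dict]) -> List[dict]:
        return [dict(row) for row in rows]

    # Python 3.9+ (PEP 585): the builtin types are subscriptable directly,
    # so only Optional still needs to come from typing
    from typing import Optional

    Paths = dict[str, str]
    TimestampedLink = tuple[str, str, Optional[str]]

    def format_rows(rows: list[dict]) -> list[dict]:
        return [dict(row) for row in rows]
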
castor_extractor/warehouse/databricks/extract.py

@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import Optional
 
 from ...utils import AbstractStorage, LocalStorage, write_summary
 from ..abstract import (
@@ -29,7 +29,7 @@ DATABRICKS_ASSETS: SupportedAssets = {
 logger = logging.getLogger(__name__)
 
 OTimeFilter = Optional[TimeFilter]
-Paths =
+Paths = dict[str, str]
 
 
 class DatabricksExtractionProcessor:
@@ -71,7 +71,7 @@ class DatabricksExtractionProcessor:
         if self._should_not_reextract(WarehouseAssetGroup.CATALOG):
             return self._existing_group_paths(WarehouseAssetGroup.CATALOG)
 
-        catalog_locations:
+        catalog_locations: dict[str, str] = dict()
         databases = self._client.databases()
         location = self._storage.put(WarehouseAsset.DATABASE.value, databases)
         catalog_locations[WarehouseAsset.DATABASE.value] = location
@@ -101,7 +101,7 @@ class DatabricksExtractionProcessor:
             return self._existing_group_paths(
                 WarehouseAssetGroup.ADDITIONAL_LINEAGE
             )
-        lineage_locations:
+        lineage_locations: dict[str, str] = dict()
 
         # extract catalog
         databases = self._client.databases()
castor_extractor/warehouse/databricks/format.py

@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import
+from typing import Optional
 
 from .types import TablesColumns
 from .utils import build_path
@@ -12,7 +12,7 @@ EXCLUDED_SCHEMAS = {"information_schema", "default"}
 
 TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"
 
-TagMapping =
+TagMapping = dict[str, list[str]]
 
 
 def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
@@ -87,7 +87,7 @@ class DatabricksFormatter:
     """
 
     @staticmethod
-    def format_database(raw_databases:
+    def format_database(raw_databases: list[dict]) -> list[dict]:
         databases = []
         for catalog in raw_databases:
             name = catalog["name"]
@@ -101,7 +101,7 @@ class DatabricksFormatter:
         return databases
 
     @staticmethod
-    def format_schema(raw_schemas:
+    def format_schema(raw_schemas: list[dict], database: dict) -> list[dict]:
         schemas = []
         for schema in raw_schemas:
             if schema["name"] in EXCLUDED_SCHEMAS:
@@ -118,7 +118,7 @@ class DatabricksFormatter:
 
     @staticmethod
     def format_table_column(
-        raw_tables:
+        raw_tables: list[dict],
         schema: dict,
         host: str,
         workspace_id: str,
@@ -141,8 +141,8 @@ class DatabricksFormatter:
         return tables, columns
 
     @staticmethod
-    def format_lineage(timestamps: dict) ->
-        lineage:
+    def format_lineage(timestamps: dict) -> list[dict]:
+        lineage: list[dict] = []
         for link, timestamp in timestamps.items():
             parent_path, child_path = link
             link_ = {
@@ -154,7 +154,7 @@ class DatabricksFormatter:
         return lineage
 
     @staticmethod
-    def format_query(raw_queries:
+    def format_query(raw_queries: list[dict]) -> list[dict]:
         queries = []
         for q in raw_queries:
             if not q["query_text"]:
@@ -176,7 +176,7 @@ class DatabricksFormatter:
         return queries
 
     @staticmethod
-    def _primary(emails:
+    def _primary(emails: list[dict]) -> Optional[str]:
         """helper function to select a unique email"""
         if not emails:
             return None
@@ -189,7 +189,7 @@ class DatabricksFormatter:
         emails = user.get("emails")
         return self._primary(emails) if emails else None
 
-    def format_user(self, raw_users:
+    def format_user(self, raw_users: list[dict]) -> list[dict]:
         users = []
         for user in raw_users:
             users.append(
@@ -204,8 +204,8 @@ class DatabricksFormatter:
         return users
 
     @staticmethod
-    def format_view_ddl(tables:
-        view_ddl:
+    def format_view_ddl(tables: list[dict], schema: dict) -> list[dict]:
+        view_ddl: list[dict] = []
         if not tables:
             return view_ddl
         for table in tables:
castor_extractor/warehouse/databricks/lineage.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import cast
 
 from .types import Link, Ostr, OTimestampedLink, TimestampedLink
 
@@ -9,7 +9,7 @@ class LineageLinks:
     """
 
     def __init__(self):
-        self.lineage:
+        self.lineage: dict[Link, Ostr] = dict()
 
     def add(self, timestamped_link: TimestampedLink) -> None:
         """
@@ -52,7 +52,7 @@ def _link(path_from: Ostr, path_to: Ostr, timestamp: Ostr) -> OTimestampedLink:
 
 def single_table_lineage_links(
     table_path: str, single_table_lineage: dict
-) ->
+) -> list[TimestampedLink]:
     """
     process databricks lineage API response for a given table
     returns a list of (parent, child, timestamp)
@@ -60,7 +60,7 @@ def single_table_lineage_links(
     Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
     we could also have `notebookInfos` or `fileInfo`
     """
-    links:
+    links: list[OTimestampedLink] = []
     # add parent:
     for link in single_table_lineage.get("upstreams", []):
         parent = link.get("tableInfo", {})
@@ -80,7 +80,7 @@ def single_table_lineage_links(
 
 def single_column_lineage_links(
     column_path: str, single_column_lineage: dict
-) ->
+) -> list[TimestampedLink]:
     """
     process databricks lineage API response for a given table
     returns a list of (parent, child, timestamp)
@@ -88,7 +88,7 @@ def single_column_lineage_links(
     Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
     we could also have `notebookInfos` or `fileInfo`
     """
-    links:
+    links: list[OTimestampedLink] = []
     # add parent:
     for link in single_column_lineage.get("upstream_cols", []):
         parent_path = _to_column_path(link)
@@ -105,8 +105,8 @@
 
 
 def paths_for_column_lineage(
-    tables:
-) ->
+    tables: list[dict], columns: list[dict], table_lineage: list[dict]
+) -> list[tuple[str, str]]:
     """
     helper providing a list of candidate columns to look lineage for:
     we only look for column lineage where there is table lineage
@@ -118,12 +118,12 @@ def paths_for_column_lineage(
         for table in tables
     }
 
-    tables_with_lineage:
+    tables_with_lineage: set[str] = set()
     for t in table_lineage:
         tables_with_lineage.add(t["parent_path"])
         tables_with_lineage.add(t["child_path"])
 
-    paths_to_return:
+    paths_to_return: list[tuple[str, str]] = []
     for column in columns:
         table_path = mapping[column["table_id"]]
         if table_path not in tables_with_lineage:
@@ -134,7 +134,7 @@ def paths_for_column_lineage(
     return paths_to_return
 
 
-def deduplicate_lineage(lineages:
+def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
    deduplicated_lineage = LineageLinks()
    for timestamped_link in lineages:
        deduplicated_lineage.add(timestamped_link)
castor_extractor/warehouse/databricks/pagination.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from pydantic import Field
 
@@ -10,7 +10,7 @@ DATABRICKS_PAGE_SIZE = 100
 class DatabricksPagination(PaginationModel):
     next_page_token: Optional[str] = None
     has_next_page: bool = False
-    res:
+    res: list[dict] = Field(default_factory=list)
 
     def is_last(self) -> bool:
         return not (self.has_next_page and self.next_page_token)
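
Aside from the typing change, the `res` line above shows the usual pydantic idiom for mutable defaults: `Field(default_factory=list)` gives every model instance its own fresh list instead of one shared class-level list. A minimal standalone sketch — `Page` is a made-up stand-in for the package's `PaginationModel`:

    from typing import Optional

    from pydantic import BaseModel, Field

    class Page(BaseModel):
        next_page_token: Optional[str] = None
        has_next_page: bool = False
        res: list[dict] = Field(default_factory=list)  # fresh list per instance

    a, b = Page(), Page()
    a.res.append({"id": 1})
    assert b.res == []  # `b` did not inherit `a`'s list
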
castor_extractor/warehouse/databricks/types.py

@@ -1,8 +1,8 @@
-from typing import
+from typing import Optional
 
-Link =
-TablesColumns =
+Link = tuple[str, str]
+TablesColumns = tuple[list[dict], list[dict]]
 Ostr = Optional[str]
-TimestampedLink =
+TimestampedLink = tuple[str, str, Ostr]
 
 OTimestampedLink = Optional[TimestampedLink]
castor_extractor/warehouse/databricks/utils.py

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from datetime import date
-from typing import
+from typing import Optional
 
 from ...utils import at_midnight
 from ..abstract import TimeFilter
@@ -14,8 +15,8 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
 
 
 def build_path(
-    row:
-    keys:
+    row: dict,
+    keys: list[str],
 ) -> str:
     """
     format an asset's path:
@@ -26,7 +27,7 @@ def build_path(
     return ".".join(key_values)
 
 
-def tag_label(row:
+def tag_label(row: dict) -> str:
     """
     format the tag's label:
     - {key:value} when the value is not empty
castor_extractor/warehouse/mysql/query.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from ..abstract import (
     AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class MySQLQueryBuilder(AbstractQueryBuilder):
     ):
         super().__init__(time_filter=time_filter)
 
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
         return [query]
castor_extractor/warehouse/postgres/query.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from ..abstract import (
     AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class PostgresQueryBuilder(AbstractQueryBuilder):
     ):
         super().__init__(time_filter=time_filter)
 
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
         return [query]
castor_extractor/warehouse/redshift/query.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from ..abstract import (
     AbstractQueryBuilder,
@@ -27,7 +27,7 @@ class RedshiftQueryBuilder(AbstractQueryBuilder):
         params = self._time_filter.to_dict()
         return ExtractionQuery(statement, params)
 
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         if asset == WarehouseAsset.QUERY and self.is_serverless:
             query = self.build_query_serverless()
         else:
castor_extractor/warehouse/salesforce/client.py

@@ -1,6 +1,6 @@
 import logging
 from functools import partial
-from typing import
+from typing import Optional
 
 from tqdm import tqdm  # type: ignore
 
@@ -29,7 +29,7 @@ class SalesforceClient(SalesforceBaseClient):
     def name() -> str:
         return "Salesforce"
 
-    def fetch_sobjects(self) ->
+    def fetch_sobjects(self) -> list[dict]:
         """Fetch all sobjects"""
         logger.info("Extracting sobjects")
         query = format_sobject_query()
@@ -39,7 +39,7 @@ class SalesforceClient(SalesforceBaseClient):
         results = fetch_all_pages(request_, SalesforceSQLPagination)
         return list(results)
 
-    def fetch_fields(self, sobject_name: str) ->
+    def fetch_fields(self, sobject_name: str) -> list[dict]:
         """Fetches fields of a given sobject"""
         query = SOBJECT_FIELDS_QUERY_TPL.format(
             entity_definition_id=sobject_name
@@ -55,7 +55,7 @@ class SalesforceClient(SalesforceBaseClient):
             return None
         return response["records"][0]["Description"]
 
-    def add_table_descriptions(self, sobjects:
+    def add_table_descriptions(self, sobjects: list[dict]) -> list[dict]:
         """
         Add table descriptions.
         We use the tooling API which does not handle well the LIMIT in SOQL
@@ -67,7 +67,7 @@ class SalesforceClient(SalesforceBaseClient):
             described_sobjects.append({**sobject, "Description": description})
         return described_sobjects
 
-    def tables(self) ->
+    def tables(self) -> list[dict]:
         """
         Get Salesforce sobjects as tables
         """
@@ -77,13 +77,13 @@ class SalesforceClient(SalesforceBaseClient):
         return list(self.formatter.tables(described_sobjects))
 
     def columns(
-        self, sobject_names:
-    ) ->
+        self, sobject_names: list[tuple[str, str]], show_progress: bool = True
+    ) -> list[dict]:
         """
         Get salesforce sobject fields as columns
         show_progress: optionally deactivate the tqdm progress bar
         """
-        sobject_fields:
+        sobject_fields: dict[str, list[dict]] = dict()
         for api_name, table_name in tqdm(
             sobject_names, disable=not show_progress
         ):
castor_extractor/warehouse/salesforce/extract.py

@@ -1,5 +1,4 @@
 import logging
-from typing import Dict, List, Tuple
 
 from ...utils import AbstractStorage, LocalStorage, write_summary
 from ...utils.salesforce import SalesforceCredentials
@@ -14,9 +13,9 @@ from .client import SalesforceClient
 logger = logging.getLogger(__name__)
 
 
-Paths =
+Paths = dict[str, str]
 
-SALESFORCE_CATALOG_ASSETS:
+SALESFORCE_CATALOG_ASSETS: tuple[WarehouseAsset, ...] = (
     WarehouseAsset.TABLE,
     WarehouseAsset.COLUMN,
 )
@@ -81,7 +80,7 @@ class SalesforceExtractionProcessor:
 
     def extract_role(self) -> Paths:
         """extract no users and return the empty file location"""
-        users:
+        users: list[dict] = []
         location = self._storage.put(WarehouseAsset.USER.value, users)
         logger.info(f"Extracted {len(users)} users to {location}")
         return {WarehouseAsset.USER.value: location}
castor_extractor/warehouse/salesforce/format.py

@@ -1,4 +1,5 @@
-from
+from collections.abc import Iterator
+from typing import Any
 
 from ...utils import group_by
 from .constants import SCHEMA_NAME
@@ -25,10 +26,10 @@ def _name(sobject: dict) -> str:
     return f"{label} ({api_name})"
 
 
-def _field_description(field:
-    context:
+def _field_description(field: dict[str, Any]) -> str:
+    context: dict[str, str] = {}
 
-    field_definition:
+    field_definition: dict[str, str] = field.get("FieldDefinition") or {}
     if description := field_definition.get("Description"):
         context["Description"] = _clean(description)
     if help_text := field.get("InlineHelpText"):
@@ -69,7 +70,7 @@ def _to_table_payload(sobject: dict) -> dict:
     }
 
 
-def _detect_duplicates(sobjects:
+def _detect_duplicates(sobjects: list[dict]) -> list[dict]:
     """
     enrich the given data with "has_duplicate" flag:
     - True when another asset has the same Label in the list
@@ -89,7 +90,7 @@ class SalesforceFormatter:
     """
 
     @staticmethod
-    def tables(sobjects:
+    def tables(sobjects: list[dict]) -> Iterator[dict]:
         """
         formats the raw list of sobjects to tables
         """
@@ -98,7 +99,7 @@ class SalesforceFormatter:
             yield _to_table_payload(sobject)
 
     @staticmethod
-    def columns(sobject_fields:
+    def columns(sobject_fields: dict[str, list[dict]]) -> Iterator[dict]:
         """formats the raw list of sobject fields to columns"""
         for table_name, fields in sobject_fields.items():
             fields = _detect_duplicates(fields)
castor_extractor/warehouse/salesforce/format_test.py

@@ -1,5 +1,3 @@
-from typing import Dict, List, Tuple
-
 from .format import (
     _HAS_DUPLICATE_KEY,
     SalesforceFormatter,
@@ -9,7 +7,7 @@ from .format import (
 )
 
 
-def _tables_sobjects() ->
+def _tables_sobjects() -> tuple[dict[str, str], ...]:
     """Returns 4 sobjects with 2 sharing the same label"""
     a = {"Label": "a", "QualifiedApiName": "a_one"}
     b = {"Label": "b", "QualifiedApiName": "b"}
@@ -18,7 +16,7 @@ def _tables_sobjects() -> Tuple[Dict[str, str], ...]:
     return a, b, c, a_prime
 
 
-def _columns_sobjects() ->
+def _columns_sobjects() -> dict[str, list[dict]]:
     a = {"Label": "First Name", "QualifiedApiName": "owner_name"}
     b = {"Label": "First Name", "QualifiedApiName": "editor_name"}
     c = {"Label": "Foo Bar", "QualifiedApiName": "foo_bar"}
castor_extractor/warehouse/snowflake/query.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from ..abstract import (
     CATALOG_ASSETS,
@@ -14,7 +14,7 @@ DB_FILTERED_ASSETS = (
 )
 
 
-def _database_filter(db_list: Optional[
+def _database_filter(db_list: Optional[list[str]], allow: bool) -> str:
     if not db_list:
         return ""
     keyword = "IN" if allow else "NOT IN"
@@ -34,8 +34,8 @@ class SnowflakeQueryBuilder(AbstractQueryBuilder):
     def __init__(
         self,
         time_filter: Optional[TimeFilter] = None,
-        db_allowed: Optional[
-        db_blocked: Optional[
+        db_allowed: Optional[list[str]] = None,
+        db_blocked: Optional[list[str]] = None,
         fetch_transient: Optional[bool] = False,
     ):
         super().__init__(time_filter=time_filter)
@@ -52,7 +52,7 @@ class SnowflakeQueryBuilder(AbstractQueryBuilder):
 
         return statement
 
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
 
         if asset in DB_FILTERED_ASSETS:
castor_extractor/warehouse/sqlserver/query.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional
 
 from ..abstract import (
     AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class MSSQLQueryBuilder(AbstractQueryBuilder):
     ):
         super().__init__(time_filter=time_filter)
 
-    def build(self, asset: WarehouseAsset) ->
+    def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
         query = self.build_default(asset)
         return [query]
castor_extractor-0.22.0.dist-info/METADATA

@@ -1,16 +1,15 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.21.9
+Version: 0.22.0
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
 Author: Castor
 Author-email: support@castordoc.com
-Requires-Python: >=3.8,<3.13
+Requires-Python: >=3.9,<3.13
 Classifier: License :: Other/Proprietary License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -39,10 +38,8 @@ Requires-Dist: google-resumable-media (>=2.0.3,<3.0.0)
 Requires-Dist: googleapis-common-protos (>=1.53.0,<2.0.0)
 Requires-Dist: looker-sdk (>=24.16.0,<24.17.0) ; extra == "looker" or extra == "all"
 Requires-Dist: msal (>=1.20.0,<2.0.0) ; extra == "powerbi" or extra == "all"
-Requires-Dist: numpy (<1.25) ; (python_version >= "3.8" and python_version < "3.9") and (extra == "bigquery" or extra == "databricks" or extra == "all")
 Requires-Dist: numpy (<2) ; extra == "bigquery" or extra == "databricks" or extra == "all"
 Requires-Dist: numpy (>=1.26) ; (python_version >= "3.12" and python_version < "3.13") and (extra == "bigquery" or extra == "databricks" or extra == "all")
-Requires-Dist: pandas (<2.1) ; (python_version >= "3.8" and python_version < "3.9") and (extra == "databricks" or extra == "all")
 Requires-Dist: pandas (>=2.1) ; (python_version >= "3.12" and python_version < "3.13") and (extra == "databricks" or extra == "all")
 Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0) ; extra == "metabase" or extra == "postgres" or extra == "redshift" or extra == "all"
 Requires-Dist: pycryptodome (>=3.0.0,<4.0.0) ; extra == "metabase" or extra == "all"
@@ -52,7 +49,7 @@ Requires-Dist: pymssql (>=2.2.11,<3.0.0) ; extra == "sqlserver" or extra == "all"
 Requires-Dist: pymysql[rsa] (>=1.1.0,<2.0.0) ; extra == "mysql" or extra == "all"
 Requires-Dist: python-dateutil (>=2.0.0,<=3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
-Requires-Dist: setuptools (>=75.
+Requires-Dist: setuptools (>=75.6)
 Requires-Dist: snowflake-connector-python (>=3.4.0,<4.0.0) ; extra == "snowflake" or extra == "all"
 Requires-Dist: snowflake-sqlalchemy (!=1.2.5,<2.0.0) ; extra == "snowflake" or extra == "all"
 Requires-Dist: sqlalchemy (>=1.4,<1.5)
@@ -208,6 +205,10 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:support@castordoc.com)
 
 # Changelog
 
+## 0.22.0 - 2024-12-04
+
+* Stop supporting python3.8
+
 ## 0.21.9 - 2024-12-04
 
 * Tableau: fix handling of timeout retry
|