acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/pulsar.py +2 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +65 -37
- datahub/ingestion/source/tableau/tableau.py +3 -6
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/lite/duckdb_lite.py +5 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/sdk/dataset.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/metabase.py
CHANGED

@@ -69,9 +69,19 @@ class MetabaseConfig(DatasetLineageProviderConfigBase, StatefulIngestionConfigBa
         default=None,
         description="optional URL to use in links (if `connect_uri` is only for ingestion)",
     )
-    username: Optional[str] = Field(…
+    username: Optional[str] = Field(
+        default=None,
+        description="Metabase username, used when an API key is not provided.",
+    )
     password: Optional[pydantic.SecretStr] = Field(
-        default=None,…
+        default=None,
+        description="Metabase password, used when an API key is not provided.",
+    )
+
+    # https://www.metabase.com/learn/metabase-basics/administration/administration-and-operation/metabase-api#example-get-request
+    api_key: Optional[pydantic.SecretStr] = Field(
+        default=None,
+        description="Metabase API key. If provided, the username and password will be ignored. Recommended method.",
     )
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
@@ -178,30 +188,40 @@ class MetabaseSource(StatefulIngestionSourceBase):
         self.source_config: MetabaseConfig = config
 
     def setup_session(self) -> None:
-        login_response = requests.post(
-            f"{self.config.connect_uri}/api/session",
-            None,
-            {
-                "username": self.config.username,
-                "password": (
-                    self.config.password.get_secret_value()
-                    if self.config.password
-                    else None
-                ),
-            },
-        )
+        self.session = requests.session()
+        if self.config.api_key:
+            self.session.headers.update(
+                {
+                    "x-api-key": self.config.api_key.get_secret_value(),
+                    "Content-Type": "application/json",
+                    "Accept": "*/*",
+                }
+            )
+        else:
+            # If no API key is provided, generate a session token using username and password.
+            login_response = requests.post(
+                f"{self.config.connect_uri}/api/session",
+                None,
+                {
+                    "username": self.config.username,
+                    "password": (
+                        self.config.password.get_secret_value()
+                        if self.config.password
+                        else None
+                    ),
+                },
+            )
 
-        login_response.raise_for_status()
-        self.access_token = login_response.json().get("id", "")
+            login_response.raise_for_status()
+            self.access_token = login_response.json().get("id", "")
 
-        self.session = requests.session()
-        self.session.headers.update(
-            {
-                "X-Metabase-Session": f"{self.access_token}",
-                "Content-Type": "application/json",
-                "Accept": "*/*",
-            }
-        )
+            self.session.headers.update(
+                {
+                    "X-Metabase-Session": f"{self.access_token}",
+                    "Content-Type": "application/json",
+                    "Accept": "*/*",
+                }
+            )
 
         # Test the connection
         try:
@@ -217,15 +237,17 @@ class MetabaseSource(StatefulIngestionSourceBase):
         )
 
     def close(self) -> None:
-        response = requests.delete(
-            f"{self.config.connect_uri}/api/session",
-            headers={"X-Metabase-Session": self.access_token},
-        )
-        if response.status_code not in (200, 204):
-            self.report.report_failure(
-                title="Unable to Log User Out",
-                message=f"Unable to logout for user {self.config.username}",
+        # API key authentication does not require session closure.
+        if not self.config.api_key:
+            response = requests.delete(
+                f"{self.config.connect_uri}/api/session",
+                headers={"X-Metabase-Session": self.access_token},
             )
+            if response.status_code not in (200, 204):
+                self.report.report_failure(
+                    title="Unable to Log User Out",
+                    message=f"Unable to logout for user {self.config.username}",
+                )
         super().close()
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
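Taken together, the three metabase.py hunks add API-key authentication alongside the existing username/password session flow: a static `x-api-key` header when `api_key` is set, otherwise a session token obtained from `POST /api/session` and sent as `X-Metabase-Session` (and deleted again on close). A minimal sketch of the two modes, using only the endpoints and headers visible in the hunks (host and credentials are placeholders):

import requests

METABASE = "https://metabase.example.com"  # placeholder host

def make_session(api_key=None, username=None, password=None):
    session = requests.session()
    if api_key:
        # API-key mode: one static header; no login, and no logout on close.
        session.headers.update({"x-api-key": api_key})
    else:
        # Session-token mode: POST /api/session returns a token under "id".
        resp = requests.post(
            f"{METABASE}/api/session",
            json={"username": username, "password": password},
        )
        resp.raise_for_status()
        session.headers.update({"X-Metabase-Session": resp.json()["id"]})
    return session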
datahub/ingestion/source/metadata/lineage.py
CHANGED

@@ -104,8 +104,8 @@ class FineGrainedLineageConfig(ConfigModel):
 
 class EntityNodeConfig(ConfigModel):
     entity: EntityConfig
-    upstream: Optional[List["EntityNodeConfig"]]
-    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]]
+    upstream: Optional[List["EntityNodeConfig"]] = None
+    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]] = None
 
 
 # https://pydantic-docs.helpmanual.io/usage/postponed_annotations/ required for when you reference a model within itself
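The explicit `= None` defaults matter because a Pydantic field annotated `Optional[...]` without a default can be treated as required (this is the behavior Pydantic v2 standardizes on). A minimal sketch with a hypothetical model, not the DataHub class:

from typing import List, Optional
from pydantic import BaseModel

class Node(BaseModel):
    name: str
    upstream: Optional[List["Node"]] = None  # explicit default keeps the field optional

print(Node(name="a").upstream)  # None; without "= None" the field could become required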
datahub/ingestion/source/mode.py
CHANGED

@@ -1494,7 +1494,7 @@ class ModeSource(StatefulIngestionSourceBase):
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
-                raise HTTPError429
+                raise HTTPError429 from None
 
         raise http_error
 
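`raise HTTPError429 from None` raises inside an `except` block, so without `from None` Python would chain the new exception onto the one being handled and print both tracebacks ("During handling of the above exception, another exception occurred"). `from None` suppresses that implicit chain. A small illustration (the exception class here is a stand-in):

class HTTPError429(Exception):
    pass  # stand-in for the Mode source's rate-limit error

try:
    raise ValueError("underlying HTTP error")
except ValueError:
    # Plain `raise HTTPError429` would attach the ValueError as __context__
    # and print both tracebacks; `from None` hides the chain.
    raise HTTPError429 from None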
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED

@@ -292,7 +292,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return record["properties"]
 
     def get_relationships(self, record: dict) -> dict:
-        return record.get("relationships", …)
+        return record.get("relationships", {})
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
datahub/ingestion/source/nifi.py
CHANGED

@@ -1234,11 +1234,14 @@ class NifiSource(StatefulIngestionSourceBase):
         job_type: str,
         description: Optional[str],
         job_properties: Optional[Dict[str, str]] = None,
-        inlets: List[str] = [],
-        outlets: List[str] = [],
-        inputJobs: List[str] = [],
+        inlets: Optional[List[str]] = None,
+        outlets: Optional[List[str]] = None,
+        inputJobs: Optional[List[str]] = None,
         status: Optional[str] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        inlets = inlets or []
+        outlets = outlets or []
+        inputJobs = inputJobs or []
         logger.debug(f"Begining construction of job workunit for {job_urn}")
         if job_properties:
             job_properties = {k: v for k, v in job_properties.items() if v is not None}
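The nifi.py hunk removes mutable default arguments: a default like `inlets: List[str] = []` is created once at function-definition time and shared across calls, so appends leak between invocations. The `Optional[...] = None` plus `x = x or []` idiom in the hunk gives each call a fresh list. A minimal sketch of the pitfall:

from typing import List, Optional

def buggy(items: List[str] = []) -> List[str]:
    items.append("x")
    return items

buggy()  # ["x"]
buggy()  # ["x", "x"] -- the same default list object is reused

def fixed(items: Optional[List[str]] = None) -> List[str]:
    items = items or []  # fresh list on every call
    items.append("x")
    return items

fixed()  # ["x"]
fixed()  # ["x"] -- no shared state

The same fix appears below in the Power BI parser, tree_function, data_resolver, Qlik websocket, Redash, and Teradata hunks.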
datahub/ingestion/source/openapi_parser.py
CHANGED

@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     Try to determine if example data is defined for the endpoint, and return it
     """
     data = {}
-    if "content" in base_res.keys():
+    if "content" in base_res:
         res_cont = base_res["content"]
         if "application/json" in res_cont.keys():
             ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
         )
     elif "text/csv" in res_cont.keys():
         data = res_cont["text/csv"]["schema"]
-    elif "examples" in base_res.keys():
+    elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
 
     return data
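Both hunks are the standard `key in d.keys()` → `key in d` simplification: dict membership already tests keys, without materializing a keys view first. A one-line check:

base_res = {"content": {"application/json": {}}}
assert ("content" in base_res) == ("content" in base_res.keys())  # identical result

The same cleanup shows up later in the report_server, schema_inference, Snowflake connection, sql_common, and Trino hunks.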
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED

@@ -2,7 +2,7 @@ import functools
 import importlib.resources as pkg_resource
 import logging
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import lark
 from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
-    parameters: Dict[str, str] = {},
+    parameters: Optional[Dict[str, str]] = None,
 ) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
+    parameters = parameters or {}
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []
datahub/ingestion/source/powerbi/m_query/tree_function.py
CHANGED

@@ -70,13 +70,14 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
     return expression_tree
 
 
-def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
+def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
     """
     :param tree: Tree to traverse
     :param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
                        using the values in 'parameters'.
     :return: List of leaf token data
     """
+    parameters = parameters or {}
     values: List[str] = []
 
     def internal(node: Union[Tree, Token]) -> None:
datahub/ingestion/source/powerbi/powerbi.py
CHANGED

@@ -890,9 +890,7 @@ class Mapper:
                     set(user_rights) & set(self.__config.ownership.owner_criteria)
                 )
                 > 0
-            ):
-                user_mcps.extend(self.to_datahub_user(user))
-            elif self.__config.ownership.owner_criteria is None:
+            ) or self.__config.ownership.owner_criteria is None:
                 user_mcps.extend(self.to_datahub_user(user))
             else:
                 continue
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
CHANGED

@@ -380,8 +380,9 @@ class DataResolverBase(ABC):
     def itr_pages(
         self,
         endpoint: str,
-        parameter_override: Dict = {},
+        parameter_override: Optional[Dict] = None,
     ) -> Iterator[List[Dict]]:
+        parameter_override = parameter_override or {}
         params: dict = {
             "$skip": 0,
             "$top": self.TOP,
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED

@@ -196,7 +196,7 @@ class PowerBiReportServerAPI:
         }
 
         reports: List[Any] = []
-        for report_type in report_types_mapping.keys():
+        for report_type in report_types_mapping:
             report_get_endpoint: str = API_ENDPOINTS[report_type]
             # Replace place holders
             report_get_endpoint_http = report_get_endpoint.format(
datahub/ingestion/source/pulsar.py
CHANGED

@@ -230,8 +230,8 @@ class PulsarSource(StatefulIngestionSourceBase):
             self.report.report_warning("HTTPError", message)
         except requests.exceptions.RequestException as e:
             raise Exception(
-                …
-            )
+                "An ambiguous exception occurred while handling the request"
+            ) from e
 
     @classmethod
     def create(cls, config_dict, ctx):
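Where the Mode hunk used `from None` to suppress chaining, the Pulsar hunk uses `from e` to keep the caught `RequestException` attached as the explicit cause, so the original error still appears in the traceback without being interpolated into the message. A short sketch:

import requests

def fetch(url: str) -> requests.Response:
    try:
        return requests.get(url, timeout=5)
    except requests.exceptions.RequestException as e:
        # The caught exception remains reachable as the new error's __cause__.
        raise Exception(
            "An ambiguous exception occurred while handling the request"
        ) from e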
datahub/ingestion/source/qlik_sense/websocket_connection.py
CHANGED

@@ -17,8 +17,9 @@ class WebsocketConnection:
         self.handle = [-1]
 
     def _build_websocket_request_dict(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
+        params = params or {}
         return {
             "jsonrpc": "2.0",
             "id": self.request_id,
@@ -37,11 +38,12 @@ class WebsocketConnection:
         return {}
 
     def websocket_send_request(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
         """
         Method to send request to websocket
         """
+        params = params or {}
         self.request_id += 1
         request = self._build_websocket_request_dict(method, params)
         response = self._send_request(request=request)
datahub/ingestion/source/redash.py
CHANGED

@@ -421,8 +421,9 @@ class RedashSource(StatefulIngestionSourceBase):
         return database_name
 
     def _get_datasource_urns(
-        self, data_source: Dict, sql_query_data: Dict = {}
+        self, data_source: Dict, sql_query_data: Optional[Dict] = None
     ) -> Optional[List[str]]:
+        sql_query_data = sql_query_data or {}
         platform = self._get_platform_based_on_datasource(data_source)
         database_name = self._get_database_name_based_on_datasource(data_source)
         data_source_syntax = data_source.get("syntax")
datahub/ingestion/source/s3/config.py
CHANGED

@@ -154,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: …
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
datahub/ingestion/source/s3/source.py
CHANGED

@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=min,
             )
             folders.extend(folders_list)
-            if …
+            if path_spec.traversal_method != FolderTraversalMethod.ALL:
                 return folders
             if folders:
                 return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) -> List[Folder]:
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(…
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for …
-            …
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )
 
-            # If id is None, it means the folder is not a partition
-            partitions.append(
-                Folder(
-                    partition_id=id,
-                    is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                    modification_time=modification_time,
-                    sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                    size=file_size,
-                )
-            )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
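This hunk rewrites the folder scan from "accumulate a `List[Folder]` and return it" into a generator that yields one `Folder` per directory group, computing the newest file, total size, and time bounds per group along the way. A simplified, self-contained sketch of the same aggregation (plain dict grouping stands in for `groupby_unsorted`, and a tuple record stands in for the S3 object):

from typing import Dict, Iterable, List, NamedTuple

class Obj(NamedTuple):
    key: str
    last_modified: int
    size: int

def folders(objects: Iterable[Obj]) -> Iterable[dict]:
    groups: Dict[str, List[Obj]] = {}
    for obj in objects:  # group objects by their directory name
        groups.setdefault(obj.key.rsplit("/", 1)[0], []).append(obj)
    for dirname, group in groups.items():
        newest = max(group, key=lambda o: o.last_modified)
        yield {  # one record per folder, produced lazily
            "dir": dirname,
            "creation_time": min(o.last_modified for o in group),
            "modification_time": newest.last_modified,
            "size": sum(o.size for o in group),
        }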
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=True,
             )
             dirs_to_process.append(dirs_to_process_min[0])
-        folders = []
+        folders: List[Folder] = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
             prefix_to_process = urlparse(dir).path.lstrip("/")
datahub/ingestion/source/salesforce.py
CHANGED

@@ -615,7 +615,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
 
-        text = field.get("InlineHelpText", None)
+        text = field.get("InlineHelpText")
         if text:
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
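This is the `d.get(key, None)` → `d.get(key)` simplification: `None` is already the default for `dict.get`, so the second argument was redundant. The Athena, sql_common, sql_types, and Trino hunks below make the same change.

field = {"Label": "Account"}
assert field.get("InlineHelpText") is None  # default is already None
assert field.get("InlineHelpText") == field.get("InlineHelpText", None)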
datahub/ingestion/source/schema_inference/object.py
CHANGED

@@ -149,7 +149,7 @@ def construct_schema(
 
     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}
 
-    for field_path in schema.keys():
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"
 
datahub/ingestion/source/sigma/sigma.py
CHANGED

@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(…
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/snowflake/snowflake_connection.py
CHANGED

@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES.keys():
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
datahub/ingestion/source/sql/athena.py
CHANGED

@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )
 
-        location: Optional[str] = custom_properties.get("location", None)
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,7 +538,7 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
                 is_part_of_key=(
                     True
datahub/ingestion/source/sql/druid.py
CHANGED

@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
datahub/ingestion/source/sql/sql_common.py
CHANGED

@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping.keys():
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break
@@ -973,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                         inspector=inspector,
                     )
                 ),
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column["nullable"],
                 recursive=False,
                 globalTags=gtc,
datahub/ingestion/source/sql/sql_types.py
CHANGED

@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
datahub/ingestion/source/sql/teradata.py
CHANGED

@@ -180,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
    **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
@@ -314,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
datahub/ingestion/source/sql/trino.py
CHANGED

@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
         if col_value is not None:
             properties[col_name] = col_value
 
-        return {"text": properties.get("comment", None), "properties": properties}
+        return {"text": properties.get("comment"), "properties": properties}
     else:
         return self.get_table_comment_default(connection, table_name, schema)
 except Exception:
@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):
 
 
 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types.keys():
+    for sql_type in _all_atomic_types:
         if isinstance(s, sql_type):
             return {
                 "type": _all_atomic_types[sql_type],