acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/pulsar.py +2 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +65 -37
- datahub/ingestion/source/tableau/tableau.py +3 -6
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/lite/duckdb_lite.py +5 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/sdk/dataset.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/client.py
CHANGED

@@ -330,7 +330,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         aspect_type_name: Optional[str] = None,
         version: int = 0,
     ) -> Optional[Aspect]:
-        assert aspect_type.ASPECT_NAME
+        assert aspect == aspect_type.ASPECT_NAME
         return self.get_aspect(
             entity_urn=entity_urn,
             aspect_type=aspect_type,
@@ -1547,7 +1547,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         return fragment

     def _run_assertion_build_params(
-        self, params: Optional[Dict[str, str]] =
+        self, params: Optional[Dict[str, str]] = None
     ) -> List[Any]:
         if params is None:
             return []
@@ -1566,9 +1566,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urn: str,
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] =
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1597,9 +1599,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urns: List[str],
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] =
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1636,10 +1640,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def run_assertions_for_asset(
         self,
         urn: str,
-        tag_urns: Optional[List[str]] =
-        parameters: Optional[Dict[str, str]] =
+        tag_urns: Optional[List[str]] = None,
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if tag_urns is None:
+            tag_urns = []
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1677,9 +1685,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         entity_name: str,
         urns: List[str],
-        aspects: List[str] =
+        aspects: Optional[List[str]] = None,
         with_system_metadata: bool = False,
     ) -> Dict[str, Any]:
+        aspects = aspects or []
         payload = {
             "urns": urns,
             "aspectNames": aspects,
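The client.py hunks above restore default argument values using the `None`-sentinel idiom rather than mutable defaults. A minimal sketch of why that idiom matters; the function and argument names here are illustrative, not DataHub APIs:

```python
from typing import List, Optional


def add_tag_bad(tag: str, tags: List[str] = []) -> List[str]:
    # The default list is created once, at function definition time,
    # so every call without an argument mutates the same shared object.
    tags.append(tag)
    return tags


def add_tag_good(tag: str, tags: Optional[List[str]] = None) -> List[str]:
    # None is only a sentinel; a fresh list is created on each call.
    if tags is None:
        tags = []
    tags.append(tag)
    return tags


print(add_tag_bad("a"), add_tag_bad("b"))    # ['a', 'b'] ['a', 'b'] -- shared state
print(add_tag_good("a"), add_tag_good("b"))  # ['a'] ['b']
```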
datahub/ingestion/graph/entity_versioning.py
CHANGED

@@ -93,7 +93,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["linkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None

     def link_asset_to_versioned_asset(
         self,
@@ -165,7 +165,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None

     def unlink_latest_asset_from_version_set(
         self, version_set_urn: str
@@ -198,4 +198,4 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None
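The `from None` additions above, and the `from e` additions in dremio_api.py and kafka.py further down, control Python exception chaining. A small illustration of the difference, using made-up functions rather than DataHub's own:

```python
def lookup(response: dict) -> str:
    try:
        return response["linkAssetVersion"]["urn"]
    except KeyError:
        # "from None" suppresses the implicit "During handling of the above
        # exception, another exception occurred" chain in the traceback.
        raise ValueError(f"Unexpected response: {response}") from None


def parse(raw: str) -> int:
    try:
        return int(raw)
    except ValueError as e:
        # "from e" keeps the original error attached as __cause__,
        # so the root cause still shows up in the traceback.
        raise RuntimeError(f"Could not parse {raw!r}") from e
```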
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED

@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
                 key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
                 for key, value in obj.items()
             }
-        elif isinstance(obj, list):
-            return [
-                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
-                for element in obj
-            ]
-        elif isinstance(obj, set):
+        elif isinstance(obj, list) or isinstance(obj, set):
             return [
                 DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
                 for element in obj
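The `_convert_sets_to_lists` change above folds the list and set branches into one. A simplified, self-contained sketch of what such a recursive helper does (not the exact DataHub implementation):

```python
from typing import Any


def convert_sets_to_lists(obj: Any) -> Any:
    # JSON has no set type, so sets become lists; dicts and lists/sets are
    # walked recursively, everything else passes through unchanged.
    if isinstance(obj, dict):
        return {key: convert_sets_to_lists(value) for key, value in obj.items()}
    elif isinstance(obj, (list, set)):
        return [convert_sets_to_lists(element) for element in obj]
    return obj


print(convert_sets_to_lists({"tags": {"pii", "gold"}, "nested": [{"ids": {1, 2}}]}))
```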
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs

     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
datahub/ingestion/source/bigquery_v2/bigquery_audit.py
CHANGED

@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")

         return cls(

datahub/ingestion/source/bigquery_v2/bigquery_schema.py
CHANGED

@@ -344,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)

             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset
datahub/ingestion/source/cassandra/cassandra_api.py
CHANGED

@@ -159,7 +159,8 @@ class CassandraAPI:
             self.report.failure(message="Failed to authenticate to Cassandra", exc=e)
             return False

-    def get(self, query: str, parameters: Optional[List] =
+    def get(self, query: str, parameters: Optional[List] = None) -> List:
+        parameters = parameters or []
         if not self._cassandra_session:
             return []

datahub/ingestion/source/csv_enricher.py
CHANGED

@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType
+        }.get(entityType)

         if not entityClass:
             raise ValueError(
@@ -640,8 +640,8 @@ class CSVEnricherSource(Source):
                )
            except Exception as e:
                raise ConfigurationError(
-                    f"Cannot read remote file {self.config.filename}
-                )
+                    f"Cannot read remote file {self.config.filename}: {e}"
+                ) from e
            else:
                with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f:
                    rows = list(csv.DictReader(f, delimiter=self.config.delimiter))
datahub/ingestion/source/dbt/dbt_common.py
CHANGED

@@ -1033,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
            cll_nodes.add(dbt_name)
            schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map
+        for dbt_name in all_nodes_map:
            if self._is_allowed_node(dbt_name):
                add_node_to_cll_list(dbt_name)

datahub/ingestion/source/dremio/dremio_api.py
CHANGED

@@ -271,12 +271,12 @@ class DremioAPIOperations:
                self.cancel_query(job_id)
                raise DremioAPIException(
                    f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
            except RuntimeError as e:
-                raise DremioAPIException(
+                raise DremioAPIException() from e

            except requests.RequestException as e:
-                raise DremioAPIException(
+                raise DremioAPIException("Error executing query") from e

    def fetch_results(self, job_id: str) -> List[Dict]:
        """Fetch job results with status checking"""
datahub/ingestion/source/dremio/dremio_aspects.py
CHANGED

@@ -168,8 +168,9 @@ class DremioAspects:
        )

    def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] =
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
    ) -> str:
+        path = path or []
        container_key = self.get_container_key(name, path)
        return container_key.as_urn()

datahub/ingestion/source/file.py
CHANGED

@@ -410,10 +410,13 @@ def _from_obj_for_file(
        item = MetadataChangeEvent.from_obj(obj)
    elif "aspect" in obj:
        item = MetadataChangeProposalWrapper.from_obj(obj)
-
+    elif "bucket" in obj:
        item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
    if not item.validate():
-        raise ValueError(f"
+        raise ValueError(f"Failed to parse: {obj}")

    if isinstance(item, UsageAggregationClass):
        logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py
CHANGED

@@ -498,7 +498,7 @@ class DataProcessCleanup:
        # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
            deleted_data_flows: int = 0
-            for key in dataFlows
+            for key in dataFlows:
                if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                    logger.info(
                        f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED

@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
        )

    def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] =
+        self, overrides: Optional[Dict[str, Any]] = None
    ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
        headers: Dict[str, Any] = {
            "Accept": "application/json",
            "Content-Type": "application/json",
datahub/ingestion/source/ge_data_profiler.py
CHANGED

@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
            ).select_from(self._table)
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif
-
-
-
-        )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
        element_values = self.engine.execute(
            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
            col = col_dict["name"]
            self.column_types[col] = str(col_dict["type"])
            # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if
-
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
            ):
                ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
            elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                ignored_columns_by_type.append(col)
            else:
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
            },
        )

-        if platform
+        if platform in (BIGQUERY, DATABRICKS):
            # This is done as GE makes the name as DATASET.TABLE
            # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
            name_parts = pretty_name.split(".")
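The first profiler hunk above routes both BigQuery and Snowflake through the same APPROX_COUNT_DISTINCT branch. A rough sketch of how such a statement can be built with SQLAlchemy 1.4+; the table and column names are invented, and `sa.func` renders whatever function name it is given:

```python
import sqlalchemy as sa

# Hypothetical table/column; APPROX_COUNT_DISTINCT is emitted verbatim,
# so this only works against dialects that support the function.
stmt = sa.select(
    sa.func.APPROX_COUNT_DISTINCT(sa.column("user_id"))
).select_from(sa.table("events"))

print(stmt)  # SELECT APPROX_COUNT_DISTINCT(user_id) ... FROM events
```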
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def
-
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
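The new `created-at` handling in iceberg.py parses an ISO-8601 table property into epoch milliseconds before passing it to `TimeStampClass`. A standalone sketch of that conversion, with an invented property value:

```python
from dateutil import parser as dateutil_parser

# e.g. the value of an Iceberg "created-at" table property (invented here)
created_at = "2024-11-05T14:30:00+00:00"

dt = dateutil_parser.isoparse(created_at)
created_millis = int(dt.timestamp() * 1000)
print(created_millis)  # epoch milliseconds, the value passed to TimeStampClass above
```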
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional

@@ -156,18 +157,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()

     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.
-
-        self.top_entites.
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()

     def __str__(self) -> str:
-
-
-
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))


 class TimingClass:
@@ -175,24 +179,31 @@ class TimingClass:

     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()

     def add_timing(self, t: float) -> None:
-        self.
+        with self._lock:
+            self.times.add(t)

     def __str__(self) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )


 @dataclass
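The iceberg_common.py hunks above guard a shared `SortedList` with a `threading.Lock` so timings can be recorded from multiple worker threads. A minimal sketch of the same pattern, assuming the `SortedList` in question is `sortedcontainers.SortedList`; the class and method names here are illustrative:

```python
import threading

from sortedcontainers import SortedList


class TimingRecorder:
    """Collects timings from multiple threads; all access goes through one lock."""

    def __init__(self) -> None:
        self.times = SortedList()
        self._lock = threading.Lock()

    def add_timing(self, t: float) -> None:
        with self._lock:
            self.times.add(t)

    def summary(self) -> str:
        with self._lock:
            if not self.times:
                return "no timings reported"
            return f"min={self.times[0]:.3f}s max={self.times[-1]:.3f}s n={len(self.times)}"
```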
datahub/ingestion/source/identity/okta.py
CHANGED

@@ -568,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
        if (
            self.config.include_deprovisioned_users is False
            and okta_user.status == UserStatus.DEPROVISIONED
-        )
-            return False
-        elif (
+        ) or (
            self.config.include_suspended_users is False
            and okta_user.status == UserStatus.SUSPENDED
        ):
datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
            return schema_registry_class.create(config, report)
        except Exception as e:
            logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
    ) -> DebeziumParser:
        connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

-        if
-
-
-
-                database_name=None,
-            )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
            parser = self.DebeziumParser(
                source_platform="mysql",
                server_name=self.get_server_name(connector_manifest),
datahub/ingestion/source/looker/looker_file_loader.py
CHANGED

@@ -33,14 +33,14 @@ class LookerViewFileLoader:
        base_projects_folder: Dict[str, pathlib.Path],
        reporter: LookMLSourceReport,
        source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] =
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
    ) -> None:
        self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
        self._root_project_name = root_project_name
        self._base_projects_folder = base_projects_folder
        self.reporter = reporter
        self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}

    def _load_viewfile(
        self, project_name: str, path: str, reporter: LookMLSourceReport
datahub/ingestion/source/looker/looker_lib_wrapper.py
CHANGED

@@ -205,8 +205,9 @@ class LookerAPI:
    def folder_ancestors(
        self,
        folder_id: str,
-        fields: Union[str, List[str]] =
+        fields: Optional[Union[str, List[str]]] = None,
    ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
        self.client_stats.folder_calls += 1
        try:
            return self.client.folder_ancestors(
datahub/ingestion/source/looker/looker_template_language.py
CHANGED

@@ -464,9 +464,10 @@ def process_lookml_template_language(
    source_config: LookMLSourceConfig,
    view_lkml_file_dict: dict,
    reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
    resolve_constants: bool = False,
) -> None:
+    manifest_constants = manifest_constants or {}
    if "views" not in view_lkml_file_dict:
        return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
    path: Union[str, pathlib.Path],
    source_config: LookMLSourceConfig,
    reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
    resolve_constants: bool = False,
) -> dict:
+    manifest_constants = manifest_constants or {}
    parsed = load_lkml(path)

    process_lookml_template_language(
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
            raise ValueError(
                f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                f"in your config file"
-            )
+            ) from None

    def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
        manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
    def report_skipped_unreachable_views(
        self,
        viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] =
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
    ) -> None:
+        processed_view_map = processed_view_map or {}
        view_files: Dict[str, List[pathlib.Path]] = {}
        for project, folder_path in self.base_projects_folder.items():
            folder = pathlib.Path(folder_path)