acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +3 -5
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +3 -3
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +6 -12
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +7 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +251 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +29 -5
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py

@@ -1,5 +1,6 @@
 import json
 import logging
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -36,9 +38,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -49,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     ChangeAuditStamps,
     Status,
     TimeStamp,
@@ -68,12 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
 
 logger = logging.getLogger(__name__)
@@ -101,6 +109,14 @@ chart_type_from_viz_type = {
 platform_without_databases = ["druid"]
 
 
+@dataclass
+class SupersetSourceReport(StaleEntityRemovalSourceReport):
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    def report_dropped(self, name: str) -> None:
+        self.filtered.append(name)
+
+
 class SupersetDataset(BaseModel):
     id: int
     table_name: str
@@ -136,6 +152,18 @@ class SupersetConfig(
         default=dict(),
         description="regex patterns for tables to filter to assign domain_key. ",
     )
+    dataset_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for dataset to filter in ingestion.",
+    )
+    chart_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting chart names that are to be included",
+    )
+    dashboard_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting dashboard names that are to be included",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
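Note: the three new pattern fields reuse DataHub's standard AllowDenyPattern filter from datahub.configuration.common. A minimal sketch of how such a pattern evaluates names (the regexes here are illustrative, not taken from this diff):

from datahub.configuration.common import AllowDenyPattern

# Deny regexes take precedence; anything not denied must then match an
# allow regex (the default allow list matches everything).
pattern = AllowDenyPattern(allow=["sales_.*"], deny=[".*_tmp$"])
assert pattern.allowed("sales_2024")
assert not pattern.allowed("sales_2024_tmp")  # matches a deny regex
assert not pattern.allowed("marketing_2024")  # not in the allow list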
@@ -216,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
     """
 
     config: SupersetConfig
-    report: StaleEntityRemovalSourceReport
+    report: SupersetSourceReport
     platform = "superset"
 
     def __hash__(self):
@@ -225,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SupersetConfig):
         super().__init__(config, ctx)
         self.config = config
-        self.report = StaleEntityRemovalSourceReport()
+        self.report = SupersetSourceReport()
         if self.config.domain:
             self.domain_registry = DomainRegistry(
                 cached_domains=[domain_id for domain_id in self.config.domain],
                 graph=self.ctx.graph,
             )
         self.session = self.login()
+        self.owner_info = self.parse_owner_info()
 
     def login(self) -> requests.Session:
         login_response = requests.post(
@@ -271,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         while current_page * page_size < total_items:
             response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
             )
 
@@ -287,25 +316,24 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             current_page += 1
 
-    def get_platform_from_database_id(self, database_id):
-        ...
-        return platform_name
+    def parse_owner_info(self) -> Dict[str, Any]:
+        entity_types = ["dataset", "dashboard", "chart"]
+        owners_info = {}
+
+        for entity in entity_types:
+            for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
+                owner_id = owner.get("value")
+                if owner_id:
+                    owners_info[owner_id] = owner.get("extra", {}).get("email", "")
+
+        return owners_info
+
+    def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
+        return [
+            make_user_urn(self.owner_info.get(owner.get("id"), ""))
+            for owner in data.get("owners", [])
+            if owner.get("id")
+        ]
 
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
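Note: parse_owner_info builds a one-time owner-id to email map from the {entity}/related/owners endpoints, and build_owner_urn then resolves each entity's owners[].id values against it. A small sketch of the final resolution step (the id and email are made-up values):

from datahub.emitter.mce_builder import make_user_urn

owner_info = {42: "jdoe@example.com"}  # shape produced by parse_owner_info()
assert make_user_urn(owner_info.get(42, "")) == "urn:li:corpuser:jdoe@example.com"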
@@ -323,8 +351,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +359,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-        if database_id and table_name:
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
@@ -355,7 +384,7 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DashboardSnapshot:
         dashboard_urn = make_dashboard_urn(
             platform=self.platform,
-            name=dashboard_data["id"],
+            name=str(dashboard_data["id"]),
             platform_instance=self.config.platform_instance,
         )
         dashboard_snapshot = DashboardSnapshot(
@@ -363,15 +392,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             aspects=[Status(removed=False)],
         )
 
-        modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         modified_ts = int(
             dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified = ChangeAuditStamps(
-            created=None,
-            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
         dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"
 
@@ -386,7 +416,7 @@ class SupersetSource(StatefulIngestionSourceBase):
             chart_urns.append(
                 make_chart_urn(
                     platform=self.platform,
-                    name=value.get("meta", {}).get("chartId", "unknown"),
+                    name=str(value.get("meta", {}).get("chartId", "unknown")),
                     platform_instance=self.config.platform_instance,
                 )
             )
@@ -397,13 +427,11 @@ class SupersetSource(StatefulIngestionSourceBase):
             "IsPublished": str(dashboard_data.get("published", False)).lower(),
             "Owners": ", ".join(
                 map(
-                    lambda owner: owner.get("username", "unknown"),
+                    lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
                     dashboard_data.get("owners", []),
                 )
             ),
-            "IsCertified": str(
-                True if dashboard_data.get("certified_by") else False
-            ).lower(),
+            "IsCertified": str(bool(dashboard_data.get("certified_by"))).lower(),
         }
 
         if dashboard_data.get("certified_by"):
@@ -417,16 +445,39 @@ class SupersetSource(StatefulIngestionSourceBase):
             description="",
             title=title,
             charts=chart_urns,
-            lastModified=last_modified,
             dashboardUrl=dashboard_url,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         dashboard_snapshot.aspects.append(dashboard_info)
+
+        dashboard_owners_list = self.build_owner_urn(dashboard_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dashboard_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        dashboard_snapshot.aspects.append(owners_info)
+
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
             try:
+                dashboard_id = str(dashboard_data.get("id"))
+                dashboard_title = dashboard_data.get("dashboard_title", "")
+
+                if not self.config.dashboard_pattern.allowed(dashboard_title):
+                    self.report.report_dropped(
+                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                    )
+                    continue
+
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
@@ -439,14 +490,14 @@ class SupersetSource(StatefulIngestionSourceBase):
                 mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                 yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                 yield from self._get_domain_wu(
-                    title=dashboard_data.get("dashboard_title", ""),
+                    title=dashboard_title,
                     entity_urn=dashboard_snapshot.urn,
                 )
 
     def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
         chart_urn = make_chart_urn(
             platform=self.platform,
-            name=chart_data["id"],
+            name=str(chart_data["id"]),
             platform_instance=self.config.platform_instance,
         )
         chart_snapshot = ChartSnapshot(
@@ -454,25 +505,33 @@ class SupersetSource(StatefulIngestionSourceBase):
             aspects=[Status(removed=False)],
         )
 
-        modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
        modified_ts = int(
             dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")
 
         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified = ChangeAuditStamps(
-            created=None,
-            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
+
         chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -515,23 +574,61 @@ class SupersetSource(StatefulIngestionSourceBase):
             type=chart_type,
             description="",
             title=title,
-            lastModified=last_modified,
             chartUrl=chart_url,
             inputs=[datasource_urn] if datasource_urn else None,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         chart_snapshot.aspects.append(chart_info)
+
+        chart_owners_list = self.build_owner_urn(chart_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (chart_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        chart_snapshot.aspects.append(owners_info)
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
             try:
+                chart_id = str(chart_data.get("id"))
+                chart_name = chart_data.get("slice_name", "")
+
+                if not self.config.chart_pattern.allowed(chart_name):
+                    self.report.report_dropped(
+                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                    )
+                    continue
+
+                # Emit a warning if charts use data from a dataset that will be filtered out
+                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                    datasource_id = chart_data.get("datasource_id")
+                    if datasource_id:
+                        dataset_response = self.get_dataset_info(datasource_id)
+                        dataset_name = dataset_response.get("result", {}).get(
+                            "table_name", ""
+                        )
+
+                        if dataset_name and not self.config.dataset_pattern.allowed(
+                            dataset_name
+                        ):
+                            self.report.warning(
+                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                            )
+
                 chart_snapshot = self.construct_chart_from_chart_data(chart_data)
 
                 mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
             except Exception as e:
                 self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('slice_name')}. Error: \n{e}"
+                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                 )
                 continue
             # Emit the chart
@@ -588,25 +685,65 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        modified_ts = int(
+            dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+        )
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
 
-        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
             externalUrl=dataset_url,
+            lastModified=TimeStamp(time=modified_ts),
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
@@ -614,11 +751,34 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        dataset_owners_list = self.build_owner_urn(dataset_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dataset_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        aspects_items.append(owners_info)
+
        return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
             try:
+                dataset_name = dataset_data.get("table_name", "")
+
+                # Check if dataset should be filtered by dataset name
+                if not self.config.dataset_pattern.allowed(dataset_name):
+                    self.report.report_dropped(
+                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                    )
+                    continue
+
                 dataset_snapshot = self.construct_dataset_from_dataset_data(
                     dataset_data
                 )
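Note: with the hunks above, each Superset dataset snapshot now also carries an upstreamLineage aspect pointing at the warehouse table behind it, plus an urn:li:tag:superset:physical tag. A sketch of the aspect shape being emitted (the URN and URL are placeholders):

from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

upstream_lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            dataset="urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.table,PROD)",
            type=DatasetLineageTypeClass.TRANSFORMED,
            properties={"externalUrl": "https://superset.example.com/tablemodelview/1"},
        )
    ]
)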
datahub/ingestion/source/tableau/tableau.py

@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
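Note: the old query_filter: dict = {} signature is the classic mutable-default pitfall: the default dict is created once at function definition and shared by every call that omits the argument, so any mutation leaks across calls. A minimal repro, independent of the Tableau code:

def bad(filter: dict = {}) -> dict:
    # the same dict object is reused by every no-argument call
    filter.setdefault("calls", 0)
    filter["calls"] += 1
    return filter

assert bad() == {"calls": 1}
assert bad() == {"calls": 2}  # state carried over from the first call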
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
@@ -2190,6 +2187,10 @@ class TableauSiteSource:
             dataset_snapshot.aspects.append(browse_paths)
         else:
             logger.debug(f"Browse path not set for Custom SQL table {csql_id}")
+            logger.warning(
+                f"Skipping Custom SQL table {csql_id} due to filtered downstream"
+            )
+            continue
 
         dataset_properties = DatasetPropertiesClass(
             name=csql.get(c.NAME),
@@ -2628,6 +2629,15 @@ class TableauSiteSource:
             datasource_info = datasource
 
         browse_path = self._get_project_browse_path_name(datasource)
+        if (
+            not is_embedded_ds
+            and self._get_published_datasource_project_luid(datasource) is None
+        ):
+            logger.warning(
+                f"Skip ingesting published datasource {datasource.get(c.NAME)} because of filtered project"
+            )
+            return
+
         logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}")
         datasource_id = datasource[c.ID]
         datasource_urn = builder.make_dataset_urn_with_platform_instance(
@@ -2851,6 +2861,11 @@ class TableauSiteSource:
             query_filter=tables_filter,
             page_size=self.config.effective_database_table_page_size,
         ):
+            if tableau_database_table_id_to_urn_map.get(tableau_table[c.ID]) is None:
+                logger.warning(
+                    f"Skipping table {tableau_table[c.ID]} due to filtered out published datasource"
+                )
+                continue
             database_table = self.database_tables[
                 tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
             ]
@@ -2905,6 +2920,7 @@ class TableauSiteSource:
             dataset_snapshot.aspects.append(browse_paths)
         else:
             logger.debug(f"Browse path not set for table {database_table.urn}")
+            return
 
         schema_metadata = self.get_schema_metadata_for_table(
             tableau_columns, database_table.parsed_columns
datahub/ingestion/source/tableau/tableau_common.py

@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
@@ -901,7 +902,7 @@ def get_unique_custom_sql(custom_sql_list: List[dict]) -> List[dict]:
             "name": custom_sql.get("name"),
             # We assume that this is unsupported custom sql if "actual tables that this query references"
             # are missing from api result.
-            "isUnsupportedCustomSql": True if not custom_sql.get("tables") else False,
+            "isUnsupportedCustomSql": not custom_sql.get("tables"),
             "query": custom_sql.get("query"),
             "connectionType": custom_sql.get("connectionType"),
             "columns": custom_sql.get("columns"),
datahub/ingestion/source/unity/ge_profiler.py

@@ -1,3 +1,4 @@
+import concurrent.futures
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
@@ -91,7 +92,7 @@ class UnityCatalogGEProfiler(GenericProfiler):
                 profile_requests.append(profile_request)
                 if i > 0 and i % 100 == 0:
                     logger.info(f"Finished table-level profiling for {i} tables")
-        except TimeoutError:
+        except (TimeoutError, concurrent.futures.TimeoutError):
             logger.warning("Timed out waiting to complete table-level profiling.")
 
         if len(profile_requests) == 0:
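Note on the wider except clause: before Python 3.11, concurrent.futures.TimeoutError was a distinct class from the builtin TimeoutError (the two were only unified in 3.11), so catching only the builtin missed timeouts raised by as_completed. A quick check of the distinction:

import concurrent.futures
import sys

if sys.version_info >= (3, 11):
    # 3.11+: concurrent.futures.TimeoutError is an alias of the builtin
    assert concurrent.futures.TimeoutError is TimeoutError
else:
    # older versions: a separate class, not caught by `except TimeoutError`
    assert not issubclass(concurrent.futures.TimeoutError, TimeoutError)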
datahub/ingestion/source/unity/source.py

@@ -464,7 +464,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
 
             with self.report.new_stage(f"Ingest schema {schema.id}"):
                 yield from self.gen_schema_containers(schema)
-                yield from self.process_tables(schema)
+                try:
+                    yield from self.process_tables(schema)
+                except Exception as e:
+                    logger.exception(f"Error parsing schema {schema}")
+                    self.report.report_warning(
+                        message="Missed schema because of parsing issues",
+                        context=str(schema),
+                        title="Error parsing schema",
+                        exc=e,
+                    )
+                    continue
 
                 self.report.schemas.processed(schema.id)
 