acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -25
- datahub/_version.py +13 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
- datahub/cli/check_cli.py +1 -1
- datahub/cli/cli_utils.py +3 -3
- datahub/cli/container_cli.py +1 -64
- datahub/cli/iceberg_cli.py +707 -0
- datahub/cli/ingest_cli.py +2 -2
- datahub/emitter/composite_emitter.py +36 -0
- datahub/emitter/rest_emitter.py +1 -1
- datahub/entrypoints.py +26 -5
- datahub/ingestion/api/incremental_lineage_helper.py +4 -0
- datahub/ingestion/api/registry.py +1 -1
- datahub/ingestion/glossary/classification_mixin.py +6 -0
- datahub/ingestion/glossary/classifier.py +3 -2
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/run/connection.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -3
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/apply/__init__.py +0 -0
- datahub/ingestion/source/apply/datahub_apply.py +223 -0
- datahub/ingestion/source/aws/glue.py +5 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/dbt/dbt_core.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/azure_ad.py +6 -14
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/looker_config.py +3 -1
- datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
- datahub/ingestion/source/looker/looker_file_loader.py +14 -3
- datahub/ingestion/source/looker/looker_template_language.py +104 -14
- datahub/ingestion/source/looker/lookml_config.py +29 -8
- datahub/ingestion/source/looker/lookml_source.py +110 -22
- datahub/ingestion/source/mode.py +2 -4
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
- datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
- datahub/ingestion/source/sql/clickhouse.py +5 -43
- datahub/ingestion/source/sql/mssql/job_models.py +37 -8
- datahub/ingestion/source/sql/mssql/source.py +17 -0
- datahub/ingestion/source/sql/sql_config.py +0 -10
- datahub/ingestion/source/tableau/tableau.py +16 -13
- datahub/ingestion/source/tableau/tableau_common.py +1 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/proxy.py +2 -2
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_config/operation_config.py +9 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- datahub/metadata/_schema_classes.py +304 -6
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
- datahub/metadata/schema.avsc +211 -12
- datahub/metadata/schemas/AssertionInfo.avsc +2 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
- datahub/metadata/schemas/DashboardInfo.avsc +5 -5
- datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +12 -0
- datahub/metadata/schemas/DisplayProperties.avsc +62 -0
- datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
- datahub/metadata/schemas/PostInfo.avsc +28 -2
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/specific/dashboard.py +43 -1
- datahub/telemetry/telemetry.py +4 -4
- datahub/testing/check_imports.py +28 -0
- datahub/upgrade/upgrade.py +17 -9
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -249,6 +249,12 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )
 
+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -302,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )
 
+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
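Note: the name of the new `pushdown_deny_usernames` option suggests the filter is pushed down into the warehouse-side query-log queries rather than applied client-side. A rough sketch of how such a deny list typically becomes a SQL predicate (illustrative only, not the extractor's actual code; `user_name` is an assumed column name):

from typing import List

def deny_users_predicate(deny_usernames: List[str]) -> str:
    # Builds an illustrative SQL filter that drops query-log rows for denied users.
    if not deny_usernames:
        return ""
    quoted = ", ".join("'" + u.replace("'", "''") + "'" for u in deny_usernames)
    return f"AND user_name NOT IN ({quoted})"

# deny_users_predicate(["ETL_SERVICE", "LOOKER"])
# -> "AND user_name NOT IN ('ETL_SERVICE', 'LOOKER')"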
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -159,6 +159,17 @@ class SnowflakeQuery:
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
+    @staticmethod
+    def get_all_tags():
+        return """
+        SELECT tag_database as "TAG_DATABASE",
+        tag_schema AS "TAG_SCHEMA",
+        tag_name AS "TAG_NAME",
+        FROM snowflake.account_usage.tag_references
+        GROUP BY TAG_DATABASE , TAG_SCHEMA, tag_name
+        ORDER BY TAG_DATABASE, TAG_SCHEMA, TAG_NAME ASC;
+        """
+
     @staticmethod
     def get_all_tags_on_object_with_propagation(
         db_name: str, quoted_identifier: str, domain: str
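Note: `get_all_tags` enumerates the distinct tags recorded in `snowflake.account_usage.tag_references`. A hedged example of running the query directly with the Snowflake connector (connection parameters are placeholders):

import snowflake.connector

from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery

conn = snowflake.connector.connect(
    account="<account>", user="<user>", password="<password>"
)
# DictCursor returns rows keyed by column name, matching the aliases in the query.
cur = conn.cursor(snowflake.connector.DictCursor)
for row in cur.execute(SnowflakeQuery.get_all_tags()):
    print(row["TAG_DATABASE"], row["TAG_SCHEMA"], row["TAG_NAME"])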
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage: Dict[str, List[str]] = field(default_factory=dict)
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None
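Note: `upstream_lineage` in the report now uses `LossyDict`, DataHub's size-bounded mapping for report objects, so a run over a very large warehouse cannot bloat the serialized report. A minimal sketch of the idea only (capped capacity with a drop counter; not the actual `LossyDict` implementation):

class BoundedDict(dict):
    # Keeps at most max_elements keys and counts what it had to drop.
    def __init__(self, max_elements: int = 10) -> None:
        super().__init__()
        self.max_elements = max_elements
        self.dropped = 0

    def __setitem__(self, key, value) -> None:
        if key not in self and len(self) >= self.max_elements:
            self.dropped += 1  # overflow keys are dropped, keeping the report small
            return
        super().__setitem__(key, value)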
@@ -114,6 +115,7 @@ class SnowflakeV2Report(
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
+    num_structured_property_templates_created: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -285,6 +285,23 @@ class SnowflakeDataDictionary(SupportsAsObj):
 
         return secure_view_definitions
 
+    def get_all_tags(self) -> List[SnowflakeTag]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_all_tags(),
+        )
+
+        tags = [
+            SnowflakeTag(
+                database=tag["TAG_DATABASE"],
+                schema=tag["TAG_SCHEMA"],
+                name=tag["TAG_NAME"],
+                value="",
+            )
+            for tag in cur
+        ]
+
+        return tags
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -1,10 +1,10 @@
 import itertools
 import logging
+import time
 from typing import Dict, Iterable, List, Optional, Union
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import (
-    get_sys_time,
     make_data_platform_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
@@ -74,7 +74,6 @@ from datahub.ingestion.source_report.ingestion_stage import (
     PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     GlobalTags,
     Status,
     SubTypes,
@@ -101,15 +100,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     StringType,
     TimeType,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.structured import (
-    StructuredPropertyDefinition,
-)
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
 from datahub.metadata.urns import (
-    ContainerUrn,
-    DatasetUrn,
-    DataTypeUrn,
-    EntityTypeUrn,
     SchemaFieldUrn,
     StructuredPropertyUrn,
 )
@@ -191,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.domain_registry: Optional[DomainRegistry] = domain_registry
         self.classification_handler = ClassificationHandler(self.config, self.report)
         self.tag_extractor = SnowflakeTagExtractor(
-            config, self.data_dictionary, self.report
+            config, self.data_dictionary, self.report, identifiers
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
         self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
@@ -217,6 +209,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         return self.identifiers.snowflake_identifier(identifier)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.extract_tags_as_structured_properties:
+            logger.info("Creating structured property templates for tags")
+            yield from self.tag_extractor.create_structured_property_templates()
+            # We have to wait until cache invalidates to make sure the structured property template is available
+            logger.info(
+                f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+            )
+            time.sleep(
+                self.config.structured_properties_template_cache_invalidation_interval
+            )
         self.databases = []
         for database in self.get_databases() or []:
             self.report.report_entity_scanned(database.name, "database")
@@ -491,15 +493,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         try:
             view_definitions = self.data_dictionary.get_secure_view_definitions()
             return view_definitions[db_name][schema_name][table_name]
+        except KeyError:
+            # Received secure view definitions but the view is not present in results
+            self.structured_reporter.info(
+                title="Secure view definition not found",
+                message="Lineage will be missing for the view.",
+                context=f"{db_name}.{schema_name}.{table_name}",
+            )
+            return None
         except Exception as e:
-
-
-
-
-
-
+            action_msg = (
+                "Please check permissions."
+                if isinstance(e, SnowflakePermissionError)
+                else ""
+            )
+
             self.structured_reporter.warning(
-
+                title="Failed to get secure views definitions",
+                message=f"Lineage will be missing for the view. {action_msg}",
+                context=f"{db_name}.{schema_name}.{table_name}",
                 exc=e,
             )
             return None
@@ -688,6 +700,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
         use_sp = self.config.extract_tags_as_structured_properties
+
         identifier = (
             self.snowflake_identifier(tag.structured_property_identifier())
             if use_sp
@@ -698,10 +711,11 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             return
 
         self.report.report_tag_processed(identifier)
+
         if use_sp:
-            yield from self.gen_tag_as_structured_property_workunits(tag)
-        else:
-            yield from self.gen_tag_workunits(tag)
+            return
+
+        yield from self.gen_tag_workunits(tag)
 
     def _format_tags_as_structured_properties(
         self, tags: List[SnowflakeTag]
@@ -722,6 +736,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if table.tags:
             for tag in table.tags:
                 yield from self._process_tag(tag)
+
         for column_name in table.column_tags:
             for tag in table.column_tags[column_name]:
                 yield from self._process_tag(tag)
@@ -893,29 +908,6 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             entityUrn=tag_urn, aspect=tag_properties_aspect
         ).as_workunit()
 
-    def gen_tag_as_structured_property_workunits(
-        self, tag: SnowflakeTag
-    ) -> Iterable[MetadataWorkUnit]:
-        identifier = self.snowflake_identifier(tag.structured_property_identifier())
-        urn = StructuredPropertyUrn(identifier).urn()
-        aspect = StructuredPropertyDefinition(
-            qualifiedName=identifier,
-            displayName=tag.name,
-            valueType=DataTypeUrn("datahub.string").urn(),
-            entityTypes=[
-                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
-            ],
-            lastModified=AuditStamp(
-                time=get_sys_time(), actor="urn:li:corpuser:datahub"
-            ),
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=urn,
-            aspect=aspect,
-        ).as_workunit()
-
     def gen_column_tags_as_structured_properties(
         self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
     ) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/snowflake/snowflake_tag.py

@@ -1,6 +1,9 @@
 import logging
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional
 
+from datahub.emitter.mce_builder import get_sys_time
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     SnowflakeV2Config,
@@ -12,7 +15,22 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeTag,
     _SnowflakeTagCache,
 )
-from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
+from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeCommonMixin,
+    SnowflakeIdentifierBuilder,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
+from datahub.metadata.com.linkedin.pegasus2avro.structured import (
+    StructuredPropertyDefinition,
+)
+from datahub.metadata.urns import (
+    ContainerUrn,
+    DatasetUrn,
+    DataTypeUrn,
+    EntityTypeUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+)
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -23,11 +41,12 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         config: SnowflakeV2Config,
         data_dictionary: SnowflakeDataDictionary,
         report: SnowflakeV2Report,
+        snowflake_identifiers: SnowflakeIdentifierBuilder,
     ) -> None:
         self.config = config
         self.data_dictionary = data_dictionary
         self.report = report
-
+        self.snowflake_identifiers = snowflake_identifiers
         self.tag_cache: Dict[str, _SnowflakeTagCache] = {}
 
     def _get_tags_on_object_without_propagation(
@@ -59,6 +78,41 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             raise ValueError(f"Unknown domain {domain}")
         return tags
 
+    def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
+        for tag in self.data_dictionary.get_all_tags():
+            if not self.config.structured_property_pattern.allowed(
+                tag.tag_identifier()
+            ):
+                continue
+            if self.config.extract_tags_as_structured_properties:
+                self.report.num_structured_property_templates_created += 1
+                yield from self.gen_tag_as_structured_property_workunits(tag)
+
+    def gen_tag_as_structured_property_workunits(
+        self, tag: SnowflakeTag
+    ) -> Iterable[MetadataWorkUnit]:
+        identifier = self.snowflake_identifiers.snowflake_identifier(
+            tag.structured_property_identifier()
+        )
+        urn = StructuredPropertyUrn(identifier).urn()
+        aspect = StructuredPropertyDefinition(
+            qualifiedName=identifier,
+            displayName=tag.name,
+            valueType=DataTypeUrn("datahub.string").urn(),
+            entityTypes=[
+                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
+            ],
+            lastModified=AuditStamp(
+                time=get_sys_time(), actor="urn:li:corpuser:datahub"
+            ),
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=aspect,
+        ).as_workunit()
+
     def _get_tags_on_object_with_propagation(
         self,
         domain: str,
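Note: each tag that survives `structured_property_pattern` now gets a structured-property "template" emitted up front, before any tag values are attached. Assuming the identifier is the `<db>.<schema>.<tag>` triple produced by `structured_property_identifier()` (lowercased under the default `convert_urns_to_lowercase`; the format is an assumption), the emitted entity URN looks like this sketch:

from datahub.metadata.urns import StructuredPropertyUrn

identifier = "analytics.governance.pii"  # hypothetical tag, already normalized
print(StructuredPropertyUrn(identifier).urn())
# -> urn:li:structuredProperty:analytics.governance.pii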
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -5,6 +5,7 @@ import logging
 import os
 import os.path
 import platform
+import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Union
 
@@ -33,6 +34,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
+    SnowflakeObjectDomain,
 )
 from datahub.ingestion.source.snowflake.snowflake_assertion import (
     SnowflakeAssertionsHandler,
@@ -162,6 +164,8 @@ class SnowflakeV2Source(
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
+        self.discovered_datasets: Optional[List[str]] = None
+
         self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
             SqlParsingAggregator(
                 platform=self.identifiers.platform,
@@ -182,6 +186,8 @@ class SnowflakeV2Source(
                 generate_usage_statistics=False,
                 generate_operations=False,
                 format_queries=self.config.format_sql_queries,
+                is_temp_table=self._is_temp_table,
+                is_allowed_table=self._is_allowed_table,
             )
         )
         self.report.sql_aggregator = self.aggregator.report
@@ -444,6 +450,34 @@ class SnowflakeV2Source(
 
         return _report
 
+    def _is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            return True
+
+        # This is also a temp table if
+        # 1. this name would be allowed by the dataset patterns, and
+        # 2. we have a list of discovered tables, and
+        # 3. it's not in the discovered tables list
+        if (
+            self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE)
+            and self.discovered_datasets
+            and name not in self.discovered_datasets
+        ):
+            return True
+
+        return False
+
+    def _is_allowed_table(self, name: str) -> bool:
+        if self.discovered_datasets and name not in self.discovered_datasets:
+            return False
+
+        return self.filters.is_dataset_pattern_allowed(
+            name, SnowflakeObjectDomain.TABLE
+        )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
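Note: these two helpers mirror the SnowflakeQueriesExtractor logic (see the TODO in the next hunk): a table is treated as temporary if it matches a configured regex, or if it would have been allowed yet never appeared during schema discovery. A standalone sketch of that classification (the dataset-pattern check is simplified away; the staging pattern and names are hypothetical):

import re
from typing import List, Optional

def is_temp_table(
    name: str, temp_patterns: List[str], discovered: Optional[List[str]]
) -> bool:
    # Explicit temp-table regexes win first.
    if any(re.match(p, name, flags=re.IGNORECASE) for p in temp_patterns):
        return True
    # If discovery completed and the table never showed up, treat it as temporary.
    return bool(discovered) and name not in discovered

print(is_temp_table(
    "db1.etl_staging.tmp_orders",
    [r".*\.\w+_staging\..*"],
    ["db1.public.orders"],
))  # True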
@@ -513,7 +547,7 @@ class SnowflakeV2Source(
             )
             return
 
-        discovered_datasets = discovered_tables + discovered_views
+        self.discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
             with self.report.new_stage(f"*: {VIEW_PARSING}"):
@@ -533,18 +567,20 @@ class SnowflakeV2Source(
                     include_queries=self.config.include_queries,
                     include_query_usage_statistics=self.config.include_query_usage_statistics,
                     user_email_pattern=self.config.user_email_pattern,
+                    pushdown_deny_usernames=self.config.pushdown_deny_usernames,
                 ),
                 structured_report=self.report,
                 filters=self.filters,
                 identifiers=self.identifiers,
                 schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
+                discovered_tables=self.discovered_datasets,
                 graph=self.ctx.graph,
             )
 
             # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
             # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
             # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+            # This also applies for the _is_temp_table and _is_allowed_table methods above, duplicated from SnowflakeQueriesExtractor.
             self.report.queries_extractor = queries_extractor.report
             yield from queries_extractor.get_workunits_internal()
             queries_extractor.close()
@@ -568,12 +604,14 @@ class SnowflakeV2Source(
         if (
             self.config.include_usage_stats or self.config.include_operational_stats
         ) and self.usage_extractor:
-            yield from self.usage_extractor.get_usage_workunits(discovered_datasets)
+            yield from self.usage_extractor.get_usage_workunits(
+                self.discovered_datasets
+            )
 
         if self.config.include_assertion_results:
             yield from SnowflakeAssertionsHandler(
                 self.config, self.report, self.connection, self.identifiers
-            ).get_assertion_workunits(discovered_datasets)
+            ).get_assertion_workunits(self.discovered_datasets)
 
         self.connection.close()
datahub/ingestion/source/sql/clickhouse.py

@@ -53,7 +53,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     DatasetSnapshotClass,
     UpstreamClass,
 )
@@ -418,41 +417,11 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
         dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
         assert dataset_snapshot
 
-        lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
-            wu.metadata.proposedSnapshot.urn
-        )
+        lineage_mcp = self.get_lineage_mcp(wu.metadata.proposedSnapshot.urn)
 
         if lineage_mcp is not None:
             yield lineage_mcp.as_workunit()
 
-        if lineage_properties_aspect:
-            aspects = dataset_snapshot.aspects
-            if aspects is None:
-                aspects = []
-
-            dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
-
-            for aspect in aspects:
-                if isinstance(aspect, DatasetPropertiesClass):
-                    dataset_properties_aspect = aspect
-
-            if dataset_properties_aspect is None:
-                dataset_properties_aspect = DatasetPropertiesClass()
-                aspects.append(dataset_properties_aspect)
-
-            custom_properties = (
-                {
-                    **dataset_properties_aspect.customProperties,
-                    **lineage_properties_aspect.customProperties,
-                }
-                if dataset_properties_aspect.customProperties
-                else lineage_properties_aspect.customProperties
-            )
-            dataset_properties_aspect.customProperties = custom_properties
-            dataset_snapshot.aspects = aspects
-
-            dataset_snapshot.aspects.append(dataset_properties_aspect)
-
         # Emit the work unit from super.
         yield wu
 
@@ -656,19 +625,16 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
     def get_lineage_mcp(
         self, dataset_urn: str
-    ) -> Tuple[
-        Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]
-    ]:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
         if dataset_key is None:
-            return None, None
+            return None
 
         if not self._lineage_map:
             self._populate_lineage()
         assert self._lineage_map is not None
 
         upstream_lineage: List[UpstreamClass] = []
-        custom_properties: Dict[str, str] = {}
 
         if dataset_key.name in self._lineage_map:
             item = self._lineage_map[dataset_key.name]
@@ -684,16 +650,12 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
             )
             upstream_lineage.append(upstream_table)
 
-        properties = None
-        if custom_properties:
-            properties = DatasetPropertiesClass(customProperties=custom_properties)
-
         if not upstream_lineage:
-            return None, properties
+            return None
 
         mcp = MetadataChangeProposalWrapper(
             entityUrn=dataset_urn,
             aspect=UpstreamLineage(upstreams=upstream_lineage),
         )
 
-        return mcp, properties
+        return mcp
datahub/ingestion/source/sql/mssql/job_models.py

@@ -7,7 +7,12 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.emitter.mcp_builder import (
+    DatabaseKey,
+    SchemaKey,
+)
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
@@ -171,11 +176,7 @@ class MSSQLDataJob:
             flow_id=self.entity.flow.formatted_name,
             job_id=self.entity.formatted_name,
             cluster=self.entity.flow.cluster,
-            platform_instance=(
-                self.entity.flow.platform_instance
-                if self.entity.flow.platform_instance
-                else None
-            ),
+            platform_instance=self.entity.flow.platform_instance,
         )
 
     def add_property(
@@ -222,6 +223,26 @@ class MSSQLDataJob:
         )
         return None
 
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        key_args = dict(
+            platform=self.entity.flow.orchestrator,
+            instance=self.entity.flow.platform_instance,
+            env=self.entity.flow.env,
+            database=self.entity.flow.db,
+        )
+        container_key = (
+            SchemaKey(
+                schema=self.entity.schema,
+                **key_args,
+            )
+            if isinstance(self.entity, StoredProcedure)
+            else DatabaseKey(
+                **key_args,
+            )
+        )
+        return ContainerClass(container=container_key.as_urn())
+
 
 @dataclass
 class MSSQLDataFlow:
@@ -244,9 +265,7 @@ class MSSQLDataFlow:
             orchestrator=self.entity.orchestrator,
             flow_id=self.entity.formatted_name,
             cluster=self.entity.cluster,
-            platform_instance=(
-                self.entity.platform_instance if self.entity.platform_instance else None
-            ),
+            platform_instance=self.entity.platform_instance,
         )
 
     @property
@@ -267,3 +286,13 @@ class MSSQLDataFlow:
             ),
         )
         return None
+
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        databaseKey = DatabaseKey(
+            platform=self.entity.orchestrator,
+            instance=self.entity.platform_instance,
+            env=self.entity.env,
+            database=self.entity.db,
+        )
+        return ContainerClass(container=databaseKey.as_urn())
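Note: `as_container_aspect` parents the flow or job under a database (or schema) container. The container URN is derived deterministically from the key fields, along the lines of this sketch (simplified; not DataHub's exact guid algorithm):

import hashlib
import json

def container_urn(**key_fields: str) -> str:
    # Same key fields in, same URN out - which is what lets jobs and
    # datasets land in the same container.
    payload = json.dumps(
        {k: v for k, v in key_fields.items() if v is not None}, sort_keys=True
    )
    return "urn:li:container:" + hashlib.md5(payload.encode("utf-8")).hexdigest()

print(container_urn(platform="mssql", database="AdventureWorks", env="PROD"))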
datahub/ingestion/source/sql/mssql/source.py

@@ -108,6 +108,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=True,
         description="Enable lineage extraction for stored procedures",
     )
+    include_containers_for_pipelines: bool = Field(
+        default=False,
+        description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
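Note: the flag is off by default because the container aspect on dataFlow/dataJob needs server-side model support (0.15.0.1 or later, per the description above). A hedged example recipe enabling it, written as a Python dict (all values are placeholders):

recipe = {
    "source": {
        "type": "mssql",
        "config": {
            "host_port": "localhost:1433",
            "database": "AdventureWorks",
            # New in this release; requires a DataHub server >= 0.15.0.1.
            "include_containers_for_pipelines": True,
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}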
@@ -641,6 +645,12 @@ class SQLServerSource(SQLAlchemySource):
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
 
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_container_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -683,6 +693,13 @@ class SQLServerSource(SQLAlchemySource):
                 entityUrn=data_flow.urn,
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
+
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_flow.as_container_aspect,
+            ).as_workunit()
+
         # TODO: Add SubType when it appear
 
     def get_inspectors(self) -> Iterable[Inspector]: