acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
|
@@ -45,15 +45,18 @@ class SnowflakeTag:
|
|
|
45
45
|
name: str
|
|
46
46
|
value: str
|
|
47
47
|
|
|
48
|
-
def
|
|
48
|
+
def tag_display_name(self) -> str:
|
|
49
49
|
return f"{self.name}: {self.value}"
|
|
50
50
|
|
|
51
|
-
def
|
|
51
|
+
def tag_identifier(self) -> str:
|
|
52
52
|
return f"{self._id_prefix_as_str()}:{self.value}"
|
|
53
53
|
|
|
54
54
|
def _id_prefix_as_str(self) -> str:
|
|
55
55
|
return f"{self.database}.{self.schema}.{self.name}"
|
|
56
56
|
|
|
57
|
+
def structured_property_identifier(self) -> str:
|
|
58
|
+
return f"snowflake.{self.database}.{self.schema}.{self.name}"
|
|
59
|
+
|
|
57
60
|
|
|
58
61
|
@dataclass
|
|
59
62
|
class SnowflakeColumn(BaseColumn):
|
|
@@ -139,9 +142,9 @@ class _SnowflakeTagCache:
|
|
|
139
142
|
)
|
|
140
143
|
|
|
141
144
|
# self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
|
|
142
|
-
self._table_tags: Dict[
|
|
143
|
-
|
|
144
|
-
|
|
145
|
+
self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
|
|
146
|
+
defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
|
|
147
|
+
)
|
|
145
148
|
|
|
146
149
|
# self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
|
|
147
150
|
self._column_tags: Dict[
|
|
@@ -4,12 +4,14 @@ from typing import Dict, Iterable, List, Optional, Union
|
|
|
4
4
|
|
|
5
5
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
6
6
|
from datahub.emitter.mce_builder import (
|
|
7
|
+
get_sys_time,
|
|
7
8
|
make_data_platform_urn,
|
|
8
9
|
make_dataset_urn_with_platform_instance,
|
|
9
10
|
make_schema_field_urn,
|
|
10
11
|
make_tag_urn,
|
|
11
12
|
)
|
|
12
13
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
14
|
+
from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
|
|
13
15
|
from datahub.ingestion.api.source import SourceReport
|
|
14
16
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
15
17
|
from datahub.ingestion.glossary.classification_mixin import (
|
|
@@ -72,6 +74,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
|
|
|
72
74
|
PROFILING,
|
|
73
75
|
)
|
|
74
76
|
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
77
|
+
AuditStamp,
|
|
75
78
|
GlobalTags,
|
|
76
79
|
Status,
|
|
77
80
|
SubTypes,
|
|
@@ -98,7 +101,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
98
101
|
StringType,
|
|
99
102
|
TimeType,
|
|
100
103
|
)
|
|
104
|
+
from datahub.metadata.com.linkedin.pegasus2avro.structured import (
|
|
105
|
+
StructuredPropertyDefinition,
|
|
106
|
+
)
|
|
101
107
|
from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
|
|
108
|
+
from datahub.metadata.urns import (
|
|
109
|
+
ContainerUrn,
|
|
110
|
+
DatasetUrn,
|
|
111
|
+
DataTypeUrn,
|
|
112
|
+
EntityTypeUrn,
|
|
113
|
+
SchemaFieldUrn,
|
|
114
|
+
StructuredPropertyUrn,
|
|
115
|
+
)
|
|
102
116
|
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
103
117
|
KnownLineageMapping,
|
|
104
118
|
SqlParsingAggregator,
|
|
@@ -180,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
180
194
|
config, self.data_dictionary, self.report
|
|
181
195
|
)
|
|
182
196
|
self.profiler: Optional[SnowflakeProfiler] = profiler
|
|
183
|
-
self.snowsight_url_builder: Optional[
|
|
184
|
-
|
|
185
|
-
|
|
197
|
+
self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
|
|
198
|
+
snowsight_url_builder
|
|
199
|
+
)
|
|
186
200
|
|
|
187
201
|
# These are populated as side-effects of get_workunits_internal.
|
|
188
202
|
self.databases: List[SnowflakeDatabase] = []
|
|
@@ -216,21 +230,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
216
230
|
|
|
217
231
|
try:
|
|
218
232
|
for snowflake_db in self.databases:
|
|
219
|
-
self.report.
|
|
220
|
-
|
|
233
|
+
with self.report.new_stage(
|
|
234
|
+
f"{snowflake_db.name}: {METADATA_EXTRACTION}"
|
|
235
|
+
):
|
|
236
|
+
yield from self._process_database(snowflake_db)
|
|
221
237
|
|
|
222
|
-
self.report.
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
238
|
+
with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
|
|
239
|
+
discovered_tables: List[str] = [
|
|
240
|
+
self.identifiers.get_dataset_identifier(
|
|
241
|
+
table_name, schema.name, db.name
|
|
242
|
+
)
|
|
243
|
+
for db in self.databases
|
|
244
|
+
for schema in db.schemas
|
|
245
|
+
for table_name in schema.tables
|
|
246
|
+
]
|
|
247
|
+
if self.aggregator:
|
|
248
|
+
for entry in self._external_tables_ddl_lineage(discovered_tables):
|
|
249
|
+
self.aggregator.add(entry)
|
|
234
250
|
|
|
235
251
|
except SnowflakePermissionError as e:
|
|
236
252
|
self.structured_reporter.failure(
|
|
@@ -251,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
251
267
|
)
|
|
252
268
|
return None
|
|
253
269
|
else:
|
|
254
|
-
ischema_databases: List[
|
|
255
|
-
|
|
256
|
-
|
|
270
|
+
ischema_databases: List[SnowflakeDatabase] = (
|
|
271
|
+
self.get_databases_from_ischema(databases)
|
|
272
|
+
)
|
|
257
273
|
|
|
258
274
|
if len(ischema_databases) == 0:
|
|
259
275
|
self.structured_reporter.failure(
|
|
@@ -332,8 +348,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
332
348
|
yield from self._process_db_schemas(snowflake_db, db_tables)
|
|
333
349
|
|
|
334
350
|
if self.profiler and db_tables:
|
|
335
|
-
self.report.
|
|
336
|
-
|
|
351
|
+
with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
|
|
352
|
+
yield from self.profiler.get_workunits(snowflake_db, db_tables)
|
|
337
353
|
|
|
338
354
|
def _process_db_schemas(
|
|
339
355
|
self,
|
|
@@ -671,14 +687,31 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
671
687
|
yield from self.gen_dataset_workunits(view, schema_name, db_name)
|
|
672
688
|
|
|
673
689
|
def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
|
|
674
|
-
|
|
690
|
+
use_sp = self.config.extract_tags_as_structured_properties
|
|
691
|
+
identifier = (
|
|
692
|
+
self.snowflake_identifier(tag.structured_property_identifier())
|
|
693
|
+
if use_sp
|
|
694
|
+
else tag.tag_identifier()
|
|
695
|
+
)
|
|
675
696
|
|
|
676
|
-
if self.report.is_tag_processed(
|
|
697
|
+
if self.report.is_tag_processed(identifier):
|
|
677
698
|
return
|
|
678
699
|
|
|
679
|
-
self.report.report_tag_processed(
|
|
680
|
-
|
|
681
|
-
|
|
700
|
+
self.report.report_tag_processed(identifier)
|
|
701
|
+
if use_sp:
|
|
702
|
+
yield from self.gen_tag_as_structured_property_workunits(tag)
|
|
703
|
+
else:
|
|
704
|
+
yield from self.gen_tag_workunits(tag)
|
|
705
|
+
|
|
706
|
+
def _format_tags_as_structured_properties(
|
|
707
|
+
self, tags: List[SnowflakeTag]
|
|
708
|
+
) -> Dict[StructuredPropertyUrn, str]:
|
|
709
|
+
return {
|
|
710
|
+
StructuredPropertyUrn(
|
|
711
|
+
self.snowflake_identifier(tag.structured_property_identifier())
|
|
712
|
+
): tag.value
|
|
713
|
+
for tag in tags
|
|
714
|
+
}
|
|
682
715
|
|
|
683
716
|
def gen_dataset_workunits(
|
|
684
717
|
self,
|
|
@@ -723,6 +756,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
723
756
|
env=self.config.env,
|
|
724
757
|
)
|
|
725
758
|
|
|
759
|
+
if self.config.extract_tags_as_structured_properties:
|
|
760
|
+
yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
|
|
761
|
+
|
|
726
762
|
yield from add_table_to_schema_container(
|
|
727
763
|
dataset_urn=dataset_urn,
|
|
728
764
|
parent_container_key=schema_container_key,
|
|
@@ -756,16 +792,24 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
756
792
|
)
|
|
757
793
|
|
|
758
794
|
if table.tags:
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
795
|
+
if self.config.extract_tags_as_structured_properties:
|
|
796
|
+
yield from add_structured_properties_to_entity_wu(
|
|
797
|
+
dataset_urn,
|
|
798
|
+
self._format_tags_as_structured_properties(table.tags),
|
|
762
799
|
)
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
800
|
+
else:
|
|
801
|
+
tag_associations = [
|
|
802
|
+
TagAssociation(
|
|
803
|
+
tag=make_tag_urn(
|
|
804
|
+
self.snowflake_identifier(tag.tag_identifier())
|
|
805
|
+
)
|
|
806
|
+
)
|
|
807
|
+
for tag in table.tags
|
|
808
|
+
]
|
|
809
|
+
global_tags = GlobalTags(tag_associations)
|
|
810
|
+
yield MetadataChangeProposalWrapper(
|
|
811
|
+
entityUrn=dataset_urn, aspect=global_tags
|
|
812
|
+
).as_workunit()
|
|
769
813
|
|
|
770
814
|
if isinstance(table, SnowflakeView) and table.view_definition is not None:
|
|
771
815
|
view_properties_aspect = ViewProperties(
|
|
@@ -838,10 +882,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
838
882
|
)
|
|
839
883
|
|
|
840
884
|
def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
|
|
841
|
-
tag_urn = make_tag_urn(self.snowflake_identifier(tag.
|
|
885
|
+
tag_urn = make_tag_urn(self.snowflake_identifier(tag.tag_identifier()))
|
|
842
886
|
|
|
843
887
|
tag_properties_aspect = TagProperties(
|
|
844
|
-
name=tag.
|
|
888
|
+
name=tag.tag_display_name(),
|
|
845
889
|
description=f"Represents the Snowflake tag `{tag._id_prefix_as_str()}` with value `{tag.value}`.",
|
|
846
890
|
)
|
|
847
891
|
|
|
@@ -849,6 +893,41 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
849
893
|
entityUrn=tag_urn, aspect=tag_properties_aspect
|
|
850
894
|
).as_workunit()
|
|
851
895
|
|
|
896
|
+
def gen_tag_as_structured_property_workunits(
|
|
897
|
+
self, tag: SnowflakeTag
|
|
898
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
899
|
+
identifier = self.snowflake_identifier(tag.structured_property_identifier())
|
|
900
|
+
urn = StructuredPropertyUrn(identifier).urn()
|
|
901
|
+
aspect = StructuredPropertyDefinition(
|
|
902
|
+
qualifiedName=identifier,
|
|
903
|
+
displayName=tag.name,
|
|
904
|
+
valueType=DataTypeUrn("datahub.string").urn(),
|
|
905
|
+
entityTypes=[
|
|
906
|
+
EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
|
|
907
|
+
EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
|
|
908
|
+
EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
|
|
909
|
+
],
|
|
910
|
+
lastModified=AuditStamp(
|
|
911
|
+
time=get_sys_time(), actor="urn:li:corpuser:datahub"
|
|
912
|
+
),
|
|
913
|
+
)
|
|
914
|
+
yield MetadataChangeProposalWrapper(
|
|
915
|
+
entityUrn=urn,
|
|
916
|
+
aspect=aspect,
|
|
917
|
+
).as_workunit()
|
|
918
|
+
|
|
919
|
+
def gen_column_tags_as_structured_properties(
|
|
920
|
+
self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
|
|
921
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
922
|
+
for column_name in table.column_tags:
|
|
923
|
+
schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
|
|
924
|
+
yield from add_structured_properties_to_entity_wu(
|
|
925
|
+
schema_field_urn,
|
|
926
|
+
self._format_tags_as_structured_properties(
|
|
927
|
+
table.column_tags[column_name]
|
|
928
|
+
),
|
|
929
|
+
)
|
|
930
|
+
|
|
852
931
|
def gen_schema_metadata(
|
|
853
932
|
self,
|
|
854
933
|
table: Union[SnowflakeTable, SnowflakeView],
|
|
@@ -890,13 +969,14 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
890
969
|
[
|
|
891
970
|
TagAssociation(
|
|
892
971
|
make_tag_urn(
|
|
893
|
-
self.snowflake_identifier(tag.
|
|
972
|
+
self.snowflake_identifier(tag.tag_identifier())
|
|
894
973
|
)
|
|
895
974
|
)
|
|
896
975
|
for tag in table.column_tags[col.name]
|
|
897
976
|
]
|
|
898
977
|
)
|
|
899
978
|
if col.name in table.column_tags
|
|
979
|
+
and not self.config.extract_tags_as_structured_properties
|
|
900
980
|
else None
|
|
901
981
|
),
|
|
902
982
|
)
|
|
@@ -983,8 +1063,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
983
1063
|
)
|
|
984
1064
|
),
|
|
985
1065
|
tags=(
|
|
986
|
-
[
|
|
1066
|
+
[
|
|
1067
|
+
self.snowflake_identifier(tag.tag_identifier())
|
|
1068
|
+
for tag in database.tags
|
|
1069
|
+
]
|
|
987
1070
|
if database.tags
|
|
1071
|
+
and not self.config.extract_tags_as_structured_properties
|
|
1072
|
+
else None
|
|
1073
|
+
),
|
|
1074
|
+
structured_properties=(
|
|
1075
|
+
self._format_tags_as_structured_properties(database.tags)
|
|
1076
|
+
if database.tags and self.config.extract_tags_as_structured_properties
|
|
988
1077
|
else None
|
|
989
1078
|
),
|
|
990
1079
|
)
|
|
@@ -1036,8 +1125,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1036
1125
|
else None
|
|
1037
1126
|
),
|
|
1038
1127
|
tags=(
|
|
1039
|
-
[self.snowflake_identifier(tag.
|
|
1040
|
-
if schema.tags
|
|
1128
|
+
[self.snowflake_identifier(tag.tag_identifier()) for tag in schema.tags]
|
|
1129
|
+
if schema.tags and not self.config.extract_tags_as_structured_properties
|
|
1130
|
+
else None
|
|
1131
|
+
),
|
|
1132
|
+
structured_properties=(
|
|
1133
|
+
self._format_tags_as_structured_properties(schema.tags)
|
|
1134
|
+
if schema.tags and self.config.extract_tags_as_structured_properties
|
|
1041
1135
|
else None
|
|
1042
1136
|
),
|
|
1043
1137
|
)
|
|
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
|
|
|
38
38
|
table_name: Optional[str],
|
|
39
39
|
) -> List[SnowflakeTag]:
|
|
40
40
|
if db_name not in self.tag_cache:
|
|
41
|
-
self.tag_cache[
|
|
42
|
-
db_name
|
|
43
|
-
|
|
41
|
+
self.tag_cache[db_name] = (
|
|
42
|
+
self.data_dictionary.get_tags_for_database_without_propagation(db_name)
|
|
43
|
+
)
|
|
44
44
|
|
|
45
45
|
if domain == SnowflakeObjectDomain.DATABASE:
|
|
46
46
|
return self.tag_cache[db_name].get_database_tags(db_name)
|
|
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
|
|
|
130
130
|
temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
|
|
131
131
|
if self.config.extract_tags == TagOption.without_lineage:
|
|
132
132
|
if db_name not in self.tag_cache:
|
|
133
|
-
self.tag_cache[
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
133
|
+
self.tag_cache[db_name] = (
|
|
134
|
+
self.data_dictionary.get_tags_for_database_without_propagation(
|
|
135
|
+
db_name
|
|
136
|
+
)
|
|
137
137
|
)
|
|
138
138
|
temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
|
|
139
139
|
table_name, schema_name, db_name
|
|
@@ -165,10 +165,20 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
|
|
|
165
165
|
|
|
166
166
|
allowed_tags = []
|
|
167
167
|
for tag in tags:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
168
|
+
identifier = (
|
|
169
|
+
tag._id_prefix_as_str()
|
|
170
|
+
if self.config.extract_tags_as_structured_properties
|
|
171
|
+
else tag.tag_identifier()
|
|
172
|
+
)
|
|
173
|
+
self.report.report_entity_scanned(identifier, "tag")
|
|
174
|
+
|
|
175
|
+
pattern = (
|
|
176
|
+
self.config.structured_property_pattern
|
|
177
|
+
if self.config.extract_tags_as_structured_properties
|
|
178
|
+
else self.config.tag_pattern
|
|
179
|
+
)
|
|
180
|
+
if not pattern.allowed(identifier):
|
|
181
|
+
self.report.report_dropped(identifier)
|
|
172
182
|
else:
|
|
173
183
|
allowed_tags.append(tag)
|
|
174
184
|
return allowed_tags
|
|
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
146
146
|
if not self._should_ingest_usage():
|
|
147
147
|
return
|
|
148
148
|
|
|
149
|
-
self.report.
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
149
|
+
with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
|
|
150
|
+
if self.report.edition == SnowflakeEdition.STANDARD.value:
|
|
151
|
+
logger.info(
|
|
152
|
+
"Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
|
|
153
|
+
)
|
|
154
|
+
return
|
|
155
155
|
|
|
156
|
-
|
|
156
|
+
logger.info("Checking usage date ranges")
|
|
157
157
|
|
|
158
|
-
|
|
158
|
+
self._check_usage_date_ranges()
|
|
159
159
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
160
|
+
# If permission error, execution returns from here
|
|
161
|
+
if (
|
|
162
|
+
self.report.min_access_history_time is None
|
|
163
|
+
or self.report.max_access_history_time is None
|
|
164
|
+
):
|
|
165
|
+
return
|
|
166
166
|
|
|
167
|
-
|
|
168
|
-
|
|
167
|
+
# NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
|
|
168
|
+
# Now, we report the usage as well as operation metadata even if user email is absent
|
|
169
169
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
170
|
+
if self.config.include_usage_stats:
|
|
171
|
+
yield from auto_empty_dataset_usage_statistics(
|
|
172
|
+
self._get_workunits_internal(discovered_datasets),
|
|
173
|
+
config=BaseTimeWindowConfig(
|
|
174
|
+
start_time=self.start_time,
|
|
175
|
+
end_time=self.end_time,
|
|
176
|
+
bucket_duration=self.config.bucket_duration,
|
|
177
|
+
),
|
|
178
|
+
dataset_urns={
|
|
179
|
+
self.identifiers.gen_dataset_urn(dataset_identifier)
|
|
180
|
+
for dataset_identifier in discovered_datasets
|
|
181
|
+
},
|
|
182
|
+
)
|
|
183
183
|
|
|
184
|
-
self.report.
|
|
184
|
+
with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
|
|
185
|
+
if self.config.include_operational_stats:
|
|
186
|
+
# Generate the operation workunits.
|
|
187
|
+
access_events = self._get_snowflake_history()
|
|
188
|
+
for event in access_events:
|
|
189
|
+
yield from self._get_operation_aspect_work_unit(
|
|
190
|
+
event, discovered_datasets
|
|
191
|
+
)
|
|
185
192
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
193
|
+
if self.redundant_run_skip_handler:
|
|
194
|
+
# Update the checkpoint state for this run.
|
|
195
|
+
self.redundant_run_skip_handler.update_state(
|
|
196
|
+
self.config.start_time,
|
|
197
|
+
self.config.end_time,
|
|
198
|
+
self.config.bucket_duration,
|
|
192
199
|
)
|
|
193
200
|
|
|
194
|
-
if self.redundant_run_skip_handler:
|
|
195
|
-
# Update the checkpoint state for this run.
|
|
196
|
-
self.redundant_run_skip_handler.update_state(
|
|
197
|
-
self.config.start_time,
|
|
198
|
-
self.config.end_time,
|
|
199
|
-
self.config.bucket_duration,
|
|
200
|
-
)
|
|
201
|
-
|
|
202
201
|
def _get_workunits_internal(
|
|
203
202
|
self, discovered_datasets: List[str]
|
|
204
203
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
386
385
|
)
|
|
387
386
|
self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
|
|
388
387
|
return
|
|
389
|
-
self.report.access_history_query_secs =
|
|
388
|
+
self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
|
|
390
389
|
|
|
391
390
|
for row in results:
|
|
392
391
|
yield from self._process_snowflake_history_row(row)
|
|
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
434
433
|
self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
|
|
435
434
|
tz=timezone.utc
|
|
436
435
|
)
|
|
437
|
-
self.report.access_history_range_query_secs =
|
|
438
|
-
|
|
436
|
+
self.report.access_history_range_query_secs = timer.elapsed_seconds(
|
|
437
|
+
digits=2
|
|
439
438
|
)
|
|
440
439
|
|
|
441
440
|
def _get_operation_aspect_work_unit(
|
|
@@ -550,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
550
549
|
):
|
|
551
550
|
# NOTE: Generated emails may be incorrect, as email may be different than
|
|
552
551
|
# username@email_domain
|
|
553
|
-
event_dict[
|
|
554
|
-
"
|
|
555
|
-
|
|
552
|
+
event_dict["EMAIL"] = (
|
|
553
|
+
f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
|
|
554
|
+
)
|
|
556
555
|
|
|
557
556
|
if not event_dict["EMAIL"]:
|
|
558
557
|
self.report.rows_missing_email += 1
|
|
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Repor
|
|
|
21
21
|
class SnowflakeStructuredReportMixin(abc.ABC):
|
|
22
22
|
@property
|
|
23
23
|
@abc.abstractmethod
|
|
24
|
-
def structured_reporter(self) -> SourceReport:
|
|
25
|
-
...
|
|
24
|
+
def structured_reporter(self) -> SourceReport: ...
|
|
26
25
|
|
|
27
26
|
|
|
28
27
|
class SnowsightUrlBuilder:
|