acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (211):
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -4,12 +4,14 @@ from typing import Dict, Iterable, List, Optional, Union
4
4
 
5
5
  from datahub.configuration.pattern_utils import is_schema_allowed
6
6
  from datahub.emitter.mce_builder import (
7
+ get_sys_time,
7
8
  make_data_platform_urn,
8
9
  make_dataset_urn_with_platform_instance,
9
10
  make_schema_field_urn,
10
11
  make_tag_urn,
11
12
  )
12
13
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
14
+ from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
13
15
  from datahub.ingestion.api.source import SourceReport
14
16
  from datahub.ingestion.api.workunit import MetadataWorkUnit
15
17
  from datahub.ingestion.glossary.classification_mixin import (
@@ -72,6 +74,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
72
74
  PROFILING,
73
75
  )
74
76
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
77
+ AuditStamp,
75
78
  GlobalTags,
76
79
  Status,
77
80
  SubTypes,
@@ -98,7 +101,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
98
101
  StringType,
99
102
  TimeType,
100
103
  )
104
+ from datahub.metadata.com.linkedin.pegasus2avro.structured import (
105
+ StructuredPropertyDefinition,
106
+ )
101
107
  from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
108
+ from datahub.metadata.urns import (
109
+ ContainerUrn,
110
+ DatasetUrn,
111
+ DataTypeUrn,
112
+ EntityTypeUrn,
113
+ SchemaFieldUrn,
114
+ StructuredPropertyUrn,
115
+ )
102
116
  from datahub.sql_parsing.sql_parsing_aggregator import (
103
117
  KnownLineageMapping,
104
118
  SqlParsingAggregator,
@@ -180,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
180
194
  config, self.data_dictionary, self.report
181
195
  )
182
196
  self.profiler: Optional[SnowflakeProfiler] = profiler
183
- self.snowsight_url_builder: Optional[
184
- SnowsightUrlBuilder
185
- ] = snowsight_url_builder
197
+ self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
198
+ snowsight_url_builder
199
+ )
186
200
 
187
201
  # These are populated as side-effects of get_workunits_internal.
188
202
  self.databases: List[SnowflakeDatabase] = []
@@ -216,21 +230,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
216
230
 
217
231
  try:
218
232
  for snowflake_db in self.databases:
219
- self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
220
- yield from self._process_database(snowflake_db)
233
+ with self.report.new_stage(
234
+ f"{snowflake_db.name}: {METADATA_EXTRACTION}"
235
+ ):
236
+ yield from self._process_database(snowflake_db)
221
237
 
222
- self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
223
- discovered_tables: List[str] = [
224
- self.identifiers.get_dataset_identifier(
225
- table_name, schema.name, db.name
226
- )
227
- for db in self.databases
228
- for schema in db.schemas
229
- for table_name in schema.tables
230
- ]
231
- if self.aggregator:
232
- for entry in self._external_tables_ddl_lineage(discovered_tables):
233
- self.aggregator.add(entry)
238
+ with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
239
+ discovered_tables: List[str] = [
240
+ self.identifiers.get_dataset_identifier(
241
+ table_name, schema.name, db.name
242
+ )
243
+ for db in self.databases
244
+ for schema in db.schemas
245
+ for table_name in schema.tables
246
+ ]
247
+ if self.aggregator:
248
+ for entry in self._external_tables_ddl_lineage(discovered_tables):
249
+ self.aggregator.add(entry)
234
250
 
235
251
  except SnowflakePermissionError as e:
236
252
  self.structured_reporter.failure(
@@ -251,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
251
267
  )
252
268
  return None
253
269
  else:
254
- ischema_databases: List[
255
- SnowflakeDatabase
256
- ] = self.get_databases_from_ischema(databases)
270
+ ischema_databases: List[SnowflakeDatabase] = (
271
+ self.get_databases_from_ischema(databases)
272
+ )
257
273
 
258
274
  if len(ischema_databases) == 0:
259
275
  self.structured_reporter.failure(
@@ -332,8 +348,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
332
348
  yield from self._process_db_schemas(snowflake_db, db_tables)
333
349
 
334
350
  if self.profiler and db_tables:
335
- self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
336
- yield from self.profiler.get_workunits(snowflake_db, db_tables)
351
+ with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
352
+ yield from self.profiler.get_workunits(snowflake_db, db_tables)
337
353
 
338
354
  def _process_db_schemas(
339
355
  self,
@@ -671,14 +687,31 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
671
687
  yield from self.gen_dataset_workunits(view, schema_name, db_name)
672
688
 
673
689
  def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
674
- tag_identifier = tag.identifier()
690
+ use_sp = self.config.extract_tags_as_structured_properties
691
+ identifier = (
692
+ self.snowflake_identifier(tag.structured_property_identifier())
693
+ if use_sp
694
+ else tag.tag_identifier()
695
+ )
675
696
 
676
- if self.report.is_tag_processed(tag_identifier):
697
+ if self.report.is_tag_processed(identifier):
677
698
  return
678
699
 
679
- self.report.report_tag_processed(tag_identifier)
680
-
681
- yield from self.gen_tag_workunits(tag)
700
+ self.report.report_tag_processed(identifier)
701
+ if use_sp:
702
+ yield from self.gen_tag_as_structured_property_workunits(tag)
703
+ else:
704
+ yield from self.gen_tag_workunits(tag)
705
+
706
+ def _format_tags_as_structured_properties(
707
+ self, tags: List[SnowflakeTag]
708
+ ) -> Dict[StructuredPropertyUrn, str]:
709
+ return {
710
+ StructuredPropertyUrn(
711
+ self.snowflake_identifier(tag.structured_property_identifier())
712
+ ): tag.value
713
+ for tag in tags
714
+ }
682
715
 
683
716
  def gen_dataset_workunits(
684
717
  self,
@@ -723,6 +756,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
723
756
  env=self.config.env,
724
757
  )
725
758
 
759
+ if self.config.extract_tags_as_structured_properties:
760
+ yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
761
+
726
762
  yield from add_table_to_schema_container(
727
763
  dataset_urn=dataset_urn,
728
764
  parent_container_key=schema_container_key,
@@ -756,16 +792,24 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
756
792
  )
757
793
 
758
794
  if table.tags:
759
- tag_associations = [
760
- TagAssociation(
761
- tag=make_tag_urn(self.snowflake_identifier(tag.identifier()))
795
+ if self.config.extract_tags_as_structured_properties:
796
+ yield from add_structured_properties_to_entity_wu(
797
+ dataset_urn,
798
+ self._format_tags_as_structured_properties(table.tags),
762
799
  )
763
- for tag in table.tags
764
- ]
765
- global_tags = GlobalTags(tag_associations)
766
- yield MetadataChangeProposalWrapper(
767
- entityUrn=dataset_urn, aspect=global_tags
768
- ).as_workunit()
800
+ else:
801
+ tag_associations = [
802
+ TagAssociation(
803
+ tag=make_tag_urn(
804
+ self.snowflake_identifier(tag.tag_identifier())
805
+ )
806
+ )
807
+ for tag in table.tags
808
+ ]
809
+ global_tags = GlobalTags(tag_associations)
810
+ yield MetadataChangeProposalWrapper(
811
+ entityUrn=dataset_urn, aspect=global_tags
812
+ ).as_workunit()
769
813
 
770
814
  if isinstance(table, SnowflakeView) and table.view_definition is not None:
771
815
  view_properties_aspect = ViewProperties(
@@ -838,10 +882,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
838
882
  )
839
883
 
840
884
  def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
841
- tag_urn = make_tag_urn(self.snowflake_identifier(tag.identifier()))
885
+ tag_urn = make_tag_urn(self.snowflake_identifier(tag.tag_identifier()))
842
886
 
843
887
  tag_properties_aspect = TagProperties(
844
- name=tag.display_name(),
888
+ name=tag.tag_display_name(),
845
889
  description=f"Represents the Snowflake tag `{tag._id_prefix_as_str()}` with value `{tag.value}`.",
846
890
  )
847
891
 
@@ -849,6 +893,41 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
849
893
  entityUrn=tag_urn, aspect=tag_properties_aspect
850
894
  ).as_workunit()
851
895
 
896
+ def gen_tag_as_structured_property_workunits(
897
+ self, tag: SnowflakeTag
898
+ ) -> Iterable[MetadataWorkUnit]:
899
+ identifier = self.snowflake_identifier(tag.structured_property_identifier())
900
+ urn = StructuredPropertyUrn(identifier).urn()
901
+ aspect = StructuredPropertyDefinition(
902
+ qualifiedName=identifier,
903
+ displayName=tag.name,
904
+ valueType=DataTypeUrn("datahub.string").urn(),
905
+ entityTypes=[
906
+ EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
907
+ EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
908
+ EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
909
+ ],
910
+ lastModified=AuditStamp(
911
+ time=get_sys_time(), actor="urn:li:corpuser:datahub"
912
+ ),
913
+ )
914
+ yield MetadataChangeProposalWrapper(
915
+ entityUrn=urn,
916
+ aspect=aspect,
917
+ ).as_workunit()
918
+
919
+ def gen_column_tags_as_structured_properties(
920
+ self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
921
+ ) -> Iterable[MetadataWorkUnit]:
922
+ for column_name in table.column_tags:
923
+ schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
924
+ yield from add_structured_properties_to_entity_wu(
925
+ schema_field_urn,
926
+ self._format_tags_as_structured_properties(
927
+ table.column_tags[column_name]
928
+ ),
929
+ )
930
+
852
931
  def gen_schema_metadata(
853
932
  self,
854
933
  table: Union[SnowflakeTable, SnowflakeView],
@@ -890,13 +969,14 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
890
969
  [
891
970
  TagAssociation(
892
971
  make_tag_urn(
893
- self.snowflake_identifier(tag.identifier())
972
+ self.snowflake_identifier(tag.tag_identifier())
894
973
  )
895
974
  )
896
975
  for tag in table.column_tags[col.name]
897
976
  ]
898
977
  )
899
978
  if col.name in table.column_tags
979
+ and not self.config.extract_tags_as_structured_properties
900
980
  else None
901
981
  ),
902
982
  )
@@ -983,8 +1063,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
983
1063
  )
984
1064
  ),
985
1065
  tags=(
986
- [self.snowflake_identifier(tag.identifier()) for tag in database.tags]
1066
+ [
1067
+ self.snowflake_identifier(tag.tag_identifier())
1068
+ for tag in database.tags
1069
+ ]
987
1070
  if database.tags
1071
+ and not self.config.extract_tags_as_structured_properties
1072
+ else None
1073
+ ),
1074
+ structured_properties=(
1075
+ self._format_tags_as_structured_properties(database.tags)
1076
+ if database.tags and self.config.extract_tags_as_structured_properties
988
1077
  else None
989
1078
  ),
990
1079
  )
@@ -1036,8 +1125,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1036
1125
  else None
1037
1126
  ),
1038
1127
  tags=(
1039
- [self.snowflake_identifier(tag.identifier()) for tag in schema.tags]
1040
- if schema.tags
1128
+ [self.snowflake_identifier(tag.tag_identifier()) for tag in schema.tags]
1129
+ if schema.tags and not self.config.extract_tags_as_structured_properties
1130
+ else None
1131
+ ),
1132
+ structured_properties=(
1133
+ self._format_tags_as_structured_properties(schema.tags)
1134
+ if schema.tags and self.config.extract_tags_as_structured_properties
1041
1135
  else None
1042
1136
  ),
1043
1137
  )
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
38
38
  table_name: Optional[str],
39
39
  ) -> List[SnowflakeTag]:
40
40
  if db_name not in self.tag_cache:
41
- self.tag_cache[
42
- db_name
43
- ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name)
41
+ self.tag_cache[db_name] = (
42
+ self.data_dictionary.get_tags_for_database_without_propagation(db_name)
43
+ )
44
44
 
45
45
  if domain == SnowflakeObjectDomain.DATABASE:
46
46
  return self.tag_cache[db_name].get_database_tags(db_name)
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
130
130
  temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
131
131
  if self.config.extract_tags == TagOption.without_lineage:
132
132
  if db_name not in self.tag_cache:
133
- self.tag_cache[
134
- db_name
135
- ] = self.data_dictionary.get_tags_for_database_without_propagation(
136
- db_name
133
+ self.tag_cache[db_name] = (
134
+ self.data_dictionary.get_tags_for_database_without_propagation(
135
+ db_name
136
+ )
137
137
  )
138
138
  temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
139
139
  table_name, schema_name, db_name
@@ -165,10 +165,20 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
165
165
 
166
166
  allowed_tags = []
167
167
  for tag in tags:
168
- tag_identifier = tag.identifier()
169
- self.report.report_entity_scanned(tag_identifier, "tag")
170
- if not self.config.tag_pattern.allowed(tag_identifier):
171
- self.report.report_dropped(tag_identifier)
168
+ identifier = (
169
+ tag._id_prefix_as_str()
170
+ if self.config.extract_tags_as_structured_properties
171
+ else tag.tag_identifier()
172
+ )
173
+ self.report.report_entity_scanned(identifier, "tag")
174
+
175
+ pattern = (
176
+ self.config.structured_property_pattern
177
+ if self.config.extract_tags_as_structured_properties
178
+ else self.config.tag_pattern
179
+ )
180
+ if not pattern.allowed(identifier):
181
+ self.report.report_dropped(identifier)
172
182
  else:
173
183
  allowed_tags.append(tag)
174
184
  return allowed_tags
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
146
146
  if not self._should_ingest_usage():
147
147
  return
148
148
 
149
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
150
- if self.report.edition == SnowflakeEdition.STANDARD.value:
151
- logger.info(
152
- "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
153
- )
154
- return
149
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
150
+ if self.report.edition == SnowflakeEdition.STANDARD.value:
151
+ logger.info(
152
+ "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
153
+ )
154
+ return
155
155
 
156
- logger.info("Checking usage date ranges")
156
+ logger.info("Checking usage date ranges")
157
157
 
158
- self._check_usage_date_ranges()
158
+ self._check_usage_date_ranges()
159
159
 
160
- # If permission error, execution returns from here
161
- if (
162
- self.report.min_access_history_time is None
163
- or self.report.max_access_history_time is None
164
- ):
165
- return
160
+ # If permission error, execution returns from here
161
+ if (
162
+ self.report.min_access_history_time is None
163
+ or self.report.max_access_history_time is None
164
+ ):
165
+ return
166
166
 
167
- # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
168
- # Now, we report the usage as well as operation metadata even if user email is absent
167
+ # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
168
+ # Now, we report the usage as well as operation metadata even if user email is absent
169
169
 
170
- if self.config.include_usage_stats:
171
- yield from auto_empty_dataset_usage_statistics(
172
- self._get_workunits_internal(discovered_datasets),
173
- config=BaseTimeWindowConfig(
174
- start_time=self.start_time,
175
- end_time=self.end_time,
176
- bucket_duration=self.config.bucket_duration,
177
- ),
178
- dataset_urns={
179
- self.identifiers.gen_dataset_urn(dataset_identifier)
180
- for dataset_identifier in discovered_datasets
181
- },
182
- )
170
+ if self.config.include_usage_stats:
171
+ yield from auto_empty_dataset_usage_statistics(
172
+ self._get_workunits_internal(discovered_datasets),
173
+ config=BaseTimeWindowConfig(
174
+ start_time=self.start_time,
175
+ end_time=self.end_time,
176
+ bucket_duration=self.config.bucket_duration,
177
+ ),
178
+ dataset_urns={
179
+ self.identifiers.gen_dataset_urn(dataset_identifier)
180
+ for dataset_identifier in discovered_datasets
181
+ },
182
+ )
183
183
 
184
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
184
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
185
+ if self.config.include_operational_stats:
186
+ # Generate the operation workunits.
187
+ access_events = self._get_snowflake_history()
188
+ for event in access_events:
189
+ yield from self._get_operation_aspect_work_unit(
190
+ event, discovered_datasets
191
+ )
185
192
 
186
- if self.config.include_operational_stats:
187
- # Generate the operation workunits.
188
- access_events = self._get_snowflake_history()
189
- for event in access_events:
190
- yield from self._get_operation_aspect_work_unit(
191
- event, discovered_datasets
193
+ if self.redundant_run_skip_handler:
194
+ # Update the checkpoint state for this run.
195
+ self.redundant_run_skip_handler.update_state(
196
+ self.config.start_time,
197
+ self.config.end_time,
198
+ self.config.bucket_duration,
192
199
  )
193
200
 
194
- if self.redundant_run_skip_handler:
195
- # Update the checkpoint state for this run.
196
- self.redundant_run_skip_handler.update_state(
197
- self.config.start_time,
198
- self.config.end_time,
199
- self.config.bucket_duration,
200
- )
201
-
202
201
  def _get_workunits_internal(
203
202
  self, discovered_datasets: List[str]
204
203
  ) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
386
385
  )
387
386
  self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
388
387
  return
389
- self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
388
+ self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
390
389
 
391
390
  for row in results:
392
391
  yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
434
433
  self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
435
434
  tz=timezone.utc
436
435
  )
437
- self.report.access_history_range_query_secs = round(
438
- timer.elapsed_seconds(), 2
436
+ self.report.access_history_range_query_secs = timer.elapsed_seconds(
437
+ digits=2
439
438
  )
440
439
 
441
440
  def _get_operation_aspect_work_unit(
@@ -550,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
550
549
  ):
551
550
  # NOTE: Generated emails may be incorrect, as email may be different than
552
551
  # username@email_domain
553
- event_dict[
554
- "EMAIL"
555
- ] = f'{event_dict["USER_NAME"]}@{self.config.email_domain}'.lower()
552
+ event_dict["EMAIL"] = (
553
+ f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
554
+ )
556
555
 
557
556
  if not event_dict["EMAIL"]:
558
557
  self.report.rows_missing_email += 1
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Repor
21
21
  class SnowflakeStructuredReportMixin(abc.ABC):
22
22
  @property
23
23
  @abc.abstractmethod
24
- def structured_reporter(self) -> SourceReport:
25
- ...
24
+ def structured_reporter(self) -> SourceReport: ...
26
25
 
27
26
 
28
27
  class SnowsightUrlBuilder:
@@ -23,7 +23,6 @@ from datahub.ingestion.api.incremental_properties_helper import (
23
23
  from datahub.ingestion.api.source import (
24
24
  CapabilityReport,
25
25
  MetadataWorkUnitProcessor,
26
- Source,
27
26
  SourceCapability,
28
27
  SourceReport,
29
28
  TestableSource,
@@ -212,9 +211,9 @@ class SnowflakeV2Source(
212
211
 
213
212
  self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
214
213
  if self.config.include_usage_stats or self.config.include_operational_stats:
215
- redundant_usage_run_skip_handler: Optional[
216
- RedundantUsageRunSkipHandler
217
- ] = None
214
+ redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
215
+ None
216
+ )
218
217
  if self.config.enable_stateful_usage_ingestion:
219
218
  redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
220
219
  source=self,
@@ -251,11 +250,6 @@ class SnowflakeV2Source(
251
250
 
252
251
  self.add_config_to_report()
253
252
 
254
- @classmethod
255
- def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
256
- config = SnowflakeV2Config.parse_obj(config_dict)
257
- return cls(ctx, config)
258
-
259
253
  @staticmethod
260
254
  def test_connection(config_dict: dict) -> TestConnectionReport:
261
255
  test_report = TestConnectionReport()
@@ -302,7 +296,16 @@ class SnowflakeV2Source(
302
296
 
303
297
  _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
304
298
  privileges: List[SnowflakePrivilege] = []
305
- capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore
299
+ capabilities: List[SourceCapability] = [
300
+ c.capability
301
+ for c in SnowflakeV2Source.get_capabilities() # type: ignore
302
+ if c.capability
303
+ not in (
304
+ SourceCapability.PLATFORM_INSTANCE,
305
+ SourceCapability.DOMAINS,
306
+ SourceCapability.DELETION_DETECTION,
307
+ )
308
+ ]
306
309
 
307
310
  cur = conn.query("select current_role()")
308
311
  current_role = [row["CURRENT_ROLE()"] for row in cur][0]
@@ -480,8 +483,8 @@ class SnowflakeV2Source(
480
483
  identifiers=self.identifiers,
481
484
  )
482
485
 
483
- self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
484
- yield from schema_extractor.get_workunits_internal()
486
+ with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
487
+ yield from schema_extractor.get_workunits_internal()
485
488
 
486
489
  databases = schema_extractor.databases
487
490
 
@@ -513,45 +516,46 @@ class SnowflakeV2Source(
513
516
  discovered_datasets = discovered_tables + discovered_views
514
517
 
515
518
  if self.config.use_queries_v2:
516
- self.report.set_ingestion_stage("*", VIEW_PARSING)
517
- yield from auto_workunit(self.aggregator.gen_metadata())
518
-
519
- self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
520
-
521
- schema_resolver = self.aggregator._schema_resolver
522
-
523
- queries_extractor = SnowflakeQueriesExtractor(
524
- connection=self.connection,
525
- config=SnowflakeQueriesExtractorConfig(
526
- window=self.config,
527
- temporary_tables_pattern=self.config.temporary_tables_pattern,
528
- include_lineage=self.config.include_table_lineage,
529
- include_usage_statistics=self.config.include_usage_stats,
530
- include_operations=self.config.include_operational_stats,
531
- user_email_pattern=self.config.user_email_pattern,
532
- ),
533
- structured_report=self.report,
534
- filters=self.filters,
535
- identifiers=self.identifiers,
536
- schema_resolver=schema_resolver,
537
- discovered_tables=discovered_datasets,
538
- graph=self.ctx.graph,
539
- )
519
+ with self.report.new_stage(f"*: {VIEW_PARSING}"):
520
+ yield from auto_workunit(self.aggregator.gen_metadata())
540
521
 
541
- # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
542
- # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
543
- # it should be pretty straightforward to refactor this and only initialize the aggregator once.
544
- self.report.queries_extractor = queries_extractor.report
545
- yield from queries_extractor.get_workunits_internal()
546
- queries_extractor.close()
522
+ with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
523
+ schema_resolver = self.aggregator._schema_resolver
524
+
525
+ queries_extractor = SnowflakeQueriesExtractor(
526
+ connection=self.connection,
527
+ config=SnowflakeQueriesExtractorConfig(
528
+ window=self.config,
529
+ temporary_tables_pattern=self.config.temporary_tables_pattern,
530
+ include_lineage=self.config.include_table_lineage,
531
+ include_usage_statistics=self.config.include_usage_stats,
532
+ include_operations=self.config.include_operational_stats,
533
+ include_queries=self.config.include_queries,
534
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
535
+ user_email_pattern=self.config.user_email_pattern,
536
+ ),
537
+ structured_report=self.report,
538
+ filters=self.filters,
539
+ identifiers=self.identifiers,
540
+ schema_resolver=schema_resolver,
541
+ discovered_tables=discovered_datasets,
542
+ graph=self.ctx.graph,
543
+ )
544
+
545
+ # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
546
+ # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
547
+ # it should be pretty straightforward to refactor this and only initialize the aggregator once.
548
+ self.report.queries_extractor = queries_extractor.report
549
+ yield from queries_extractor.get_workunits_internal()
550
+ queries_extractor.close()
547
551
 
548
552
  else:
549
553
  if self.lineage_extractor:
550
- self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
551
- self.lineage_extractor.add_time_based_lineage_to_aggregator(
552
- discovered_tables=discovered_tables,
553
- discovered_views=discovered_views,
554
- )
554
+ with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
555
+ self.lineage_extractor.add_time_based_lineage_to_aggregator(
556
+ discovered_tables=discovered_tables,
557
+ discovered_views=discovered_views,
558
+ )
555
559
 
556
560
  # This would emit view and external table ddl lineage
557
561
  # as well as query lineage via lineage_extractor
@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
104
104
  return "\n".join([r for r in res])
105
105
 
106
106
  @typing.no_type_check
107
- def _get_column_type(
108
- self, type_: Union[str, Dict[str, Any]]
109
- ) -> TypeEngine: # noqa: C901
107
+ def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine: # noqa: C901
110
108
  """Derives the data type of the Athena column.
111
109
 
112
110
  This method is overwritten to extend the behavior of PyAthena.