acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
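
Two reorganizations stand out in the list above: the monolithic datahub/ingestion/source/kafka/kafka_connect.py (removed, entry 116) is split into the new datahub/ingestion/source/kafka_connect/ package (entries 42-46), and the per-aspect patch helpers move from datahub/specific/ into datahub/specific/aspect_helpers/ (entries 98-103 added, 117-119 removed). Below is a minimal sketch of how downstream code could keep imports working across both layouts; the module paths come from the list above, while resolve_patch_helper_module itself is a hypothetical helper, not part of the package:

import importlib
from types import ModuleType


def resolve_patch_helper_module(name: str) -> ModuleType:
    """Import a patch helper, preferring the new aspect_helpers layout.

    `name` is e.g. "custom_properties", "ownership", or "structured_properties".
    """
    candidates = (
        f"datahub.specific.aspect_helpers.{name}",  # layout in 0.15.0.1
        f"datahub.specific.{name}",                 # layout in 0.15.0rc25 and earlier
    )
    for candidate in candidates:
        try:
            return importlib.import_module(candidate)
        except ModuleNotFoundError:
            continue
    raise ModuleNotFoundError(f"No patch helper module found for {name!r}")


# Usage: works against either version of the wheel.
# custom_properties = resolve_patch_helper_module("custom_properties")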
datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -300,6 +300,28 @@ class SnowflakeIdentifierBuilder:
     def get_quoted_identifier_for_table(db_name, schema_name, table_name):
         return f'"{db_name}"."{schema_name}"."{table_name}"'

+    # Note - decide how to construct user urns.
+    # Historically urns were created using part before @ from user's email.
+    # Users without email were skipped from both user entries as well as aggregates.
+    # However email is not mandatory field in snowflake user, user_name is always present.
+    def get_user_identifier(
+        self,
+        user_name: str,
+        user_email: Optional[str],
+    ) -> str:
+        if user_email:
+            return self.snowflake_identifier(
+                user_email
+                if self.identifier_config.email_as_user_identifier is True
+                else user_email.split("@")[0]
+            )
+        return self.snowflake_identifier(
+            f"{user_name}@{self.identifier_config.email_domain}"
+            if self.identifier_config.email_as_user_identifier is True
+            and self.identifier_config.email_domain is not None
+            else user_name
+        )
+

 class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
     platform = "snowflake"
@@ -315,24 +337,6 @@ class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
     def identifiers(self) -> SnowflakeIdentifierBuilder:
         return SnowflakeIdentifierBuilder(self.config, self.report)

-    # Note - decide how to construct user urns.
-    # Historically urns were created using part before @ from user's email.
-    # Users without email were skipped from both user entries as well as aggregates.
-    # However email is not mandatory field in snowflake user, user_name is always present.
-    def get_user_identifier(
-        self,
-        user_name: str,
-        user_email: Optional[str],
-        email_as_user_identifier: bool,
-    ) -> str:
-        if user_email:
-            return self.identifiers.snowflake_identifier(
-                user_email
-                if email_as_user_identifier is True
-                else user_email.split("@")[0]
-            )
-        return self.identifiers.snowflake_identifier(user_name)
-
     # TODO: Revisit this after stateful ingestion can commit checkpoint
     # for failures that do not affect the checkpoint
     # TODO: Add additional parameters to match the signature of the .warning and .failure methods
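
The two hunks above move get_user_identifier from SnowflakeCommonMixin onto SnowflakeIdentifierBuilder and change its fallback: the email_as_user_identifier flag now comes from identifier_config rather than a call parameter, and a user without an email can still receive an email-shaped identifier when email_domain is configured. A standalone sketch of the new resolution logic, with the snowflake_identifier() normalization step omitted for brevity (flag names are taken from the hunk; the function and example values are illustrative, not the class method itself):

from typing import Optional


def resolve_user_identifier(
    user_name: str,
    user_email: Optional[str],
    email_as_user_identifier: bool,
    email_domain: Optional[str],
) -> str:
    # Mirrors the new get_user_identifier, minus snowflake_identifier() normalization.
    if user_email:
        return user_email if email_as_user_identifier else user_email.split("@")[0]
    if email_as_user_identifier and email_domain is not None:
        return f"{user_name}@{email_domain}"
    return user_name


# With an email present, behavior matches the removed implementation.
assert resolve_user_identifier("jdoe", "jane.doe@corp.com", True, None) == "jane.doe@corp.com"
assert resolve_user_identifier("jdoe", "jane.doe@corp.com", False, None) == "jane.doe"
# Without an email, the new code can synthesize one from email_domain instead of
# falling back to the bare user_name.
assert resolve_user_identifier("jdoe", None, True, "corp.com") == "jdoe@corp.com"
assert resolve_user_identifier("jdoe", None, False, "corp.com") == "jdoe"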
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -82,6 +82,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
     QUERIES_EXTRACTION,
+    VIEW_PARSING,
 )
 from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -103,7 +104,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_COARSE,
-    "Enabled by default, can be disabled via configuration `include_table_lineage` and `include_view_lineage`",
+    "Enabled by default, can be disabled via configuration `include_table_lineage`",
 )
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -161,35 +162,32 @@ class SnowflakeV2Source(
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
-        self.aggregator: Optional[SqlParsingAggregator] = None
-
-        if self.config.use_queries_v2 or self.config.include_table_lineage:
-            self.aggregator = self._exit_stack.enter_context(
-                SqlParsingAggregator(
-                    platform=self.identifiers.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                    graph=self.ctx.graph,
-                    eager_graph_load=(
-                        # If we're ingestion schema metadata for tables/views, then we will populate
-                        # schemas into the resolver as we go. We only need to do a bulk fetch
-                        # if we're not ingesting schema metadata as part of ingestion.
-                        not (
-                            self.config.include_technical_schema
-                            and self.config.include_tables
-                            and self.config.include_views
-                        )
-                        and not self.config.lazy_schema_resolver
-                    ),
-                    generate_usage_statistics=False,
-                    generate_operations=False,
-                    format_queries=self.config.format_sql_queries,
-                )
+
+        self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
+            SqlParsingAggregator(
+                platform=self.identifiers.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                graph=self.ctx.graph,
+                eager_graph_load=(
+                    # If we're ingestion schema metadata for tables/views, then we will populate
+                    # schemas into the resolver as we go. We only need to do a bulk fetch
+                    # if we're not ingesting schema metadata as part of ingestion.
+                    not (
+                        self.config.include_technical_schema
+                        and self.config.include_tables
+                        and self.config.include_views
+                    )
+                    and not self.config.lazy_schema_resolver
+                ),
+                generate_usage_statistics=False,
+                generate_operations=False,
+                format_queries=self.config.format_sql_queries,
             )
-            self.report.sql_aggregator = self.aggregator.report
+        )
+        self.report.sql_aggregator = self.aggregator.report

         if self.config.include_table_lineage:
-            assert self.aggregator is not None
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
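
The hunk above makes the SqlParsingAggregator unconditional: instead of an Optional attribute guarded by use_queries_v2 / include_table_lineage checks (and the assert further down), it is always created and registered on the source's exit stack. A minimal sketch of that contextlib.ExitStack pattern with placeholder classes; entering a resource through the stack ties its cleanup to a single close() call:

from contextlib import ExitStack


class ToyAggregator:
    # Stand-in for SqlParsingAggregator's context-manager behavior.
    def __enter__(self) -> "ToyAggregator":
        print("aggregator ready")
        return self

    def __exit__(self, *exc) -> None:
        print("aggregator flushed and closed")


class ToySource:
    def __init__(self) -> None:
        self._exit_stack = ExitStack()
        # Always constructed, so no Optional typing or asserts are needed later.
        self.aggregator = self._exit_stack.enter_context(ToyAggregator())

    def close(self) -> None:
        # Unwinds everything registered on the stack, including the aggregator.
        self._exit_stack.close()


source = ToySource()   # prints "aggregator ready"
source.close()         # prints "aggregator flushed and closed"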
@@ -487,8 +485,6 @@ class SnowflakeV2Source(

         databases = schema_extractor.databases

-        # TODO: The checkpoint state for stale entity detection can be committed here.
-
         if self.config.shares:
             yield from SnowflakeSharesHandler(
                 self.config, self.report
@@ -517,15 +513,14 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views

         if self.config.use_queries_v2:
-            self.report.set_ingestion_stage("*", "View Parsing")
-            assert self.aggregator is not None
+            self.report.set_ingestion_stage("*", VIEW_PARSING)
             yield from auto_workunit(self.aggregator.gen_metadata())

             self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)

             schema_resolver = self.aggregator._schema_resolver

-            queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor(
+            queries_extractor = SnowflakeQueriesExtractor(
                 connection=self.connection,
                 config=SnowflakeQueriesExtractorConfig(
                     window=self.config,
@@ -540,6 +535,7 @@ class SnowflakeV2Source(
                 identifiers=self.identifiers,
                 schema_resolver=schema_resolver,
                 discovered_tables=discovered_datasets,
+                graph=self.ctx.graph,
             )

             # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
@@ -550,13 +546,21 @@ class SnowflakeV2Source(
             queries_extractor.close()

         else:
-            if self.config.include_table_lineage and self.lineage_extractor:
+            if self.lineage_extractor:
                 self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
-                yield from self.lineage_extractor.get_workunits(
+                self.lineage_extractor.add_time_based_lineage_to_aggregator(
                     discovered_tables=discovered_tables,
                     discovered_views=discovered_views,
                 )

+            # This would emit view and external table ddl lineage
+            # as well as query lineage via lineage_extractor
+            for mcp in self.aggregator.gen_metadata():
+                yield mcp.as_workunit()
+
+            if self.lineage_extractor:
+                self.lineage_extractor.update_state()
+
         if (
             self.config.include_usage_stats or self.config.include_operational_stats
         ) and self.usage_extractor:
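
In the non-queries-v2 branch above, the lineage extractor no longer yields work units directly: it registers time-based lineage into the shared aggregator, the source then drains gen_metadata() in a single emission pass, and state is committed afterwards via update_state(). A toy illustration of that collect-then-emit flow (not DataHub's API; class and method names here are made up):

from typing import Iterator, List


class ToyLineageAggregator:
    def __init__(self) -> None:
        self._entries: List[str] = []

    def add(self, entry: str) -> None:
        self._entries.append(entry)

    def gen_metadata(self) -> Iterator[str]:
        # Emitted only once, after all producers have registered their results.
        yield from self._entries


def run_lineage_phase(agg: ToyLineageAggregator) -> Iterator[str]:
    # Producers register into the aggregator instead of yielding directly...
    agg.add("view ddl lineage")
    agg.add("external table ddl lineage")
    agg.add("query-based lineage")
    # ...and the source drains the aggregator in one pass, mirroring
    # `for mcp in self.aggregator.gen_metadata(): yield mcp.as_workunit()`.
    yield from agg.gen_metadata()


assert list(run_lineage_phase(ToyLineageAggregator())) == [
    "view ddl lineage",
    "external table ddl lineage",
    "query-based lineage",
]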