acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
|
@@ -300,6 +300,28 @@ class SnowflakeIdentifierBuilder:
|
|
|
300
300
|
def get_quoted_identifier_for_table(db_name, schema_name, table_name):
|
|
301
301
|
return f'"{db_name}"."{schema_name}"."{table_name}"'
|
|
302
302
|
|
|
303
|
+
# Note - decide how to construct user urns.
|
|
304
|
+
# Historically urns were created using part before @ from user's email.
|
|
305
|
+
# Users without email were skipped from both user entries as well as aggregates.
|
|
306
|
+
# However email is not mandatory field in snowflake user, user_name is always present.
|
|
307
|
+
def get_user_identifier(
|
|
308
|
+
self,
|
|
309
|
+
user_name: str,
|
|
310
|
+
user_email: Optional[str],
|
|
311
|
+
) -> str:
|
|
312
|
+
if user_email:
|
|
313
|
+
return self.snowflake_identifier(
|
|
314
|
+
user_email
|
|
315
|
+
if self.identifier_config.email_as_user_identifier is True
|
|
316
|
+
else user_email.split("@")[0]
|
|
317
|
+
)
|
|
318
|
+
return self.snowflake_identifier(
|
|
319
|
+
f"{user_name}@{self.identifier_config.email_domain}"
|
|
320
|
+
if self.identifier_config.email_as_user_identifier is True
|
|
321
|
+
and self.identifier_config.email_domain is not None
|
|
322
|
+
else user_name
|
|
323
|
+
)
|
|
324
|
+
|
|
303
325
|
|
|
304
326
|
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
305
327
|
platform = "snowflake"
|
|
@@ -315,24 +337,6 @@ class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
|
315
337
|
def identifiers(self) -> SnowflakeIdentifierBuilder:
|
|
316
338
|
return SnowflakeIdentifierBuilder(self.config, self.report)
|
|
317
339
|
|
|
318
|
-
# Note - decide how to construct user urns.
|
|
319
|
-
# Historically urns were created using part before @ from user's email.
|
|
320
|
-
# Users without email were skipped from both user entries as well as aggregates.
|
|
321
|
-
# However email is not mandatory field in snowflake user, user_name is always present.
|
|
322
|
-
def get_user_identifier(
|
|
323
|
-
self,
|
|
324
|
-
user_name: str,
|
|
325
|
-
user_email: Optional[str],
|
|
326
|
-
email_as_user_identifier: bool,
|
|
327
|
-
) -> str:
|
|
328
|
-
if user_email:
|
|
329
|
-
return self.identifiers.snowflake_identifier(
|
|
330
|
-
user_email
|
|
331
|
-
if email_as_user_identifier is True
|
|
332
|
-
else user_email.split("@")[0]
|
|
333
|
-
)
|
|
334
|
-
return self.identifiers.snowflake_identifier(user_name)
|
|
335
|
-
|
|
336
340
|
# TODO: Revisit this after stateful ingestion can commit checkpoint
|
|
337
341
|
# for failures that do not affect the checkpoint
|
|
338
342
|
# TODO: Add additional parameters to match the signature of the .warning and .failure methods
|
|
@@ -82,6 +82,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
|
|
|
82
82
|
LINEAGE_EXTRACTION,
|
|
83
83
|
METADATA_EXTRACTION,
|
|
84
84
|
QUERIES_EXTRACTION,
|
|
85
|
+
VIEW_PARSING,
|
|
85
86
|
)
|
|
86
87
|
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
|
|
87
88
|
from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
@@ -103,7 +104,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
103
104
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
104
105
|
@capability(
|
|
105
106
|
SourceCapability.LINEAGE_COARSE,
|
|
106
|
-
"Enabled by default, can be disabled via configuration `include_table_lineage`
|
|
107
|
+
"Enabled by default, can be disabled via configuration `include_table_lineage`",
|
|
107
108
|
)
|
|
108
109
|
@capability(
|
|
109
110
|
SourceCapability.LINEAGE_FINE,
|
|
@@ -161,35 +162,32 @@ class SnowflakeV2Source(
|
|
|
161
162
|
# For database, schema, tables, views, etc
|
|
162
163
|
self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
|
|
163
164
|
self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
generate_operations=False,
|
|
186
|
-
format_queries=self.config.format_sql_queries,
|
|
187
|
-
)
|
|
165
|
+
|
|
166
|
+
self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
|
|
167
|
+
SqlParsingAggregator(
|
|
168
|
+
platform=self.identifiers.platform,
|
|
169
|
+
platform_instance=self.config.platform_instance,
|
|
170
|
+
env=self.config.env,
|
|
171
|
+
graph=self.ctx.graph,
|
|
172
|
+
eager_graph_load=(
|
|
173
|
+
# If we're ingesting schema metadata for tables/views, then we will populate
|
|
174
|
+
# schemas into the resolver as we go. We only need to do a bulk fetch
|
|
175
|
+
# if we're not ingesting schema metadata as part of ingestion.
|
|
176
|
+
not (
|
|
177
|
+
self.config.include_technical_schema
|
|
178
|
+
and self.config.include_tables
|
|
179
|
+
and self.config.include_views
|
|
180
|
+
)
|
|
181
|
+
and not self.config.lazy_schema_resolver
|
|
182
|
+
),
|
|
183
|
+
generate_usage_statistics=False,
|
|
184
|
+
generate_operations=False,
|
|
185
|
+
format_queries=self.config.format_sql_queries,
|
|
188
186
|
)
|
|
189
|
-
|
|
187
|
+
)
|
|
188
|
+
self.report.sql_aggregator = self.aggregator.report
|
|
190
189
|
|
|
191
190
|
if self.config.include_table_lineage:
|
|
192
|
-
assert self.aggregator is not None
|
|
193
191
|
redundant_lineage_run_skip_handler: Optional[
|
|
194
192
|
RedundantLineageRunSkipHandler
|
|
195
193
|
] = None
|
|
@@ -487,8 +485,6 @@ class SnowflakeV2Source(
|
|
|
487
485
|
|
|
488
486
|
databases = schema_extractor.databases
|
|
489
487
|
|
|
490
|
-
# TODO: The checkpoint state for stale entity detection can be committed here.
|
|
491
|
-
|
|
492
488
|
if self.config.shares:
|
|
493
489
|
yield from SnowflakeSharesHandler(
|
|
494
490
|
self.config, self.report
|
|
@@ -517,15 +513,14 @@ class SnowflakeV2Source(
|
|
|
517
513
|
discovered_datasets = discovered_tables + discovered_views
|
|
518
514
|
|
|
519
515
|
if self.config.use_queries_v2:
|
|
520
|
-
self.report.set_ingestion_stage("*",
|
|
521
|
-
assert self.aggregator is not None
|
|
516
|
+
self.report.set_ingestion_stage("*", VIEW_PARSING)
|
|
522
517
|
yield from auto_workunit(self.aggregator.gen_metadata())
|
|
523
518
|
|
|
524
519
|
self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
|
|
525
520
|
|
|
526
521
|
schema_resolver = self.aggregator._schema_resolver
|
|
527
522
|
|
|
528
|
-
queries_extractor
|
|
523
|
+
queries_extractor = SnowflakeQueriesExtractor(
|
|
529
524
|
connection=self.connection,
|
|
530
525
|
config=SnowflakeQueriesExtractorConfig(
|
|
531
526
|
window=self.config,
|
|
@@ -540,6 +535,7 @@ class SnowflakeV2Source(
|
|
|
540
535
|
identifiers=self.identifiers,
|
|
541
536
|
schema_resolver=schema_resolver,
|
|
542
537
|
discovered_tables=discovered_datasets,
|
|
538
|
+
graph=self.ctx.graph,
|
|
543
539
|
)
|
|
544
540
|
|
|
545
541
|
# TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
|
|
@@ -550,13 +546,21 @@ class SnowflakeV2Source(
|
|
|
550
546
|
queries_extractor.close()
|
|
551
547
|
|
|
552
548
|
else:
|
|
553
|
-
if self.
|
|
549
|
+
if self.lineage_extractor:
|
|
554
550
|
self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
|
|
555
|
-
|
|
551
|
+
self.lineage_extractor.add_time_based_lineage_to_aggregator(
|
|
556
552
|
discovered_tables=discovered_tables,
|
|
557
553
|
discovered_views=discovered_views,
|
|
558
554
|
)
|
|
559
555
|
|
|
556
|
+
# This would emit view and external table ddl lineage
|
|
557
|
+
# as well as query lineage via lineage_extractor
|
|
558
|
+
for mcp in self.aggregator.gen_metadata():
|
|
559
|
+
yield mcp.as_workunit()
|
|
560
|
+
|
|
561
|
+
if self.lineage_extractor:
|
|
562
|
+
self.lineage_extractor.update_state()
|
|
563
|
+
|
|
560
564
|
if (
|
|
561
565
|
self.config.include_usage_stats or self.config.include_operational_stats
|
|
562
566
|
) and self.usage_extractor:
|