acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)
 
     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str
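The recurring change in this release replaces manual stage bookkeeping on the report object with a `with self.report.new_stage(...)` block; the stage-report helper lives in datahub/ingestion/source_report/ingestion_stage.py, which is also updated here. Below is a minimal sketch of how such a context manager can be built. The `StageTimingReport` class is a hypothetical stand-in for illustration only, not the project's actual implementation.

import time
from contextlib import contextmanager


class StageTimingReport:
    """Hypothetical stand-in for DataHub's ingestion-stage report mixin."""

    def __init__(self) -> None:
        self.stage_timings_sec: dict = {}

    @contextmanager
    def new_stage(self, stage: str):
        # Time the stage body and record the duration on exit, even if it raises.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.stage_timings_sec[stage] = round(time.perf_counter() - start, 2)


report = StageTimingReport()
with report.new_stage("project-1: Metadata Extraction"):
    pass  # the real sources yield work units inside this block
print(report.stage_timings_sec)

Using a context manager means the stage end (and its timing) is recorded even when the body raises, which is why the hunks above and below no longer need explicit start/stop calls.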
@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo =
-
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 

@@ -405,11 +407,11 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )
 
     def _process_project_datasets(
         self,

@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo =
-
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo =
-
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 
@@ -1203,8 +1209,8 @@ class BigQuerySchemaGenerator:
                 report=self.report,
             )
 
-            self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] =
-                timer.elapsed_seconds()
+            self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+                timer.elapsed_seconds(digits=2)
             )
 
     def get_core_table_details(
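Several hunks in this release switch from `timer.elapsed_seconds()` to `timer.elapsed_seconds(digits=2)`, which suggests the timer utility in datahub/utilities/perf_timer.py (also changed in this release) now rounds its result. The class below is a rough, illustrative equivalent of such a timer, not the project's actual PerfTimer.

import time
from typing import Optional


class SimplePerfTimer:
    """Illustrative context-manager timer with an elapsed_seconds(digits=...) accessor."""

    def __enter__(self) -> "SimplePerfTimer":
        self._start = time.perf_counter()
        self._end: Optional[float] = None
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        end = self._end if self._end is not None else time.perf_counter()
        elapsed = end - self._start
        return round(elapsed, digits) if digits is not None else elapsed


with SimplePerfTimer() as timer:
    sum(range(100_000))  # placeholder workload
print(timer.elapsed_seconds(digits=2))

Rounding at the source keeps the per-project timing dictionaries in the ingestion report compact and human-readable.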
datahub/ingestion/source/bigquery_v2/lineage.py

@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
         projects = ["*"]  # project_id not used when using exported metadata
 
         for project in projects:
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )
 
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.

@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] =
-
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)

@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
             if parsed_queries[-1]:
                 query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
                 (
-                    {parsed_queries[-1].sql(dialect=
+                    {parsed_queries[-1].sql(dialect="bigquery")}
                 )"""
             else:
                 query = e.query

@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
                         upstream_lineage, temp_table_upstream
                     )
 
-                    upstreams[
-
-
-
-
+                    upstreams[ref_temp_table_upstream] = (
+                        _merge_lineage_edge_columns(
+                            upstreams.get(ref_temp_table_upstream),
+                            collapsed_lineage,
+                        )
                     )
                 else:
                     upstreams[upstream_table_ref] = _merge_lineage_edge_columns(

@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
                 dataset_urn
             )
             for gcs_dataset_urn in gcs_urns:
-                schema_metadata_for_gcs: Optional[
-
-
+                schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+                    graph.get_schema_metadata(gcs_dataset_urn)
+                )
                 if schema_metadata and schema_metadata_for_gcs:
                     fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
                         dataset_urn,
datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
         # Preprocessing stage that deduplicates the queries using query hash per usage bucket
         # Note: FileBackedDict is an ordered dictionary, so the order of execution of
         # queries is inherently maintained
-        queries_deduped: FileBackedDict[
-
-
+        queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+            self.deduplicate_queries(queries)
+        )
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
 
datahub/ingestion/source/bigquery_v2/usage.py

@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
    ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                    )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:

@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                        project_id
-                    )
-
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(

@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                    )
                    self.report_status(f"usage-extraction-{project_id}", False)
 
-                self.report.usage_extraction_sec[project_id] =
-
+                self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                    digits=2
                )
 
     def _store_usage_event(

@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
            )
 
            if event.query_event.default_dataset:
-                custom_properties[
-
-
+                custom_properties["defaultDatabase"] = (
+                    event.query_event.default_dataset
+                )
        if event.read_event:
            if event.read_event.readReason:
                custom_properties["readReason"] = event.read_event.readReason
datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
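The Cassandra profiler hunk above fans table profiling out over a thread pool and streams results back as each task finishes. Here is a compact sketch of the same submit/as_completed pattern using only the standard library; profile_table, the keyspace, and the table list are placeholders, not the source's real API.

from concurrent.futures import ThreadPoolExecutor, as_completed


def profile_table(keyspace: str, table: str) -> str:
    # Placeholder for real profiling work against one table.
    return f"profiled {keyspace}.{table}"


def profile_keyspace(keyspace: str, tables: list, max_workers: int = 4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_table = {
            executor.submit(profile_table, keyspace, table): table for table in tables
        }
        for future in as_completed(future_to_table):
            table = future_to_table[future]
            try:
                yield future.result()
            except Exception as exc:
                # Mirror the diff: record the failure for this table and keep going.
                print(f"Failed to profile {keyspace}.{table}: {exc}")


for result in profile_keyspace("ks1", ["t1", "t2"]):
    print(result)

Consuming futures with as_completed lets slow tables overlap with fast ones while one failure only skips that table instead of aborting the whole keyspace.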
datahub/ingestion/source/cassandra/cassandra_utils.py

@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(

@@ -110,10 +107,10 @@ class CassandraToSchemaFieldConverter:
 
     @staticmethod
     def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-
-
-
+        type_class: Optional[Type] = (
+            CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                cassandra_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/confluent_schema_registry.py

@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
     def _load_json_schema_with_resolved_references(
         self, schema: Schema, name: str, subject: str
     ) -> dict:
-        imported_json_schemas: List[
-
-
+        imported_json_schemas: List[JsonSchemaWrapper] = (
+            self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        )
         schema_dict = json.loads(schema.schema_str)
         reference_map = {}
         for imported_schema in imported_json_schemas:

@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
             )
 
         elif schema.schema_type == "PROTOBUF":
-            imported_schemas: List[
-
-
+            imported_schemas: List[ProtobufSchema] = (
+                self.get_schemas_from_confluent_ref_protobuf(schema)
+            )
             base_name: str = topic.replace(".", "_")
             fields = protobuf_util.protobuf_schema_to_mce_fields(
                 ProtobufSchema(
datahub/ingestion/source/csv_enricher.py

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-
-
-
-
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1

@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu
 
-        maybe_owners_wu: Optional[
-
-
-
-
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu
 
-        maybe_domain_wu: Optional[
-
-
-
-
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu
 
-        maybe_description_wu: Optional[
-
-
-
-
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1

@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
-
-
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0

@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False
 
-        current_editable_schema_metadata: Optional[
-
-
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(

@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
         entity_urn = row["resource"]
         entity_type = Urn.from_string(row["resource"]).get_type()
 
-        term_associations: List[
-
-
+        term_associations: List[GlossaryTermAssociationClass] = (
+            self.maybe_extract_glossary_terms(row)
+        )
         tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
         owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)
 
datahub/ingestion/source/datahub/config.py

@@ -25,6 +25,10 @@ DEFAULT_EXCLUDE_ASPECTS = {
     "globalSettingsKey",
     "globalSettingsInfo",
     "testResults",
+    "dataHubExecutionRequestKey",
+    "dataHubExecutionRequestInput",
+    "dataHubExecutionRequestSignal",
+    "dataHubExecutionRequestResult",
 }
 
 
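The hunk above extends DEFAULT_EXCLUDE_ASPECTS with the four dataHubExecutionRequest* aspects, so execution-request records are skipped by default when reading metadata back out of a DataHub backend. A small illustration of how such an exclusion set is typically applied when filtering rows; the row shape below is an assumption for the example, not taken from the source.

DEFAULT_EXCLUDE_ASPECTS = {
    "globalSettingsKey",
    "globalSettingsInfo",
    "testResults",
    "dataHubExecutionRequestKey",
    "dataHubExecutionRequestInput",
    "dataHubExecutionRequestSignal",
    "dataHubExecutionRequestResult",
}


def keep_row(row: dict, exclude_aspects: set = DEFAULT_EXCLUDE_ASPECTS) -> bool:
    # Assumed row shape: {"urn": ..., "aspect": "<aspect name>", ...}
    return row.get("aspect") not in exclude_aspects


rows = [
    {"urn": "urn:li:dataset:1", "aspect": "schemaMetadata"},
    {"urn": "urn:li:dataset:1", "aspect": "dataHubExecutionRequestResult"},
]
print([r for r in rows if keep_row(r)])  # only the schemaMetadata row survives

Execution requests are operational records rather than catalog metadata, so excluding them keeps a DataHub-to-DataHub copy focused on the entities themselves.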
@@ -108,6 +112,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
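The hunk above wraps the streaming read in an explicit transaction because PostgreSQL keeps a server-side cursor open only for the lifetime of a transaction. A minimal SQLAlchemy sketch of the same idea; the engine URL, table, and query below are placeholders, not the source's actual values.

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/datahub")  # placeholder URL


def stream_rows(query: str = "SELECT urn, aspect FROM metadata_aspect_v2"):  # placeholder query
    with engine.connect() as conn:
        with conn.begin():  # transaction required for a PostgreSQL server-side cursor
            # stream_results=True asks the driver to fetch rows incrementally
            # instead of materializing the full result set in memory.
            result = conn.execution_options(stream_results=True).execute(text(query))
            for row in result:
                yield row

Without the surrounding transaction, the driver silently falls back to a client-side fetch on some dialects, which defeats the point of streaming large metadata tables.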
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
                     )
                 except Exception as e:
                     logger.warning(
-                        f
+                        f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
                     )
                     self.report.num_database_parse_errors += 1
                     self.report.database_parse_errors.setdefault(
datahub/ingestion/source/datahub/datahub_source.py

@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)