acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
     @root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
-            values[
-                "
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
         return values

     def create_credential_temp_file(self) -> str:
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     @property
     def have_table_data_read_permission(self) -> bool:
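The two `include_*` fields added above only apply on the queries-v2 path. A minimal sketch of setting them on the config, assuming the usual pydantic `parse_obj` entry point and a placeholder project id (`project_ids` and `use_queries_v2` are pre-existing `BigQueryV2Config` fields):

    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    # Illustrative only; "my-gcp-project" is a placeholder project id.
    config = BigQueryV2Config.parse_obj(
        {
            "project_ids": ["my-gcp-project"],
            "use_queries_v2": True,
            # Added in this diff; both default to True and only apply with use_queries_v2.
            "include_queries": True,
            "include_query_usage_statistics": True,
        }
    )
    print(config.include_queries, config.include_query_usage_statistics)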
@@ -603,9 +611,9 @@ class BigQueryV2Config(
         cls, v: Optional[List[str]], values: Dict
     ) -> Optional[List[str]]:
         if values.get("use_exported_bigquery_audit_metadata"):
-            assert (
-            )
+            assert v and len(v) > 0, (
+                "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            )

         return v
datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py

@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
             key=platform_resource_key, graph_client=self.graph
         )
         if platform_resource:
-            self.platform_resource_cache[
+            self.platform_resource_cache[platform_resource_key.primary_key] = (
+                platform_resource
+            )
             return platform_resource
         return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
             and platform_resource.resource_info.value
         ):
             try:
-                existing_info: Optional[BigQueryLabelInfo] =
+                existing_info: Optional[BigQueryLabelInfo] = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
+                )
             except ValidationError as e:
                 logger.error(
                     f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
datahub/ingestion/source/bigquery_v2/bigquery_report.py

@@ -190,6 +190,3 @@ class BigQueryV2Report(
     num_skipped_external_table_lineage: int = 0

     queries_extractor: Optional[BigQueryQueriesExtractorReport] = None
-
-    def set_ingestion_stage(self, project_id: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{project_id}: {stage}")
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)

     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str

@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -405,11 +407,11 @@ class BigQuerySchemaGenerator:

         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )

     def _process_project_datasets(
         self,

@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
         platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
             label, tag_urn, managed_by_datahub=False
         )
-        label_info: BigQueryLabelInfo =
+        label_info: BigQueryLabelInfo = (
+            platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                BigQueryLabelInfo
+            )
         )
         tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -1203,8 +1209,8 @@ class BigQuerySchemaGenerator:
             report=self.report,
         )

-        self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] =
-            timer.elapsed_seconds()
+        self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+            timer.elapsed_seconds(digits=2)
         )

     def get_core_table_details(
datahub/ingestion/source/bigquery_v2/lineage.py

@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata

         for project in projects:
-            self.report.
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )

         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.

@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] =
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)

@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
             if parsed_queries[-1]:
                 query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
                 (
-                    {parsed_queries[-1].sql(dialect=
+                    {parsed_queries[-1].sql(dialect="bigquery")}
                 )"""
             else:
                 query = e.query

@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
                     upstream_lineage, temp_table_upstream
                 )

-                upstreams[
+                upstreams[ref_temp_table_upstream] = (
+                    _merge_lineage_edge_columns(
+                        upstreams.get(ref_temp_table_upstream),
+                        collapsed_lineage,
+                    )
                 )
             else:
                 upstreams[upstream_table_ref] = _merge_lineage_edge_columns(

@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
                 dataset_urn
             )
             for gcs_dataset_urn in gcs_urns:
-                schema_metadata_for_gcs: Optional[
+                schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+                    graph.get_schema_metadata(gcs_dataset_urn)
+                )
                 if schema_metadata and schema_metadata_for_gcs:
                     fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
                         dataset_urn,
datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
         # Preprocessing stage that deduplicates the queries using query hash per usage bucket
         # Note: FileBackedDict is an ordered dictionary, so the order of execution of
         # queries is inherently maintained
-        queries_deduped: FileBackedDict[
+        queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+            self.deduplicate_queries(queries)
+        )
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
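The comment in this hunk leans on `FileBackedDict` (see `datahub/utilities/file_backed_collections.py` in the file list) being an ordered, disk-backed mapping. A tiny illustrative sketch of that behaviour, independent of the BigQuery extractor; the keys and values are made up, and the no-argument constructor with default serialization is an assumption:

    from datahub.utilities.file_backed_collections import FileBackedDict

    # Values are written to a temporary SQLite file rather than held in memory,
    # and iteration preserves insertion order.
    queries: FileBackedDict[dict] = FileBackedDict()
    queries["hash-1"] = {"query": "SELECT 1"}
    queries["hash-2"] = {"query": "SELECT 2"}

    for query_hash in queries:
        print(query_hash, queries[query_hash]["query"])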
datahub/ingestion/source/bigquery_v2/usage.py

@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )

     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                for query_hash, count in entry.query_freq
-            ]
-            yield make_usage_workunit(
-                bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                resource=BigQueryTableRef.from_string_name(entry.resource),
-                query_count=entry.query_count,
-                query_freq=query_freq,
-                user_freq=entry.user_freq,
-                column_freq=entry.column_freq,
-                bucket_duration=self.config.bucket_duration,
-                resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                top_n_queries=self.config.usage.top_n_queries,
-                format_sql_queries=self.config.usage.format_sql_queries,
-                queries_character_limit=self.config.usage.queries_character_limit,
-            )
-            self.report.num_usage_workunits_emitted += 1
-        except Exception as e:
-            self.report.warning(
-                message="Unable to generate usage statistics workunit",
-                context=f"{entry.timestamp}, {entry.resource}",
-                exc=e,
-            )

     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:

@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                    project_id
-                    )
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(

@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                 self.report_status(f"usage-extraction-{project_id}", False)

-            self.report.usage_extraction_sec[project_id] =
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )

     def _store_usage_event(

@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
         )

         if event.query_event.default_dataset:
-            custom_properties[
+            custom_properties["defaultDatabase"] = (
+                event.query_event.default_dataset
+            )
         if event.read_event:
             if event.read_event.readReason:
                 custom_properties["readReason"] = event.read_event.readReason
datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )

     def generate_profile(
         self,

datahub/ingestion/source/cassandra/cassandra_utils.py

@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(

@@ -110,10 +107,10 @@ class CassandraToSchemaFieldConverter:

     @staticmethod
     def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
+        type_class: Optional[Type] = (
+            CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                cassandra_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/confluent_schema_registry.py

@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
     def _load_json_schema_with_resolved_references(
         self, schema: Schema, name: str, subject: str
     ) -> dict:
-        imported_json_schemas: List[
+        imported_json_schemas: List[JsonSchemaWrapper] = (
+            self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        )
         schema_dict = json.loads(schema.schema_str)
         reference_map = {}
         for imported_schema in imported_json_schemas:

@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
             )

         elif schema.schema_type == "PROTOBUF":
-            imported_schemas: List[
+            imported_schemas: List[ProtobufSchema] = (
+                self.get_schemas_from_confluent_ref_protobuf(schema)
+            )
             base_name: str = topic.replace(".", "_")
             fields = protobuf_util.protobuf_schema_to_mce_fields(
                 ProtobufSchema(
datahub/ingestion/source/csv_enricher.py

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1

@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu

-        maybe_owners_wu: Optional[
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu

-        maybe_domain_wu: Optional[
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu

-        maybe_description_wu: Optional[
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1

@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0

@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False

-        current_editable_schema_metadata: Optional[
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(

@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
         entity_urn = row["resource"]
         entity_type = Urn.from_string(row["resource"]).get_type()

-        term_associations: List[
+        term_associations: List[GlossaryTermAssociationClass] = (
+            self.maybe_extract_glossary_terms(row)
+        )
         tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
         owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)