acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED

```diff
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
     timestamp: Optional[datetime] = None
     session_id: Optional[str] = None
     query_type: QueryType = QueryType.UNKNOWN
+    query_id: Optional[str] = None


 @dataclasses.dataclass
@@ -283,6 +284,7 @@ class SqlAggregatorReport(Report):

     # Queries.
     num_queries_entities_generated: int = 0
+    num_queries_used_in_lineage: Optional[int] = None
     num_queries_skipped_due_to_filters: int = 0

     # Usage-related.
@@ -618,11 +620,13 @@ class SqlParsingAggregator(Closeable):
         self.report.num_known_query_lineage += 1

         # Generate a fingerprint for the query.
-
-
-
-
-
+        query_fingerprint = known_query_lineage.query_id
+        if not query_fingerprint:
+            with self.report.sql_fingerprinting_timer:
+                query_fingerprint = get_query_fingerprint(
+                    known_query_lineage.query_text,
+                    platform=self.platform.platform_name,
+                )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)

         # Register the query.
@@ -678,10 +682,10 @@ class SqlParsingAggregator(Closeable):
         query_id = self._known_lineage_query_id()

         # Generate CLL if schema of downstream is known
-        column_lineage: List[
-
-
-
+        column_lineage: List[ColumnLineageInfo] = (
+            self._generate_identity_column_lineage(
+                upstream_urn=upstream_urn, downstream_urn=downstream_urn
+            )
         )

         # Register the query.
@@ -1040,9 +1044,9 @@ class SqlParsingAggregator(Closeable):
         temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {}
         for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items():
             for query_id in query_ids:
-                temp_table_schemas[
-
-
+                temp_table_schemas[temp_table_urn] = (
+                    self._inferred_temp_schemas.get(query_id)
+                )
             if temp_table_schemas:
                 break

@@ -1069,9 +1073,9 @@ class SqlParsingAggregator(Closeable):
             schema_resolver=self._schema_resolver,
         )
         if parsed.debug_info.error:
-            self.report.views_parse_failures[
-
-
+            self.report.views_parse_failures[view_urn] = (
+                f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            )
         if parsed.debug_info.table_error:
             self.report.num_views_failed += 1
             return  # we can't do anything with this query
@@ -1197,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
         queries_generated: Set[QueryId] = set()

         yield from self._gen_lineage_mcps(queries_generated)
+        self.report.num_queries_used_in_lineage = len(queries_generated)
         yield from self._gen_usage_statistics_mcps()
         yield from self._gen_operation_mcps(queries_generated)
         yield from self._gen_remaining_queries(queries_generated)
@@ -1578,9 +1583,9 @@ class SqlParsingAggregator(Closeable):
                         temp_query_lineage_info
                     )
                 else:
-                    temp_upstream_queries[
-
-
+                    temp_upstream_queries[upstream] = (
+                        temp_query_lineage_info
+                    )

         # Compute merged upstreams.
         new_upstreams = OrderedSet[UrnStr]()
@@ -1660,9 +1665,9 @@ class SqlParsingAggregator(Closeable):
         composed_of_queries_truncated: LossyList[str] = LossyList()
         for query_id in composed_of_queries:
             composed_of_queries_truncated.append(query_id)
-        self.report.queries_with_temp_upstreams[
-
-
+        self.report.queries_with_temp_upstreams[composite_query_id] = (
+            composed_of_queries_truncated
+        )

         merged_query_text = ";\n\n".join(
             [q.formatted_query_string for q in ordered_queries]
```
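Taken together, the sql_parsing_aggregator hunks let callers supply a precomputed fingerprint through the new KnownQueryLineageInfo.query_id field (get_query_fingerprint now runs only when query_id is unset) and record, via num_queries_used_in_lineage, how many registered queries were actually referenced by the emitted lineage. Below is a minimal, hedged sketch of how a caller might use the new field; the downstream/upstreams field names and the constructor defaults are assumptions based on the existing aggregator API, not shown in this diff:

```python
# Hedged sketch: pass a precomputed query_id so the aggregator skips fingerprinting.
# Field names other than query_text/query_id are assumed from the existing API.
from datahub.sql_parsing.sql_parsing_aggregator import (
    KnownQueryLineageInfo,
    SqlParsingAggregator,
)

aggregator = SqlParsingAggregator(platform="snowflake")
aggregator.add_known_query_lineage(
    KnownQueryLineageInfo(
        query_text="INSERT INTO db.sch.tgt SELECT * FROM db.sch.src",
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.tgt,PROD)",
        upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.src,PROD)"],
        query_id="precomputed-fingerprint-123",  # new in this release; optional
    )
)
# After metadata generation, report.num_queries_used_in_lineage reflects how many
# queries were referenced by the generated lineage MCPs.
```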
datahub/sql_parsing/sqlglot_lineage.py CHANGED

```diff
@@ -442,9 +442,9 @@ def _create_table_ddl_cll(
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []

-    assert (
-        output_table
-    )
+    assert output_table is not None, (
+        "output_table must be set for create DDL statements"
+    )

     create_schema: sqlglot.exp.Schema = statement.this
     sqlglot_columns = create_schema.expressions
```
datahub/sql_parsing/sqlglot_utils.py CHANGED

```diff
@@ -404,7 +404,7 @@ def detach_ctes(
         if new_statement == statement:
             if iteration > 1:
                 logger.debug(
-                    f"Required {iteration+1} iterations to detach and eliminate all CTEs"
+                    f"Required {iteration + 1} iterations to detach and eliminate all CTEs"
                 )
             break
         statement = new_statement
```
datahub/telemetry/stats.py CHANGED

datahub/testing/mcp_diff.py CHANGED

```diff
@@ -246,7 +246,7 @@ class MCPDiff:
         for urn in self.aspect_changes.keys() - self.urns_added - self.urns_removed:
             aspect_map = self.aspect_changes[urn]
             s.append(f"Urn changed, {urn}:")
-            for
+            for aspect_diffs in aspect_map.values():
                 for i, ga in aspect_diffs.aspects_added.items():
                     s.append(self.report_aspect(ga, i, "added"))
                     if verbose:
```
datahub/utilities/file_backed_collections.py CHANGED

```diff
@@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
     _use_sqlite_on_conflict: bool = field(repr=False, default=True)

     def __post_init__(self) -> None:
-        assert (
-
-        )
+        assert self.cache_eviction_batch_size > 0, (
+            "cache_eviction_batch_size must be positive"
+        )

         for reserved_column in ("key", "value", "rowid"):
             if reserved_column in self.extra_columns:
@@ -243,7 +243,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         # This was added in 3.24.0 from 2018-06-04.
         # See https://www.sqlite.org/lang_conflict.html
         if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
-            self.
+            self._use_sqlite_on_conflict = False
         else:
             raise RuntimeError("SQLite version 3.24.0 or later is required")

@@ -261,7 +261,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
             )"""
         )

@@ -316,12 +316,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             f"""INSERT INTO {self.tablename} (
                 key,
                 value
-                {
+                {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
             )
-            VALUES ({
+            VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
             ON CONFLICT (key) DO UPDATE SET
                 value = excluded.value
-                {
+                {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
             """,
             items_to_write,
         )
@@ -332,16 +332,16 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {
+                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                 )
-                VALUES ({
+                VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                 item,
             )
         except sqlite3.IntegrityError:
             self._conn.execute(
                 f"""UPDATE {self.tablename} SET
                     value = ?
-                    {
+                    {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
                 WHERE key = ?""",
                 (*item[1:], item[0]),
             )
```
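The FileBackedDict hunks above only reformat an assertion and the generated SQL strings; the extra-column machinery they reference is unchanged. For context, a hedged sketch of how extra_columns is typically used — the constructor keywords shown are assumptions inferred from the fields visible in this diff (extra_columns, cache_eviction_batch_size), not an API confirmed by it:

```python
# Hedged sketch: a SQLite-backed dict with a derived extra column, assuming
# extra_columns maps a column name to a function of the stored value.
from datahub.utilities.file_backed_collections import FileBackedDict

profiles = FileBackedDict[dict](
    extra_columns={"name_length": lambda value: len(value.get("name", ""))},
)
profiles["user-1"] = {"name": "alice"}
profiles["user-2"] = {"name": "bob"}
profiles.flush()  # persist pending writes; the extra column is written alongside key/value
profiles.close()
```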
datahub/utilities/hive_schema_to_avro.py CHANGED

```diff
@@ -142,10 +142,10 @@ class HiveColumnToAvroConverter:
         fields.append({"name": field_name, "type": field_type})

         if kwargs.get("ustruct_seqn") is not None:
-            struct_name = f
+            struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}"

         else:
-            struct_name = f
+            struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}"
         return {
             "type": "record",
             "name": struct_name,
```
datahub/utilities/logging_manager.py CHANGED

```diff
@@ -130,9 +130,9 @@ class _ColorLogFormatter(logging.Formatter):
         # Mimic our default format, but with color.
         message_fg = self.MESSAGE_COLORS.get(record.levelname)
         return (
-            f
+            f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} "
             f"{click.style(f'{record.levelname:8}', fg=message_fg)} "
-            f
+            f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - "
             f"{click.style(record.getMessage(), fg=message_fg)}"
         )

```
datahub/utilities/lossy_collections.py CHANGED

```diff
@@ -151,9 +151,9 @@ class LossyDict(Dict[_KT, _VT], Generic[_KT, _VT]):
     def as_obj(self) -> Dict[Union[_KT, str], Union[_VT, str]]:
         base_dict: Dict[Union[_KT, str], Union[_VT, str]] = super().copy()  # type: ignore
         if self.sampled:
-            base_dict[
-                "sampled"
-
+            base_dict["sampled"] = (
+                f"{len(self.keys())} sampled of at most {self.total_key_count()} entries."
+            )
         return base_dict

     def total_key_count(self) -> int:
```
datahub/utilities/mapping.py CHANGED

```diff
@@ -349,9 +349,9 @@ class OperationProcessor:
                         elements=[institutional_memory_element]
                     )

-                    aspect_map[
-
-
+                    aspect_map[Constants.ADD_DOC_LINK_OPERATION] = (
+                        institutional_memory_aspect
+                    )
                 else:
                     raise Exception(
                         f"Expected 1 item of type list for the documentation_link meta_mapping config,"
```
datahub/utilities/memory_footprint.py CHANGED

```diff
@@ -1,7 +1,7 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any,
+from typing import Any, Iterator


 def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """

-    dict_handler:
+    def dict_handler(d: dict) -> Iterator[Any]:
+        return chain.from_iterable(d.items())

     all_handlers = {
         tuple: iter,
```
datahub/utilities/perf_timer.py CHANGED

```diff
@@ -57,7 +57,7 @@ class PerfTimer(AbstractContextManager):
         self.finish()
         return None

-    def elapsed_seconds(self) -> float:
+    def elapsed_seconds(self, digits: int = 4) -> float:
         """
         Returns the elapsed time in seconds.
         """
@@ -65,11 +65,18 @@ class PerfTimer(AbstractContextManager):
             return self._past_active_time

         if self.end_time is None:
-
+            elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
         else:
-
+            elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+        return round(elapsed, digits)

     def assert_timer_is_running(self) -> None:
+        if not self.is_running():
+            self._error_state = True
+            logger.warning("Did you forget to start the timer ?")
+
+    def is_running(self) -> bool:
         """
         Returns true if timer is in running state.
         Timer is in NOT in running state if
@@ -77,9 +84,7 @@ class PerfTimer(AbstractContextManager):
         2. it is in paused state.
         3. it had been started and finished in the past but not started again.
         """
-
-            self._error_state = True
-            logger.warning("Did you forget to start the timer ?")
+        return self.start_time is not None and not self.paused and self.end_time is None

     def __repr__(self) -> str:
         return repr(self.as_obj())
```
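The PerfTimer hunks add an optional digits argument to elapsed_seconds (rounding, default 4), move the missing-start warning into assert_timer_is_running, and introduce a boolean is_running. Below is a hedged sketch of the resulting usage, relying on the context-manager behaviour implied by the AbstractContextManager base above:

```python
# Hedged sketch of the updated PerfTimer API shown in the hunks above.
import time

from datahub.utilities.perf_timer import PerfTimer

timer = PerfTimer()
with timer:                      # entering the context starts the timer
    time.sleep(0.25)
    assert timer.is_running()    # started, not paused, not finished

assert not timer.is_running()    # finished once the context manager exits
print(timer.elapsed_seconds(digits=2))  # rounded elapsed time, e.g. 0.25
```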
datahub/utilities/serialized_lru_cache.py CHANGED

```diff
@@ -41,7 +41,9 @@ def serialized_lru_cache(
     def wrapper(*args: _F.args, **kwargs: _F.kwargs) -> _T:
         # We need a type ignore here because there's no way for us to require that
         # the args and kwargs are hashable while using ParamSpec.
-        key: _Key = cachetools.keys.hashkey(
+        key: _Key = cachetools.keys.hashkey(
+            *args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k}
+        )  # type: ignore

         with cache_lock:
             if key in cache:
```
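With the serialized_lru_cache change, any keyword argument whose name contains "cache_exclude" is dropped from the cache key, so per-call context objects no longer fragment the cache. A hedged sketch; the decorator's maxsize parameter is an assumption based on its lru_cache-style name:

```python
# Hedged sketch: kwargs named like "cache_exclude_*" do not affect the cache key.
from datahub.utilities.serialized_lru_cache import serialized_lru_cache


@serialized_lru_cache(maxsize=128)  # maxsize parameter assumed
def describe_table(table: str, *, cache_exclude_report: object = None) -> str:
    return f"described {table}"


describe_table("db.sch.t1", cache_exclude_report=object())
describe_table("db.sch.t1", cache_exclude_report=object())  # second call is a cache hit
```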
datahub/utilities/sqlalchemy_query_combiner.py CHANGED

```diff
@@ -160,12 +160,12 @@ class SQLAlchemyQueryCombiner:
     _greenlets_by_thread_lock: threading.Lock = dataclasses.field(
         default_factory=lambda: threading.Lock()
     )
-    _queries_by_thread: Dict[
-
-
-    _greenlets_by_thread: Dict[
-
-
+    _queries_by_thread: Dict[greenlet.greenlet, Dict[str, _QueryFuture]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(dict))
+    )
+    _greenlets_by_thread: Dict[greenlet.greenlet, Set[greenlet.greenlet]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(set))
+    )

     @staticmethod
     def _generate_sql_safe_identifier() -> str:
```
datahub/utilities/sqllineage_patch.py CHANGED

```diff
@@ -8,7 +8,7 @@ from sqllineage.utils.constant import EdgeType

 # Patch based on sqllineage v1.3.3
 def end_of_query_cleanup_patch(self, holder: SubQueryLineageHolder) -> None:  # type: ignore
-    for
+    for tbl in self.tables:
         holder.add_read(tbl)
     self.union_barriers.append((len(self.columns), len(self.tables)))

```
datahub/utilities/stats_collections.py CHANGED

```diff
@@ -48,7 +48,9 @@ class TopKDict(DefaultDict[_KT, _VT]):
             total_value: Union[_VT, str] = sum(trimmed_dict.values())  # type: ignore
         except Exception:
             total_value = ""
-        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] =
+        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] = (  # type: ignore
+            total_value  # type: ignore
+        )
         return trimmed_dict


```
datahub/utilities/urns/_urn_base.py CHANGED

```diff
@@ -1,7 +1,7 @@
 import functools
 import urllib.parse
 from abc import abstractmethod
-from typing import ClassVar, Dict, List, Optional, Type
+from typing import ClassVar, Dict, List, Optional, Type, Union

 from deprecated import deprecated
 from typing_extensions import Self
@@ -86,12 +86,24 @@ class Urn:
         return self._entity_ids

     @classmethod
-    def from_string(cls, urn_str: str) -> Self:
-        """
-
+    def from_string(cls, urn_str: Union[str, "Urn"], /) -> Self:
+        """Create an Urn from its string representation.
+
+        When called against the base Urn class, this method will return a more specific Urn type where possible.
+
+        >>> from datahub.metadata.urns import DatasetUrn, Urn
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> urn = Urn.from_string(urn_str)
+        >>> assert isinstance(urn, DatasetUrn)
+
+        When called against a specific Urn type (e.g. DatasetUrn.from_string), this method can
+        also be used for type narrowing.
+
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> assert DatasetUrn.from_string(urn_str)

         Args:
-            urn_str: The string representation of the Urn.
+            urn_str: The string representation of the urn. Also accepts an existing Urn instance.

         Returns:
             Urn of the given string representation.
@@ -100,6 +112,17 @@ class Urn:
             InvalidUrnError: If the string representation is in invalid format.
         """

+        if isinstance(urn_str, Urn):
+            if issubclass(cls, _SpecificUrn) and isinstance(urn_str, cls):
+                # Fast path - we're already the right type.
+
+                # I'm not really sure why we need a type ignore here, but mypy doesn't really
+                # understand the isinstance check above.
+                return urn_str  # type: ignore
+
+            # Fall through, so that we can convert a generic Urn to a specific Urn type.
+            urn_str = urn_str.urn()
+
         # TODO: Add handling for url encoded urns e.g. urn%3A ...

         if not urn_str.startswith("urn:li:"):
```
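Per the _urn_base.py hunks, Urn.from_string now accepts an existing Urn instance in addition to a string: if the instance already matches the target type it is returned as-is, otherwise it is re-parsed from its urn() string, which also allows narrowing a generic Urn to a specific subclass. For example:

```python
# Based on the docstring and isinstance handling added above.
from datahub.metadata.urns import DatasetUrn, Urn

generic = Urn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)"
)
assert isinstance(generic, DatasetUrn)  # base-class call returns the specific type

# New in this release: an Urn instance is accepted directly and narrows the type.
specific = DatasetUrn.from_string(generic)
assert isinstance(specific, DatasetUrn)
```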
datahub/utilities/urns/urn_iter.py CHANGED

```diff
@@ -21,7 +21,7 @@ def _add_prefix_to_paths(


 def list_urns_with_path(
-    model: Union[DictWrapper, MetadataChangeProposalWrapper]
+    model: Union[DictWrapper, MetadataChangeProposalWrapper],
 ) -> List[Tuple[str, _Path]]:
     """List urns in the given model with their paths.

@@ -145,7 +145,7 @@ def lowercase_dataset_urns(
         MetadataChangeEventClass,
         MetadataChangeProposalClass,
         MetadataChangeProposalWrapper,
-    ]
+    ],
 ) -> None:
     def modify_urn(urn: str) -> str:
         if guess_entity_type(urn) == "dataset":
```