acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -284,6 +284,7 @@ class SqlAggregatorReport(Report):
 
     # Queries.
     num_queries_entities_generated: int = 0
+    num_queries_used_in_lineage: Optional[int] = None
     num_queries_skipped_due_to_filters: int = 0
 
     # Usage-related.

@@ -681,10 +682,10 @@ class SqlParsingAggregator(Closeable):
         query_id = self._known_lineage_query_id()
 
         # Generate CLL if schema of downstream is known
-        column_lineage: List[
-            ColumnLineageInfo
-        ] = self._generate_identity_column_lineage(
-            upstream_urn=upstream_urn, downstream_urn=downstream_urn
+        column_lineage: List[ColumnLineageInfo] = (
+            self._generate_identity_column_lineage(
+                upstream_urn=upstream_urn, downstream_urn=downstream_urn
+            )
         )
 
         # Register the query.

@@ -1043,9 +1044,9 @@ class SqlParsingAggregator(Closeable):
         temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {}
         for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items():
             for query_id in query_ids:
-                temp_table_schemas[
-                    temp_table_urn
-                ] = self._inferred_temp_schemas.get(query_id)
+                temp_table_schemas[temp_table_urn] = (
+                    self._inferred_temp_schemas.get(query_id)
+                )
                 if temp_table_schemas:
                     break

@@ -1072,9 +1073,9 @@ class SqlParsingAggregator(Closeable):
             schema_resolver=self._schema_resolver,
         )
         if parsed.debug_info.error:
-            self.report.views_parse_failures[
-                view_urn
-            ] = f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            self.report.views_parse_failures[view_urn] = (
+                f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            )
         if parsed.debug_info.table_error:
             self.report.num_views_failed += 1
             return  # we can't do anything with this query

@@ -1200,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
         queries_generated: Set[QueryId] = set()
 
         yield from self._gen_lineage_mcps(queries_generated)
+        self.report.num_queries_used_in_lineage = len(queries_generated)
         yield from self._gen_usage_statistics_mcps()
         yield from self._gen_operation_mcps(queries_generated)
         yield from self._gen_remaining_queries(queries_generated)

@@ -1581,9 +1583,9 @@ class SqlParsingAggregator(Closeable):
                     temp_query_lineage_info
                 )
             else:
-                temp_upstream_queries[
-                    upstream
-                ] = temp_query_lineage_info
+                temp_upstream_queries[upstream] = (
+                    temp_query_lineage_info
+                )
 
         # Compute merged upstreams.
         new_upstreams = OrderedSet[UrnStr]()

@@ -1663,9 +1665,9 @@ class SqlParsingAggregator(Closeable):
         composed_of_queries_truncated: LossyList[str] = LossyList()
         for query_id in composed_of_queries:
             composed_of_queries_truncated.append(query_id)
-        self.report.queries_with_temp_upstreams[
-            composite_query_id
-        ] = composed_of_queries_truncated
+        self.report.queries_with_temp_upstreams[composite_query_id] = (
+            composed_of_queries_truncated
+        )
 
         merged_query_text = ";\n\n".join(
             [q.formatted_query_string for q in ordered_queries]

datahub/sql_parsing/sqlglot_lineage.py
CHANGED

@@ -442,9 +442,9 @@ def _create_table_ddl_cll(
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []
 
-    assert (
-        output_table
-    )
+    assert output_table is not None, (
+        "output_table must be set for create DDL statements"
+    )
 
     create_schema: sqlglot.exp.Schema = statement.this
     sqlglot_columns = create_schema.expressions

datahub/sql_parsing/sqlglot_utils.py
CHANGED

@@ -404,7 +404,7 @@ def detach_ctes(
         if new_statement == statement:
             if iteration > 1:
                 logger.debug(
-                    f"Required {iteration+1} iterations to detach and eliminate all CTEs"
+                    f"Required {iteration + 1} iterations to detach and eliminate all CTEs"
                 )
             break
         statement = new_statement
datahub/telemetry/stats.py
CHANGED
datahub/testing/mcp_diff.py
CHANGED

@@ -246,7 +246,7 @@ class MCPDiff:
         for urn in self.aspect_changes.keys() - self.urns_added - self.urns_removed:
             aspect_map = self.aspect_changes[urn]
             s.append(f"Urn changed, {urn}:")
-            for
+            for aspect_diffs in aspect_map.values():
                 for i, ga in aspect_diffs.aspects_added.items():
                     s.append(self.report_aspect(ga, i, "added"))
                 if verbose:

datahub/utilities/file_backed_collections.py
CHANGED

@@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
     _use_sqlite_on_conflict: bool = field(repr=False, default=True)
 
     def __post_init__(self) -> None:
-        assert (
-            self.cache_eviction_batch_size > 0
-        )
+        assert self.cache_eviction_batch_size > 0, (
+            "cache_eviction_batch_size must be positive"
+        )
 
         for reserved_column in ("key", "value", "rowid"):
             if reserved_column in self.extra_columns:

@@ -243,7 +243,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             # This was added in 3.24.0 from 2018-06-04.
             # See https://www.sqlite.org/lang_conflict.html
             if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
-                self.
+                self._use_sqlite_on_conflict = False
             else:
                 raise RuntimeError("SQLite version 3.24.0 or later is required")
 

@@ -261,7 +261,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                     rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                     key TEXT UNIQUE,
                     value BLOB
-                    {
+                    {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
                 )"""
             )
 

@@ -316,12 +316,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {
+                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                 )
-                VALUES ({
+                VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
                 ON CONFLICT (key) DO UPDATE SET
                     value = excluded.value
-                    {
+                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
                 """,
                 items_to_write,
             )

@@ -332,16 +332,16 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                     f"""INSERT INTO {self.tablename} (
                         key,
                         value
-                        {
+                        {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                     )
-                    VALUES ({
+                    VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                     item,
                 )
             except sqlite3.IntegrityError:
                 self._conn.execute(
                     f"""UPDATE {self.tablename} SET
                         value = ?
-                        {
+                        {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
                     WHERE key = ?""",
                     (*item[1:], item[0]),
                 )

datahub/utilities/hive_schema_to_avro.py
CHANGED

@@ -142,10 +142,10 @@ class HiveColumnToAvroConverter:
         fields.append({"name": field_name, "type": field_type})
 
         if kwargs.get("ustruct_seqn") is not None:
-            struct_name = f
+            struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}"
 
         else:
-            struct_name = f
+            struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}"
         return {
             "type": "record",
             "name": struct_name,

datahub/utilities/logging_manager.py
CHANGED

@@ -130,9 +130,9 @@ class _ColorLogFormatter(logging.Formatter):
         # Mimic our default format, but with color.
         message_fg = self.MESSAGE_COLORS.get(record.levelname)
         return (
-            f
+            f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} "
             f"{click.style(f'{record.levelname:8}', fg=message_fg)} "
-            f
+            f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - "
             f"{click.style(record.getMessage(), fg=message_fg)}"
         )
 

datahub/utilities/lossy_collections.py
CHANGED

@@ -151,9 +151,9 @@ class LossyDict(Dict[_KT, _VT], Generic[_KT, _VT]):
     def as_obj(self) -> Dict[Union[_KT, str], Union[_VT, str]]:
         base_dict: Dict[Union[_KT, str], Union[_VT, str]] = super().copy()  # type: ignore
         if self.sampled:
-            base_dict[
-                "sampled"
-            ] = f"{len(self.keys())} sampled of at most {self.total_key_count()} entries."
+            base_dict["sampled"] = (
+                f"{len(self.keys())} sampled of at most {self.total_key_count()} entries."
+            )
         return base_dict
 
     def total_key_count(self) -> int:
datahub/utilities/mapping.py
CHANGED

@@ -349,9 +349,9 @@ class OperationProcessor:
                     elements=[institutional_memory_element]
                 )
 
-                aspect_map[
-                    Constants.ADD_DOC_LINK_OPERATION
-                ] = institutional_memory_aspect
+                aspect_map[Constants.ADD_DOC_LINK_OPERATION] = (
+                    institutional_memory_aspect
+                )
             else:
                 raise Exception(
                     f"Expected 1 item of type list for the documentation_link meta_mapping config,"

datahub/utilities/memory_footprint.py
CHANGED

@@ -1,7 +1,7 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any,
+from typing import Any, Iterator
 
 
 def total_size(o: Any, handlers: Any = {}) -> int:

@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """
 
-    dict_handler:
+    def dict_handler(d: dict) -> Iterator[Any]:
+        return chain.from_iterable(d.items())
 
     all_handlers = {
         tuple: iter,
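
As a point of reference, a minimal usage sketch of the total_size helper touched above (the import path mirrors the file location; the payload and printed value are illustrative assumptions, not taken from the release):

    from datahub.utilities.memory_footprint import total_size

    # Approximate deep memory footprint, in bytes, of a nested container.
    # Dicts are traversed via dict_handler, so both keys and values are counted.
    payload = {"rows": list(range(1_000)), "name": "example"}
    print(total_size(payload))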
datahub/utilities/perf_timer.py
CHANGED

@@ -57,7 +57,7 @@ class PerfTimer(AbstractContextManager):
         self.finish()
         return None
 
-    def elapsed_seconds(self) -> float:
+    def elapsed_seconds(self, digits: int = 4) -> float:
         """
         Returns the elapsed time in seconds.
         """

@@ -65,11 +65,18 @@ class PerfTimer(AbstractContextManager):
             return self._past_active_time
 
         if self.end_time is None:
-
+            elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
         else:
-
+            elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+        return round(elapsed, digits)
 
     def assert_timer_is_running(self) -> None:
+        if not self.is_running():
+            self._error_state = True
+            logger.warning("Did you forget to start the timer ?")
+
+    def is_running(self) -> bool:
         """
         Returns true if timer is in running state.
         Timer is in NOT in running state if

@@ -77,9 +84,7 @@ class PerfTimer(AbstractContextManager):
         2. it is in paused state.
         3. it had been started and finished in the past but not started again.
         """
-
-        self._error_state = True
-        logger.warning("Did you forget to start the timer ?")
+        return self.start_time is not None and not self.paused and self.end_time is None
 
     def __repr__(self) -> str:
         return repr(self.as_obj())
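
For orientation, a minimal sketch of the reworked PerfTimer API above (method names and the digits default come from the hunk; the import path mirrors the file location, and it is assumed that entering the context manager starts the timer, as in earlier releases):

    import time

    from datahub.utilities.perf_timer import PerfTimer

    timer = PerfTimer()
    with timer:
        time.sleep(0.1)

    assert not timer.is_running()            # a finished timer is no longer "running"
    print(timer.elapsed_seconds())           # rounded to 4 digits by default
    print(timer.elapsed_seconds(digits=2))   # rounding precision is now configurable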

datahub/utilities/serialized_lru_cache.py
CHANGED

@@ -41,7 +41,9 @@ def serialized_lru_cache(
     def wrapper(*args: _F.args, **kwargs: _F.kwargs) -> _T:
         # We need a type ignore here because there's no way for us to require that
         # the args and kwargs are hashable while using ParamSpec.
-        key: _Key = cachetools.keys.hashkey(
+        key: _Key = cachetools.keys.hashkey(
+            *args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k}
+        )  # type: ignore
 
         with cache_lock:
             if key in cache:

datahub/utilities/sqlalchemy_query_combiner.py
CHANGED

@@ -160,12 +160,12 @@ class SQLAlchemyQueryCombiner:
     _greenlets_by_thread_lock: threading.Lock = dataclasses.field(
         default_factory=lambda: threading.Lock()
     )
-    _queries_by_thread: Dict[
-        greenlet.greenlet, Dict[str, _QueryFuture]
-    ] = dataclasses.field(default_factory=lambda: collections.defaultdict(dict))
-    _greenlets_by_thread: Dict[
-        greenlet.greenlet, Set[greenlet.greenlet]
-    ] = dataclasses.field(default_factory=lambda: collections.defaultdict(set))
+    _queries_by_thread: Dict[greenlet.greenlet, Dict[str, _QueryFuture]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(dict))
+    )
+    _greenlets_by_thread: Dict[greenlet.greenlet, Set[greenlet.greenlet]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(set))
+    )
 
     @staticmethod
     def _generate_sql_safe_identifier() -> str:

datahub/utilities/sqllineage_patch.py
CHANGED

@@ -8,7 +8,7 @@ from sqllineage.utils.constant import EdgeType
 
 # Patch based on sqllineage v1.3.3
 def end_of_query_cleanup_patch(self, holder: SubQueryLineageHolder) -> None:  # type: ignore
-    for
+    for tbl in self.tables:
         holder.add_read(tbl)
     self.union_barriers.append((len(self.columns), len(self.tables)))
 

datahub/utilities/stats_collections.py
CHANGED

@@ -48,7 +48,9 @@ class TopKDict(DefaultDict[_KT, _VT]):
             total_value: Union[_VT, str] = sum(trimmed_dict.values())  # type: ignore
         except Exception:
             total_value = ""
-        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] =
+        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] = (  # type: ignore
+            total_value  # type: ignore
+        )
         return trimmed_dict
 
 

datahub/utilities/urns/_urn_base.py
CHANGED

@@ -1,7 +1,7 @@
 import functools
 import urllib.parse
 from abc import abstractmethod
-from typing import ClassVar, Dict, List, Optional, Type
+from typing import ClassVar, Dict, List, Optional, Type, Union
 
 from deprecated import deprecated
 from typing_extensions import Self

@@ -86,12 +86,24 @@ class Urn:
         return self._entity_ids
 
     @classmethod
-    def from_string(cls, urn_str: str) -> Self:
-        """
-
+    def from_string(cls, urn_str: Union[str, "Urn"], /) -> Self:
+        """Create an Urn from its string representation.
+
+        When called against the base Urn class, this method will return a more specific Urn type where possible.
+
+        >>> from datahub.metadata.urns import DatasetUrn, Urn
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> urn = Urn.from_string(urn_str)
+        >>> assert isinstance(urn, DatasetUrn)
+
+        When called against a specific Urn type (e.g. DatasetUrn.from_string), this method can
+        also be used for type narrowing.
+
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> assert DatasetUrn.from_string(urn_str)
 
         Args:
-            urn_str: The string representation of the Urn.
+            urn_str: The string representation of the urn. Also accepts an existing Urn instance.
 
         Returns:
             Urn of the given string representation.

@@ -100,6 +112,17 @@ class Urn:
             InvalidUrnError: If the string representation is in invalid format.
         """
 
+        if isinstance(urn_str, Urn):
+            if issubclass(cls, _SpecificUrn) and isinstance(urn_str, cls):
+                # Fast path - we're already the right type.
+
+                # I'm not really sure why we need a type ignore here, but mypy doesn't really
+                # understand the isinstance check above.
+                return urn_str  # type: ignore
+
+            # Fall through, so that we can convert a generic Urn to a specific Urn type.
+            urn_str = urn_str.urn()
+
         # TODO: Add handling for url encoded urns e.g. urn%3A ...
 
         if not urn_str.startswith("urn:li:"):
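
A short sketch of what the widened from_string signature above allows (identifiers come from the docstring in the hunk; the round-trip assertion is an assumption based on the added isinstance branch):

    from datahub.metadata.urns import DatasetUrn, Urn

    urn_str = "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)"

    generic = Urn.from_string(urn_str)         # base-class call returns the most specific type, here DatasetUrn
    dataset = DatasetUrn.from_string(generic)  # an existing Urn instance is now accepted as input
    assert isinstance(generic, DatasetUrn)
    assert dataset.urn() == urn_str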

datahub/utilities/urns/urn_iter.py
CHANGED

@@ -21,7 +21,7 @@ def _add_prefix_to_paths(
 
 
 def list_urns_with_path(
-    model: Union[DictWrapper, MetadataChangeProposalWrapper]
+    model: Union[DictWrapper, MetadataChangeProposalWrapper],
 ) -> List[Tuple[str, _Path]]:
     """List urns in the given model with their paths.
 

@@ -145,7 +145,7 @@ def lowercase_dataset_urns(
         MetadataChangeEventClass,
         MetadataChangeProposalClass,
        MetadataChangeProposalWrapper,
-    ]
+    ],
 ) -> None:
     def modify_urn(urn: str) -> str:
         if guess_entity_type(urn) == "dataset":