acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/sql_parsing_aggregator.py

@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
     timestamp: Optional[datetime] = None
     session_id: Optional[str] = None
     query_type: QueryType = QueryType.UNKNOWN
+    query_id: Optional[str] = None


 @dataclasses.dataclass

@@ -283,6 +284,7 @@ class SqlAggregatorReport(Report):

     # Queries.
     num_queries_entities_generated: int = 0
+    num_queries_used_in_lineage: Optional[int] = None
     num_queries_skipped_due_to_filters: int = 0

     # Usage-related.

@@ -618,11 +620,13 @@ class SqlParsingAggregator(Closeable):
         self.report.num_known_query_lineage += 1

         # Generate a fingerprint for the query.
-        with self.report.sql_fingerprinting_timer:
-            query_fingerprint = get_query_fingerprint(
-                known_query_lineage.query_text,
-                platform=self.platform.platform_name,
-            )
+        query_fingerprint = known_query_lineage.query_id
+        if not query_fingerprint:
+            with self.report.sql_fingerprinting_timer:
+                query_fingerprint = get_query_fingerprint(
+                    known_query_lineage.query_text,
+                    platform=self.platform.platform_name,
+                )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)

         # Register the query.
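
The new optional query_id field lets a source that already has a stable identifier for a query (for example a warehouse-assigned query id) supply it directly, so the aggregator only computes a SQL fingerprint when no id was provided. A minimal sketch of the calling pattern; the other KnownQueryLineageInfo fields and the add_known_query_lineage entry point come from surrounding code that this diff does not show, so treat the exact names as assumptions:

    lineage = KnownQueryLineageInfo(
        query_text="INSERT INTO analytics.sales SELECT * FROM raw.sales",
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.sales,PROD)",
        upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,raw.sales,PROD)"],
        query_id="01b2c3d4-aaaa-bbbb-cccc-ddddeeee0001",  # precomputed id, e.g. from the query log
    )
    aggregator.add_known_query_lineage(lineage)  # fingerprinting is skipped because query_id is set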
@@ -678,10 +682,10 @@ class SqlParsingAggregator(Closeable):
         query_id = self._known_lineage_query_id()

         # Generate CLL if schema of downstream is known
-        column_lineage: List[
-            ColumnLineageInfo
-        ] = self._generate_identity_column_lineage(
-            upstream_urn=upstream_urn, downstream_urn=downstream_urn
+        column_lineage: List[ColumnLineageInfo] = (
+            self._generate_identity_column_lineage(
+                upstream_urn=upstream_urn, downstream_urn=downstream_urn
+            )
         )

         # Register the query.

@@ -1040,9 +1044,9 @@ class SqlParsingAggregator(Closeable):
         temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {}
         for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items():
             for query_id in query_ids:
-                temp_table_schemas[
-                    temp_table_urn
-                ] = self._inferred_temp_schemas.get(query_id)
+                temp_table_schemas[temp_table_urn] = (
+                    self._inferred_temp_schemas.get(query_id)
+                )
             if temp_table_schemas:
                 break

@@ -1069,9 +1073,9 @@ class SqlParsingAggregator(Closeable):
             schema_resolver=self._schema_resolver,
         )
         if parsed.debug_info.error:
-            self.report.views_parse_failures[
-                view_urn
-            ] = f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            self.report.views_parse_failures[view_urn] = (
+                f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            )
         if parsed.debug_info.table_error:
             self.report.num_views_failed += 1
             return  # we can't do anything with this query

@@ -1197,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
         queries_generated: Set[QueryId] = set()

         yield from self._gen_lineage_mcps(queries_generated)
+        self.report.num_queries_used_in_lineage = len(queries_generated)
         yield from self._gen_usage_statistics_mcps()
         yield from self._gen_operation_mcps(queries_generated)
         yield from self._gen_remaining_queries(queries_generated)

@@ -1578,9 +1583,9 @@ class SqlParsingAggregator(Closeable):
                         temp_query_lineage_info
                     )
                 else:
-                    temp_upstream_queries[
-                        upstream
-                    ] = temp_query_lineage_info
+                    temp_upstream_queries[upstream] = (
+                        temp_query_lineage_info
+                    )

         # Compute merged upstreams.
         new_upstreams = OrderedSet[UrnStr]()

@@ -1660,9 +1665,9 @@ class SqlParsingAggregator(Closeable):
         composed_of_queries_truncated: LossyList[str] = LossyList()
         for query_id in composed_of_queries:
             composed_of_queries_truncated.append(query_id)
-        self.report.queries_with_temp_upstreams[
-            composite_query_id
-        ] = composed_of_queries_truncated
+        self.report.queries_with_temp_upstreams[composite_query_id] = (
+            composed_of_queries_truncated
+        )

         merged_query_text = ";\n\n".join(
             [q.formatted_query_string for q in ordered_queries]
datahub/sql_parsing/sqlglot_lineage.py

@@ -442,9 +442,9 @@ def _create_table_ddl_cll(
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []

-    assert (
-        output_table is not None
-    ), "output_table must be set for create DDL statements"
+    assert output_table is not None, (
+        "output_table must be set for create DDL statements"
+    )

     create_schema: sqlglot.exp.Schema = statement.this
     sqlglot_columns = create_schema.expressions
datahub/sql_parsing/sqlglot_utils.py

@@ -404,7 +404,7 @@ def detach_ctes(
         if new_statement == statement:
             if iteration > 1:
                 logger.debug(
-                    f"Required {iteration+1} iterations to detach and eliminate all CTEs"
+                    f"Required {iteration + 1} iterations to detach and eliminate all CTEs"
                 )
             break
         statement = new_statement
datahub/telemetry/stats.py

@@ -5,8 +5,7 @@ from typing_extensions import Protocol


 class SupportsLT(Protocol):
-    def __lt__(self, __other: Any) -> Any:
-        ...
+    def __lt__(self, __other: Any) -> Any: ...


 _SupportsComparisonT = TypeVar("_SupportsComparisonT", bound=SupportsLT)
datahub/testing/mcp_diff.py

@@ -246,7 +246,7 @@ class MCPDiff:
         for urn in self.aspect_changes.keys() - self.urns_added - self.urns_removed:
             aspect_map = self.aspect_changes[urn]
             s.append(f"Urn changed, {urn}:")
-            for aspect_name, aspect_diffs in aspect_map.items():
+            for aspect_diffs in aspect_map.values():
                 for i, ga in aspect_diffs.aspects_added.items():
                     s.append(self.report_aspect(ga, i, "added"))
                     if verbose:
datahub/utilities/file_backed_collections.py

@@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
     _use_sqlite_on_conflict: bool = field(repr=False, default=True)

     def __post_init__(self) -> None:
-        assert (
-            self.cache_eviction_batch_size > 0
-        ), "cache_eviction_batch_size must be positive"
+        assert self.cache_eviction_batch_size > 0, (
+            "cache_eviction_batch_size must be positive"
+        )

         for reserved_column in ("key", "value", "rowid"):
             if reserved_column in self.extra_columns:

@@ -243,7 +243,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             # This was added in 3.24.0 from 2018-06-04.
             # See https://www.sqlite.org/lang_conflict.html
             if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
-                self.use_sqlite_on_conflict = False
+                self._use_sqlite_on_conflict = False
             else:
                 raise RuntimeError("SQLite version 3.24.0 or later is required")

@@ -261,7 +261,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {''.join(f', {column_name} BLOB' for column_name in self.extra_columns.keys())}
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
             )"""
         )

@@ -316,12 +316,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                 )
-                VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})
+                VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
                 ON CONFLICT (key) DO UPDATE SET
                     value = excluded.value
-                    {''.join(f', {column_name} = excluded.{column_name}' for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
                 """,
                 items_to_write,
             )
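
For readers skimming the reflowed SQL templating, a small illustration of what those joined fragments evaluate to; the extra column name and table name are examples, not values taken from this diff:

    # Rough illustration of how the f-string pieces above expand (names are examples only).
    extra_columns = {"last_modified": None}
    placeholders = ", ".join(["?"] * (2 + len(extra_columns)))            # "?, ?, ?"
    update_clause = "".join(f", {c} = excluded.{c}" for c in extra_columns)
    # Resulting upsert, assuming a table named "data":
    #   INSERT INTO data (key, value, last_modified) VALUES (?, ?, ?)
    #   ON CONFLICT (key) DO UPDATE SET value = excluded.value, last_modified = excluded.last_modified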
@@ -332,16 +332,16 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                     f"""INSERT INTO {self.tablename} (
                         key,
                         value
-                        {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+                        {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                     )
-                    VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""",
+                    VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                     item,
                 )
             except sqlite3.IntegrityError:
                 self._conn.execute(
                     f"""UPDATE {self.tablename} SET
                         value = ?
-                        {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())}
+                        {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
                         WHERE key = ?""",
                     (*item[1:], item[0]),
                 )
datahub/utilities/hive_schema_to_avro.py

@@ -142,10 +142,10 @@ class HiveColumnToAvroConverter:
             fields.append({"name": field_name, "type": field_type})

         if kwargs.get("ustruct_seqn") is not None:
-            struct_name = f'__structn_{kwargs["ustruct_seqn"]}_{str(uuid.uuid4()).replace("-", "")}'
+            struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}"

         else:
-            struct_name = f'__struct_{str(uuid.uuid4()).replace("-", "")}'
+            struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}"
         return {
             "type": "record",
             "name": struct_name,
datahub/utilities/logging_manager.py

@@ -130,9 +130,9 @@ class _ColorLogFormatter(logging.Formatter):
         # Mimic our default format, but with color.
         message_fg = self.MESSAGE_COLORS.get(record.levelname)
         return (
-            f'{click.style(f"[{self.formatTime(record, self.datefmt)}]", fg="green", dim=True)} '
+            f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} "
             f"{click.style(f'{record.levelname:8}', fg=message_fg)} "
-            f'{click.style(f"{{{record.name}:{record.lineno}}}", fg="blue", dim=True)} - '
+            f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - "
             f"{click.style(record.getMessage(), fg=message_fg)}"
         )

datahub/utilities/lossy_collections.py

@@ -151,9 +151,9 @@ class LossyDict(Dict[_KT, _VT], Generic[_KT, _VT]):
     def as_obj(self) -> Dict[Union[_KT, str], Union[_VT, str]]:
         base_dict: Dict[Union[_KT, str], Union[_VT, str]] = super().copy()  # type: ignore
         if self.sampled:
-            base_dict[
-                "sampled"
-            ] = f"{len(self.keys())} sampled of at most {self.total_key_count()} entries."
+            base_dict["sampled"] = (
+                f"{len(self.keys())} sampled of at most {self.total_key_count()} entries."
+            )
         return base_dict

     def total_key_count(self) -> int:
datahub/utilities/mapping.py

@@ -349,9 +349,9 @@ class OperationProcessor:
                         elements=[institutional_memory_element]
                     )

-                    aspect_map[
-                        Constants.ADD_DOC_LINK_OPERATION
-                    ] = institutional_memory_aspect
+                    aspect_map[Constants.ADD_DOC_LINK_OPERATION] = (
+                        institutional_memory_aspect
+                    )
                 else:
                     raise Exception(
                         f"Expected 1 item of type list for the documentation_link meta_mapping config,"
datahub/utilities/memory_footprint.py

@@ -1,7 +1,7 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any, Callable
+from typing import Any, Iterator


 def total_size(o: Any, handlers: Any = {}) -> int:

@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """

-    dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
+    def dict_handler(d: dict) -> Iterator[Any]:
+        return chain.from_iterable(d.items())

     all_handlers = {
         tuple: iter,
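
total_size recurses through containers using these per-type handlers; a brief usage sketch (the sample objects and the custom handler are illustrative, following the upstream recipe rather than anything specific to this release):

    from datahub.utilities.memory_footprint import total_size

    nested = {"ids": list(range(1000)), "meta": {"source": "snowflake"}}
    print(total_size(nested))  # approximate deep size in bytes

    class Bag:
        def __init__(self, items):
            self.items = list(items)

    # Extra handlers tell total_size how to reach the children of a custom container.
    print(total_size(Bag(range(100)), handlers={Bag: lambda bag: iter(bag.items)}))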
datahub/utilities/perf_timer.py

@@ -57,7 +57,7 @@ class PerfTimer(AbstractContextManager):
         self.finish()
         return None

-    def elapsed_seconds(self) -> float:
+    def elapsed_seconds(self, digits: int = 4) -> float:
         """
         Returns the elapsed time in seconds.
         """

@@ -65,11 +65,18 @@ class PerfTimer(AbstractContextManager):
             return self._past_active_time

         if self.end_time is None:
-            return (time.perf_counter() - self.start_time) + (self._past_active_time)
+            elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
         else:
-            return (self.end_time - self.start_time) + self._past_active_time
+            elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+        return round(elapsed, digits)

     def assert_timer_is_running(self) -> None:
+        if not self.is_running():
+            self._error_state = True
+            logger.warning("Did you forget to start the timer ?")
+
+    def is_running(self) -> bool:
         """
         Returns true if timer is in running state.
         Timer is in NOT in running state if

@@ -77,9 +84,7 @@ class PerfTimer(AbstractContextManager):
         2. it is in paused state.
         3. it had been started and finished in the past but not started again.
         """
-        if self.start_time is None or self.paused or self.end_time:
-            self._error_state = True
-            logger.warning("Did you forget to start the timer ?")
+        return self.start_time is not None and not self.paused and self.end_time is None

     def __repr__(self) -> str:
         return repr(self.as_obj())
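
Net effect of the PerfTimer changes: elapsed_seconds now rounds its result (4 digits by default) and the running-state check is exposed as is_running(), with assert_timer_is_running only logging when the timer is not running. A rough usage sketch, assuming the timer's context-manager entry starts it as it does elsewhere in this module:

    import time
    from datahub.utilities.perf_timer import PerfTimer

    timer = PerfTimer()
    with timer:
        time.sleep(0.25)

    assert not timer.is_running()            # finished, so no longer running
    print(timer.elapsed_seconds())           # rounded to 4 decimal places by default
    print(timer.elapsed_seconds(digits=2))   # caller-chosen precision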
datahub/utilities/serialized_lru_cache.py

@@ -41,7 +41,9 @@ def serialized_lru_cache(
     def wrapper(*args: _F.args, **kwargs: _F.kwargs) -> _T:
         # We need a type ignore here because there's no way for us to require that
         # the args and kwargs are hashable while using ParamSpec.
-        key: _Key = cachetools.keys.hashkey(*args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k})  # type: ignore
+        key: _Key = cachetools.keys.hashkey(
+            *args, **{k: v for k, v in kwargs.items() if "cache_exclude" not in k}
+        )  # type: ignore

         with cache_lock:
             if key in cache:
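
The reflowed key construction also highlights the existing behavior that any keyword argument whose name contains "cache_exclude" is dropped from the cache key. A hedged sketch; the decorator's maxsize parameter is assumed from its surrounding definition, which this hunk does not show:

    from datahub.utilities.serialized_lru_cache import serialized_lru_cache

    @serialized_lru_cache(maxsize=128)  # maxsize is an assumption, not visible in this hunk
    def fetch_schema(table: str, *, cache_exclude_trace_id: str = "") -> dict:
        return {"table": table}

    fetch_schema("db.sales", cache_exclude_trace_id="req-1")
    fetch_schema("db.sales", cache_exclude_trace_id="req-2")  # same key, so this call is a cache hit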
datahub/utilities/sqlalchemy_query_combiner.py

@@ -160,12 +160,12 @@ class SQLAlchemyQueryCombiner:
     _greenlets_by_thread_lock: threading.Lock = dataclasses.field(
         default_factory=lambda: threading.Lock()
     )
-    _queries_by_thread: Dict[
-        greenlet.greenlet, Dict[str, _QueryFuture]
-    ] = dataclasses.field(default_factory=lambda: collections.defaultdict(dict))
-    _greenlets_by_thread: Dict[
-        greenlet.greenlet, Set[greenlet.greenlet]
-    ] = dataclasses.field(default_factory=lambda: collections.defaultdict(set))
+    _queries_by_thread: Dict[greenlet.greenlet, Dict[str, _QueryFuture]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(dict))
+    )
+    _greenlets_by_thread: Dict[greenlet.greenlet, Set[greenlet.greenlet]] = (
+        dataclasses.field(default_factory=lambda: collections.defaultdict(set))
+    )

     @staticmethod
     def _generate_sql_safe_identifier() -> str:
datahub/utilities/sqllineage_patch.py

@@ -8,7 +8,7 @@ from sqllineage.utils.constant import EdgeType


 # Patch based on sqllineage v1.3.3
 def end_of_query_cleanup_patch(self, holder: SubQueryLineageHolder) -> None:  # type: ignore
-    for i, tbl in enumerate(self.tables):
+    for tbl in self.tables:
         holder.add_read(tbl)
     self.union_barriers.append((len(self.columns), len(self.tables)))
datahub/utilities/stats_collections.py

@@ -48,7 +48,9 @@ class TopKDict(DefaultDict[_KT, _VT]):
             total_value: Union[_VT, str] = sum(trimmed_dict.values())  # type: ignore
         except Exception:
             total_value = ""
-        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] = total_value  # type: ignore
+        trimmed_dict[f"... top {self.top_k} of total {len(self)} entries"] = (  # type: ignore
+            total_value  # type: ignore
+        )
         return trimmed_dict


datahub/utilities/urns/_urn_base.py

@@ -1,7 +1,7 @@
 import functools
 import urllib.parse
 from abc import abstractmethod
-from typing import ClassVar, Dict, List, Optional, Type
+from typing import ClassVar, Dict, List, Optional, Type, Union

 from deprecated import deprecated
 from typing_extensions import Self

@@ -86,12 +86,24 @@ class Urn:
         return self._entity_ids

     @classmethod
-    def from_string(cls, urn_str: str) -> Self:
-        """
-        Creates an Urn from its string representation.
+    def from_string(cls, urn_str: Union[str, "Urn"], /) -> Self:
+        """Create an Urn from its string representation.
+
+        When called against the base Urn class, this method will return a more specific Urn type where possible.
+
+        >>> from datahub.metadata.urns import DatasetUrn, Urn
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> urn = Urn.from_string(urn_str)
+        >>> assert isinstance(urn, DatasetUrn)
+
+        When called against a specific Urn type (e.g. DatasetUrn.from_string), this method can
+        also be used for type narrowing.
+
+        >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+        >>> assert DatasetUrn.from_string(urn_str)

         Args:
-            urn_str: The string representation of the Urn.
+            urn_str: The string representation of the urn. Also accepts an existing Urn instance.

         Returns:
             Urn of the given string representation.

@@ -100,6 +112,17 @@ class Urn:
             InvalidUrnError: If the string representation is in invalid format.
         """

+        if isinstance(urn_str, Urn):
+            if issubclass(cls, _SpecificUrn) and isinstance(urn_str, cls):
+                # Fast path - we're already the right type.
+
+                # I'm not really sure why we need a type ignore here, but mypy doesn't really
+                # understand the isinstance check above.
+                return urn_str  # type: ignore
+
+            # Fall through, so that we can convert a generic Urn to a specific Urn type.
+            urn_str = urn_str.urn()
+
         # TODO: Add handling for url encoded urns e.g. urn%3A ...

         if not urn_str.startswith("urn:li:"):
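
Beyond the doctest added above, the practical effect is that from_string is now a no-op for an already-correct Urn instance and can narrow a generic Urn to a concrete subclass; a short sketch:

    from datahub.metadata.urns import DatasetUrn, Urn

    generic = Urn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)"
    )
    narrowed = DatasetUrn.from_string(generic)  # accepts an Urn instance, not just a string
    assert isinstance(narrowed, DatasetUrn)
    assert narrowed.urn() == generic.urn()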
datahub/utilities/urns/urn_iter.py

@@ -21,7 +21,7 @@ def _add_prefix_to_paths(


 def list_urns_with_path(
-    model: Union[DictWrapper, MetadataChangeProposalWrapper]
+    model: Union[DictWrapper, MetadataChangeProposalWrapper],
 ) -> List[Tuple[str, _Path]]:
     """List urns in the given model with their paths.


@@ -145,7 +145,7 @@ def lowercase_dataset_urns(
         MetadataChangeEventClass,
         MetadataChangeProposalClass,
         MetadataChangeProposalWrapper,
-    ]
+    ],
 ) -> None:
     def modify_urn(urn: str) -> str:
         if guess_entity_type(urn) == "dataset":