acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py CHANGED
@@ -4,6 +4,7 @@ import functools
  import json
  import logging
  import os
+ import re
  import time
  from collections import defaultdict
  from dataclasses import dataclass
@@ -60,6 +61,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
      MetadataChangeProposal,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+ from datahub.metadata.schema_classes import (
+     KEY_ASPECT_NAMES,
+     ChangeTypeClass,
+ )
  from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

  if TYPE_CHECKING:
@@ -104,6 +109,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
  )


+ def preserve_unicode_escapes(obj: Any) -> Any:
+     """Recursively convert unicode characters back to escape sequences"""
+     if isinstance(obj, dict):
+         return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+     elif isinstance(obj, list):
+         return [preserve_unicode_escapes(item) for item in obj]
+     elif isinstance(obj, str):
+         # Convert non-ASCII characters back to \u escapes
+         def escape_unicode(match: Any) -> Any:
+             return f"\\u{ord(match.group(0)):04x}"
+
+         return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+     else:
+         return obj
+
+
  class EmitMode(ConfigEnum):
      # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
      # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -314,6 +335,7 @@ class DataHubRestEmitter(Closeable, Emitter):
          openapi_ingestion: Optional[bool] = None,
          client_mode: Optional[ClientMode] = None,
          datahub_component: Optional[str] = None,
+         server_config_refresh_interval: Optional[int] = None,
      ):
          if not gms_server:
              raise ConfigurationError("gms server is required")
@@ -329,6 +351,8 @@ class DataHubRestEmitter(Closeable, Emitter):
          self._openapi_ingestion = (
              openapi_ingestion  # Re-evaluated after test connection
          )
+         self._server_config_refresh_interval = server_config_refresh_interval
+         self._config_fetch_time: Optional[float] = None

          headers = {
              "X-RestLi-Protocol-Version": "2.0.0",
@@ -398,7 +422,17 @@ class DataHubRestEmitter(Closeable, Emitter):
          Raises:
              ConfigurationError: If there's an error fetching or validating the configuration
          """
-         if not hasattr(self, "_server_config") or not self._server_config:
+
+         if (
+             not hasattr(self, "_server_config")
+             or self._server_config is None
+             or (
+                 self._server_config_refresh_interval is not None
+                 and self._config_fetch_time is not None
+                 and (time.time() - self._config_fetch_time)
+                 > self._server_config_refresh_interval
+             )
+         ):
              if self._session is None or self._gms_server is None:
                  raise ConfigurationError(
                      "Session and URL are required to load configuration"
@@ -419,6 +453,7 @@ class DataHubRestEmitter(Closeable, Emitter):
              )

              self._server_config = RestServiceConfig(raw_config=raw_config)
+             self._config_fetch_time = time.time()
              self._post_fetch_server_config()

          else:
@@ -453,6 +488,8 @@ class DataHubRestEmitter(Closeable, Emitter):
              DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
          )

+     def test_connection(self) -> None:
+         self.fetch_server_config()
          logger.debug(
              f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
          )
@@ -460,12 +497,21 @@ class DataHubRestEmitter(Closeable, Emitter):
              f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
          )

-     def test_connection(self) -> None:
-         self.fetch_server_config()
-
      def get_server_config(self) -> dict:
          return self.server_config.raw_config

+     def invalidate_config_cache(self) -> None:
+         """Manually invalidate the configuration cache."""
+         if (
+             hasattr(self, "_server_config")
+             and self._server_config is not None
+             and self._server_config_refresh_interval is not None
+         ):
+             # Set fetch time to beyond TTL in the past to force refresh on next access
+             self._config_fetch_time = (
+                 time.time() - self._server_config_refresh_interval - 1
+             )
+
      def to_graph(self) -> "DataHubGraph":
          from datahub.ingestion.graph.client import DataHubGraph

@@ -584,15 +630,27 @@ class DataHubRestEmitter(Closeable, Emitter):
              trace_data = extract_trace_data(response) if response else None

          else:
-             url = f"{self._gms_server}/aspects?action=ingestProposal"
+             if mcp.changeType == ChangeTypeClass.DELETE:
+                 if mcp.aspectName not in KEY_ASPECT_NAMES:
+                     raise OperationalError(
+                         f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
+                         f"{mcp.entityUrn}"
+                     )

-             mcp_obj = pre_json_transform(mcp.to_obj())
-             payload_dict = {
-                 "proposal": mcp_obj,
-                 "async": "true"
-                 if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
-                 else "false",
-             }
+                 url = f"{self._gms_server}/entities?action=delete"
+                 payload_dict = {
+                     "urn": mcp.entityUrn,
+                 }
+             else:
+                 url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+                 mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
+                 payload_dict = {
+                     "proposal": mcp_obj,
+                     "async": "true"
+                     if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                     else "false",
+                 }

              payload = json.dumps(payload_dict)

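The emitter changes above add an optional server_config_refresh_interval (a TTL in seconds after which the cached /config response is re-fetched) and a preserve_unicode_escapes helper that rewrites non-ASCII characters as literal \uXXXX escape sequences before the proposal is serialized. A condensed, standalone sketch of that escaping idea (same logic as the diffed function, written with a lambda instead of the nested helper):

    import re
    from typing import Any

    def preserve_unicode_escapes(obj: Any) -> Any:
        """Recursively convert non-ASCII characters back to \\uXXXX escape sequences."""
        if isinstance(obj, dict):
            return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [preserve_unicode_escapes(item) for item in obj]
        elif isinstance(obj, str):
            # Replace every non-ASCII character with its escape sequence.
            return re.sub(r"[^\x00-\x7F]", lambda m: f"\\u{ord(m.group(0)):04x}", obj)
        return obj

    # Prints: {'name': 'caf\\u00e9', 'tags': ['r\\u00e9sum\\u00e9']}
    print(preserve_unicode_escapes({"name": "café", "tags": ["résumé"]}))
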
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
  import datahub._version as datahub_version
  from datahub.cli.check_cli import check
  from datahub.cli.cli_utils import (
+     enable_auto_decorators,
      fixup_gms_url,
      generate_access_token,
      make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
  from datahub.configuration.common import should_show_stack_trace
  from datahub.ingestion.graph.client import get_default_graph
  from datahub.ingestion.graph.config import ClientMode
- from datahub.telemetry import telemetry
  from datahub.utilities._custom_package_loader import model_version_name
  from datahub.utilities.logging_manager import configure_logging
  from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
      default=False,
      help="If passed will show server config. Assumes datahub init has happened.",
  )
- @telemetry.with_telemetry()
  def version(include_server: bool = False) -> None:
      """Print version number and exit."""

@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
      default=False,
      help="If passed then uses password to initialise token.",
  )
- @telemetry.with_telemetry()
  def init(use_password: bool = False) -> None:
      """Configure which datahub instance to connect to"""

@@ -218,6 +216,9 @@ except ImportError as e:
          make_shim_command("actions", "run `pip install acryl-datahub-actions`")
      )

+ # Adding telemetry and upgrade decorators to all commands
+ enable_auto_decorators(datahub)
+

  def main(**kwargs):
      # We use threads in a variety of places within our CLI. The multiprocessing
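The per-command @telemetry.with_telemetry() decorators are dropped in favor of a single enable_auto_decorators(datahub) call made once all commands are registered. Its implementation lives in datahub/cli/cli_utils.py and is not shown in this diff; purely as an illustration of the pattern, a hypothetical helper that walks a click group and wraps every registered callback could look like this:

    from typing import Callable

    import click

    def wrap_all_commands(group: click.Group, decorator: Callable) -> None:
        """Illustrative only: recursively wrap the callback of every command in a click group."""
        for cmd in group.commands.values():
            if isinstance(cmd, click.Group):
                wrap_all_commands(cmd, decorator)
            elif cmd.callback is not None:
                cmd.callback = decorator(cmd.callback)
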
datahub/ingestion/api/decorators.py CHANGED
@@ -1,12 +1,16 @@
+ # So that SourceCapabilityModifier can be resolved at runtime
+ from __future__ import annotations
+
  from dataclasses import dataclass
  from enum import Enum, auto
- from typing import Callable, Dict, Optional, Type
+ from typing import Callable, Dict, List, Optional, Type

  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.source import (
      Source,
      SourceCapability as SourceCapability,
  )
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


  def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
      capability: SourceCapability
      description: str
      supported: bool
+     subtype_modifier: Optional[List[SourceCapabilityModifier]] = None


  def capability(
-     capability_name: SourceCapability, description: str, supported: bool = True
+     capability_name: SourceCapability,
+     description: str,
+     supported: bool = True,
+     subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
  ) -> Callable[[Type], Type]:
      """
      A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
              for base in cls.__bases__
          ):
              cls.__capabilities = {}
+
          cls.get_capabilities = lambda: cls.__capabilities.values()

          # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
              cls.__capabilities.update(base_caps)

          cls.__capabilities[capability_name] = CapabilitySetting(
-             capability=capability_name, description=description, supported=supported
+             capability=capability_name,
+             description=description,
+             supported=supported,
+             subtype_modifier=subtype_modifier,
          )
          return cls

datahub/ingestion/api/report.py CHANGED
@@ -2,17 +2,31 @@ import dataclasses
  import json
  import logging
  import pprint
- from dataclasses import dataclass
+ from collections import defaultdict
+ from dataclasses import dataclass, field
  from datetime import datetime, timedelta
  from enum import Enum
- from typing import Any, Optional, runtime_checkable
+ from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable

  import humanfriendly
  import pydantic
  from pydantic import BaseModel
+ from tabulate import tabulate
  from typing_extensions import Literal, Protocol

+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.emitter.mcp_builder import mcps_from_mce
+ from datahub.ingestion.api.closeable import Closeable
  from datahub.ingestion.api.report_helpers import format_datetime_relative
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+ from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+ from datahub.metadata.schema_classes import (
+     MetadataChangeProposalClass,
+     SubTypesClass,
+     UpstreamLineageClass,
+ )
+ from datahub.utilities.file_backed_collections import FileBackedDict
  from datahub.utilities.lossy_collections import LossyList

  logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
          }

      def as_string(self) -> str:
-         return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
+         self_obj = self.as_obj()
+         _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+         # Format the main report data
+         result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+         # Add aspects_by_subtypes table if it exists
+         if _aspects_by_subtypes:
+             result += "\n\nAspects by Subtypes:\n"
+             result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+         return result
+
+     def _format_aspects_by_subtypes_table(
+         self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+     ) -> str:
+         """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+         if not aspects_by_subtypes:
+             return "No aspects by subtypes data available."
+
+         all_aspects: set[str] = {
+             aspect
+             for subtypes in aspects_by_subtypes.values()
+             for aspects in subtypes.values()
+             for aspect in aspects
+         }
+
+         aspect_rows = sorted(all_aspects)
+
+         entity_subtype_columns = []
+         for entity_type, subtypes in aspects_by_subtypes.items():
+             for subtype in subtypes:
+                 entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+         entity_subtype_columns.sort()
+
+         headers = ["Aspect"] + entity_subtype_columns
+
+         table_data = [
+             [aspect]
+             + [
+                 aspects.get(aspect, 0)
+                 for subtypes in aspects_by_subtypes.values()
+                 for aspects in subtypes.values()
+             ]
+             for aspect in aspect_rows
+         ]
+
+         if table_data:
+             return tabulate(table_data, headers=headers, tablefmt="grid")
+         else:
+             return "No aspects by subtypes data available."

      def as_json(self) -> str:
          return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@
      # TODO add helper method for warning / failure status + counts?


+ @dataclass
+ class SourceReportSubtypes:
+     urn: str
+     entity_type: str
+     subType: str = field(default="unknown")
+     aspects: Dict[str, int] = field(default_factory=dict)
+
+
  class ReportAttribute(BaseModel):
      severity: LogLevel = "DEBUG"
      help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
          logger.log(level=self.logger_sev, msg=msg, stacklevel=3)


+ @dataclass
+ class ExamplesReport(Report, Closeable):
+     aspects: Dict[str, Dict[str, int]] = field(
+         default_factory=lambda: defaultdict(lambda: defaultdict(int))
+     )
+     aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+         default_factory=lambda: defaultdict(
+             lambda: defaultdict(lambda: defaultdict(int))
+         )
+     )
+     samples: Dict[str, Dict[str, List[str]]] = field(
+         default_factory=lambda: defaultdict(lambda: defaultdict(list))
+     )
+     _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+     # We are adding this to make querying easier for fine-grained lineage
+     _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+     _samples_to_add: int = 20
+     _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+     def __post_init__(self) -> None:
+         self._file_based_dict = FileBackedDict(
+             tablename="urn_aspects",
+             extra_columns={
+                 "urn": lambda val: val.urn,
+                 "entityType": lambda val: val.entity_type,
+                 "subTypes": lambda val: val.subType,
+                 "aspects": lambda val: json.dumps(val.aspects),
+             },
+         )
+
+     def close(self) -> None:
+         self.compute_stats()
+         if self._file_based_dict is not None:
+             self._file_based_dict.close()
+             self._file_based_dict = None
+
+     def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+         """Build WHERE clause for matching any of the given aspects."""
+         if not aspects:
+             return ""
+
+         conditions = []
+         for aspect in aspects:
+             conditions.append(f"aspects LIKE '%{aspect}%'")
+
+         return " OR ".join(conditions)
+
+     def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+         """Helper method to collect samples organized by subtype for a given where clause."""
+
+         subtype_query = f"""
+             SELECT DISTINCT subTypes
+             FROM urn_aspects
+             WHERE {where_clause}
+         """
+         assert self._file_based_dict is not None
+         subtypes = set()
+         for row in self._file_based_dict.sql_query(subtype_query):
+             sub_type = row["subTypes"] or "unknown"
+             subtypes.add(sub_type)
+
+         for sub_type in subtypes:
+             query = f"""
+                 SELECT urn
+                 FROM urn_aspects
+                 WHERE {where_clause} AND subTypes = ?
+                 limit {self._samples_to_add}
+             """
+
+             for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                 self.samples[sample_key][sub_type].append(row["urn"])
+
+     def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+         """Helper method to collect samples for entities that have any of the given aspects."""
+         if not aspects:
+             return
+
+         where_clause = self._build_aspects_where_clause(aspects)
+         self._collect_samples_by_subtype(where_clause, sample_key)
+
+     def _collect_samples_by_lineage_aspects(
+         self, aspects: List[str], sample_key: str
+     ) -> None:
+         """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+         Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+         """
+         if not aspects:
+             return
+
+         lineage_conditions = []
+         for aspect in aspects:
+             lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+         where_clause = " OR ".join(lineage_conditions)
+         self._collect_samples_by_subtype(where_clause, sample_key)
+
+     def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+         """
+         Collect samples for entities that have lineage, profiling, and usage aspects.
+         These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+         """
+         if not self._lineage_aspects_seen:
+             return
+         assert self._file_based_dict is not None
+
+         # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+         lineage_conditions = []
+         for aspect in self._lineage_aspects_seen:
+             lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+         lineage_where_clause = " OR ".join(lineage_conditions)
+
+         # Build profiling conditions using the same logic as _collect_samples_by_aspects
+         profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+         # Build usage conditions using the same logic as _collect_samples_by_aspects
+         usage_where_clause = self._build_aspects_where_clause(
+             [
+                 "datasetUsageStatistics",
+                 "chartUsageStatistics",
+                 "dashboardUsageStatistics",
+             ]
+         )
+
+         query = f"""
+             SELECT urn, subTypes
+             FROM urn_aspects
+             WHERE ({lineage_where_clause})
+             AND ({profiling_where_clause})
+             AND ({usage_where_clause})
+             limit {self._samples_to_add}
+         """
+
+         for row in self._file_based_dict.sql_query(query):
+             sub_type = row["subTypes"] or "unknown"
+             self.samples[sample_key][sub_type].append(row["urn"])
+
+     def _has_fine_grained_lineage(
+         self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+     ) -> bool:
+         if isinstance(mcp.aspect, UpstreamLineageClass):
+             upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+             if upstream_lineage.fineGrainedLineages:
+                 return True
+         return False
+
+     def _update_file_based_dict(
+         self,
+         urn: str,
+         entityType: str,
+         aspectName: str,
+         mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+     ) -> None:
+         if is_lineage_aspect(entityType, aspectName):
+             self._lineage_aspects_seen.add(aspectName)
+         has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+         sub_type = "unknown"
+         if isinstance(mcp.aspect, SubTypesClass):
+             sub_type = mcp.aspect.typeNames[0]
+
+         assert self._file_based_dict is not None
+         if urn in self._file_based_dict:
+             if sub_type != "unknown":
+                 self._file_based_dict[urn].subType = sub_type
+             aspects_dict = self._file_based_dict[urn].aspects
+             if aspectName in aspects_dict:
+                 aspects_dict[aspectName] += 1
+             else:
+                 aspects_dict[aspectName] = 1
+             if has_fine_grained_lineage:
+                 if self._fine_grained_lineage_special_case_name in aspects_dict:
+                     aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                 else:
+                     aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+             self._file_based_dict.mark_dirty(urn)
+         else:
+             aspects_dict = {aspectName: 1}
+             if has_fine_grained_lineage:
+                 aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+             self._file_based_dict[urn] = SourceReportSubtypes(
+                 urn=urn,
+                 entity_type=entityType,
+                 subType=sub_type,
+                 aspects=aspects_dict,
+             )
+
+     def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+         urn = wu.get_urn()
+
+         if not isinstance(wu.metadata, MetadataChangeEvent):
+             mcps = [wu.metadata]
+         else:
+             mcps = list(mcps_from_mce(wu.metadata))
+
+         for mcp in mcps:
+             entityType = mcp.entityType
+             aspectName = mcp.aspectName
+
+             if aspectName is None:
+                 continue
+
+             self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+     def compute_stats(self) -> None:
+         if self._file_based_dict is None:
+             return
+
+         query = """
+             SELECT entityType, subTypes, aspects, count(*) as count
+             FROM urn_aspects
+             group by entityType, subTypes, aspects
+         """
+
+         entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+             defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+         )
+         for row in self._file_based_dict.sql_query(query):
+             entity_type = row["entityType"]
+             sub_type = row["subTypes"]
+             count = row["count"]
+             aspects_raw = row["aspects"] or "[]"
+
+             aspects = json.loads(aspects_raw)
+             for aspect, aspect_count in aspects.items():
+                 entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                     aspect_count * count
+                 )
+
+         self.aspects.clear()
+         self.aspects_by_subtypes.clear()
+         _aspects_seen: Set[str] = set()
+         for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+             for sub_type, aspect_counts in subtype_counts.items():
+                 for aspect, count in aspect_counts.items():
+                     self.aspects[entity_type][aspect] += count
+                     _aspects_seen.add(aspect)
+                 self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+         self.samples.clear()
+         self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+         self._collect_samples_by_aspects(
+             [
+                 "datasetUsageStatistics",
+                 "chartUsageStatistics",
+                 "dashboardUsageStatistics",
+             ],
+             "usage",
+         )
+         self._collect_samples_by_lineage_aspects(
+             list(self._lineage_aspects_seen), "lineage"
+         )
+         self._collect_samples_with_all_conditions("all_3")
+
+
  class EntityFilterReport(ReportAttribute):
      type: str

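The new as_string() output appends an "Aspects by Subtypes" grid built with tabulate, one row per aspect and one column per "entityType (subType)" pair. A quick standalone illustration of that format (the numbers below are made up for the example):

    from tabulate import tabulate

    headers = ["Aspect", "dataset (Table)", "dataset (View)"]
    table_data = [
        ["schemaMetadata", 120, 35],
        ["datasetProfile", 80, 0],
    ]
    # Renders an ASCII grid table, matching tablefmt="grid" used in the report.
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
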
datahub/ingestion/api/sink.py CHANGED
@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
      def close(self) -> None:
          pass

+     def flush(self) -> None:
+         pass
+
      def configured(self) -> str:
          """Override this method to output a human-readable and scrubbed version of the configured sink"""
          return ""