acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (221)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/emitter/rest_emitter.py +70 -12
  36. datahub/entrypoints.py +4 -3
  37. datahub/ingestion/api/decorators.py +15 -3
  38. datahub/ingestion/api/report.py +332 -3
  39. datahub/ingestion/api/sink.py +3 -0
  40. datahub/ingestion/api/source.py +48 -44
  41. datahub/ingestion/autogenerated/__init__.py +0 -0
  42. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  43. datahub/ingestion/autogenerated/lineage.json +401 -0
  44. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  45. datahub/ingestion/extractor/schema_util.py +13 -4
  46. datahub/ingestion/glossary/classification_mixin.py +5 -0
  47. datahub/ingestion/graph/client.py +100 -15
  48. datahub/ingestion/graph/config.py +1 -0
  49. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  50. datahub/ingestion/run/pipeline.py +54 -2
  51. datahub/ingestion/sink/datahub_rest.py +13 -0
  52. datahub/ingestion/source/abs/source.py +1 -1
  53. datahub/ingestion/source/aws/aws_common.py +4 -0
  54. datahub/ingestion/source/aws/glue.py +489 -244
  55. datahub/ingestion/source/aws/tag_entities.py +292 -0
  56. datahub/ingestion/source/azure/azure_common.py +2 -2
  57. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  58. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  59. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  60. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  61. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  62. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  63. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  64. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  65. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  66. datahub/ingestion/source/common/subtypes.py +45 -0
  67. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  68. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  69. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  70. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  71. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  72. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  73. datahub/ingestion/source/debug/__init__.py +0 -0
  74. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  75. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  76. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  77. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  78. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  79. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  80. datahub/ingestion/source/file.py +3 -0
  81. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  82. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  83. datahub/ingestion/source/ge_data_profiler.py +76 -28
  84. datahub/ingestion/source/ge_profiling_config.py +11 -0
  85. datahub/ingestion/source/hex/api.py +26 -1
  86. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +1 -1
  88. datahub/ingestion/source/identity/okta.py +1 -14
  89. datahub/ingestion/source/kafka/kafka.py +16 -0
  90. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  91. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  92. datahub/ingestion/source/looker/looker_source.py +1 -0
  93. datahub/ingestion/source/mlflow.py +11 -1
  94. datahub/ingestion/source/mock_data/__init__.py +0 -0
  95. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  97. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  98. datahub/ingestion/source/nifi.py +1 -1
  99. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  100. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  101. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  102. datahub/ingestion/source/preset.py +2 -2
  103. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  104. datahub/ingestion/source/redshift/redshift.py +21 -1
  105. datahub/ingestion/source/redshift/usage.py +4 -3
  106. datahub/ingestion/source/s3/report.py +4 -2
  107. datahub/ingestion/source/s3/source.py +367 -115
  108. datahub/ingestion/source/sac/sac.py +3 -1
  109. datahub/ingestion/source/salesforce.py +6 -3
  110. datahub/ingestion/source/sigma/sigma.py +7 -1
  111. datahub/ingestion/source/slack/slack.py +2 -1
  112. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  113. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  114. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  115. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  116. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  117. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  118. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  119. datahub/ingestion/source/sql/athena.py +119 -11
  120. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  121. datahub/ingestion/source/sql/clickhouse.py +3 -1
  122. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  123. datahub/ingestion/source/sql/hana.py +3 -1
  124. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  125. datahub/ingestion/source/sql/mariadb.py +0 -1
  126. datahub/ingestion/source/sql/mssql/source.py +239 -34
  127. datahub/ingestion/source/sql/mysql.py +0 -1
  128. datahub/ingestion/source/sql/oracle.py +1 -1
  129. datahub/ingestion/source/sql/postgres.py +0 -1
  130. datahub/ingestion/source/sql/sql_common.py +121 -34
  131. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  132. datahub/ingestion/source/sql/teradata.py +997 -235
  133. datahub/ingestion/source/sql/vertica.py +10 -6
  134. datahub/ingestion/source/sql_queries.py +2 -2
  135. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  136. datahub/ingestion/source/superset.py +58 -3
  137. datahub/ingestion/source/tableau/tableau.py +58 -37
  138. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  139. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  140. datahub/ingestion/source/unity/config.py +5 -0
  141. datahub/ingestion/source/unity/proxy.py +118 -0
  142. datahub/ingestion/source/unity/source.py +195 -17
  143. datahub/ingestion/source/unity/tag_entities.py +295 -0
  144. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  145. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  146. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  147. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  148. datahub/metadata/_internal_schema_classes.py +1433 -546
  149. datahub/metadata/_urns/urn_defs.py +1826 -1658
  150. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  151. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  157. datahub/metadata/schema.avsc +17736 -17112
  158. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  159. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  160. datahub/metadata/schemas/Applications.avsc +38 -0
  161. datahub/metadata/schemas/ChartKey.avsc +1 -0
  162. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  164. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  165. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  166. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  167. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  168. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  169. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  170. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  171. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  172. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  173. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  176. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  177. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  178. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  179. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  180. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  181. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  182. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  183. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  184. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  185. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  186. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  187. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  188. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  189. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  190. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  191. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  192. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  193. datahub/metadata/schemas/__init__.py +3 -3
  194. datahub/sdk/__init__.py +2 -0
  195. datahub/sdk/_all_entities.py +7 -0
  196. datahub/sdk/_shared.py +116 -0
  197. datahub/sdk/chart.py +315 -0
  198. datahub/sdk/container.py +7 -0
  199. datahub/sdk/dashboard.py +432 -0
  200. datahub/sdk/dataflow.py +7 -0
  201. datahub/sdk/datajob.py +45 -13
  202. datahub/sdk/dataset.py +8 -2
  203. datahub/sdk/entity_client.py +82 -2
  204. datahub/sdk/lineage_client.py +683 -82
  205. datahub/sdk/main_client.py +46 -16
  206. datahub/sdk/mlmodel.py +101 -38
  207. datahub/sdk/mlmodelgroup.py +7 -0
  208. datahub/sdk/search_client.py +4 -3
  209. datahub/specific/chart.py +1 -1
  210. datahub/specific/dataproduct.py +4 -0
  211. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  212. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  213. datahub/telemetry/telemetry.py +17 -11
  214. datahub/testing/sdk_v2_helpers.py +7 -1
  215. datahub/upgrade/upgrade.py +46 -13
  216. datahub/utilities/server_config_util.py +8 -0
  217. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  218. datahub/utilities/stats_collections.py +4 -0
  219. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
  220. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
 import datahub._version as datahub_version
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import (
+    enable_auto_decorators,
     fixup_gms_url,
     generate_access_token,
     make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.graph.config import ClientMode
-from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
 from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
     default=False,
     help="If passed will show server config. Assumes datahub init has happened.",
 )
-@telemetry.with_telemetry()
 def version(include_server: bool = False) -> None:
     """Print version number and exit."""
 
@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
     default=False,
     help="If passed then uses password to initialise token.",
 )
-@telemetry.with_telemetry()
 def init(use_password: bool = False) -> None:
     """Configure which datahub instance to connect to"""
 
@@ -218,6 +216,9 @@ except ImportError as e:
         make_shim_command("actions", "run `pip install acryl-datahub-actions`")
     )
 
+# Adding telemetry and upgrade decorators to all commands
+enable_auto_decorators(datahub)
+
 
 def main(**kwargs):
     # We use threads in a variety of places within our CLI. The multiprocessing
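
The removed per-command `@telemetry.with_telemetry()` decorators are superseded by the single `enable_auto_decorators(datahub)` call above, which applies the telemetry and upgrade decorators to every registered command at once. A minimal sketch of how such a pass over a click group can work (illustrative only; the actual implementation lives in datahub/cli/cli_utils.py):

    import click

    def apply_to_all_commands(group: click.Group, decorator) -> None:
        # Recursively wrap the callback of every command in a click group.
        for command in group.commands.values():
            if isinstance(command, click.Group):
                apply_to_all_commands(command, decorator)
            if command.callback is not None:
                command.callback = decorator(command.callback)
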
datahub/ingestion/api/decorators.py CHANGED
@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 
 
 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None
 
 
 def capability(
-    capability_name: SourceCapability, description: str, supported: bool = True
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
             cls.get_capabilities = lambda: cls.__capabilities.values()
 
         # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
             cls.__capabilities.update(base_caps)
 
         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name, description=description, supported=supported
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls
 
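
With the new parameter, a connector can declare which container subtypes a capability applies to. A usage sketch under assumptions (the `SCHEMA` member of `SourceCapabilityModifier` is assumed here for illustration; the real members are defined in datahub/ingestion/source/common/subtypes.py, which this release extends by 45 lines):

    from datahub.ingestion.api.decorators import capability
    from datahub.ingestion.api.source import SourceCapability
    from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

    @capability(
        SourceCapability.CONTAINERS,
        "Enabled by default",
        # Assumed enum member, for illustration only.
        subtype_modifier=[SourceCapabilityModifier.SCHEMA],
    )
    class MyDemoSource:  # a real connector would subclass Source
        pass
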
datahub/ingestion/api/report.py CHANGED
@@ -2,17 +2,31 @@ import dataclasses
 import json
 import logging
 import pprint
-from dataclasses import dataclass
+from collections import defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Optional, runtime_checkable
+from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
 
 import humanfriendly
 import pydantic
 from pydantic import BaseModel
+from tabulate import tabulate
 from typing_extensions import Literal, Protocol
 
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import mcps_from_mce
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report_helpers import format_datetime_relative
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.schema_classes import (
+    MetadataChangeProposalClass,
+    SubTypesClass,
+    UpstreamLineageClass,
+)
+from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
         }
 
     def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
+        self_obj = self.as_obj()
+        _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+        # Format the main report data
+        result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+        # Add aspects_by_subtypes table if it exists
+        if _aspects_by_subtypes:
+            result += "\n\nAspects by Subtypes:\n"
+            result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+        return result
+
+    def _format_aspects_by_subtypes_table(
+        self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+    ) -> str:
+        """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+        if not aspects_by_subtypes:
+            return "No aspects by subtypes data available."
+
+        all_aspects: set[str] = {
+            aspect
+            for subtypes in aspects_by_subtypes.values()
+            for aspects in subtypes.values()
+            for aspect in aspects
+        }
+
+        aspect_rows = sorted(all_aspects)
+
+        entity_subtype_columns = []
+        for entity_type, subtypes in aspects_by_subtypes.items():
+            for subtype in subtypes:
+                entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+        entity_subtype_columns.sort()
+
+        headers = ["Aspect"] + entity_subtype_columns
+
+        table_data = [
+            [aspect]
+            + [
+                aspects.get(aspect, 0)
+                for subtypes in aspects_by_subtypes.values()
+                for aspects in subtypes.values()
+            ]
+            for aspect in aspect_rows
+        ]
+
+        if table_data:
+            return tabulate(table_data, headers=headers, tablefmt="grid")
+        else:
+            return "No aspects by subtypes data available."
 
     def as_json(self) -> str:
         return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@ class Report(SupportsAsObj):
     # TODO add helper method for warning / failure status + counts?
 
 
+@dataclass
+class SourceReportSubtypes:
+    urn: str
+    entity_type: str
+    subType: str = field(default="unknown")
+    aspects: Dict[str, int] = field(default_factory=dict)
+
+
 class ReportAttribute(BaseModel):
     severity: LogLevel = "DEBUG"
     help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
         logger.log(level=self.logger_sev, msg=msg, stacklevel=3)
 
 
+@dataclass
+class ExamplesReport(Report, Closeable):
+    aspects: Dict[str, Dict[str, int]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(int))
+    )
+    aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
+    samples: Dict[str, Dict[str, List[str]]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(list))
+    )
+    _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+    # We are adding this to make querying easier for fine-grained lineage
+    _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+    _samples_to_add: int = 20
+    _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+    def __post_init__(self) -> None:
+        self._file_based_dict = FileBackedDict(
+            tablename="urn_aspects",
+            extra_columns={
+                "urn": lambda val: val.urn,
+                "entityType": lambda val: val.entity_type,
+                "subTypes": lambda val: val.subType,
+                "aspects": lambda val: json.dumps(val.aspects),
+            },
+        )
+
+    def close(self) -> None:
+        self.compute_stats()
+        if self._file_based_dict is not None:
+            self._file_based_dict.close()
+            self._file_based_dict = None
+
+    def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+        """Build WHERE clause for matching any of the given aspects."""
+        if not aspects:
+            return ""
+
+        conditions = []
+        for aspect in aspects:
+            conditions.append(f"aspects LIKE '%{aspect}%'")
+
+        return " OR ".join(conditions)
+
+    def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+        """Helper method to collect samples organized by subtype for a given where clause."""
+
+        subtype_query = f"""
+            SELECT DISTINCT subTypes
+            FROM urn_aspects
+            WHERE {where_clause}
+        """
+        assert self._file_based_dict is not None
+        subtypes = set()
+        for row in self._file_based_dict.sql_query(subtype_query):
+            sub_type = row["subTypes"] or "unknown"
+            subtypes.add(sub_type)
+
+        for sub_type in subtypes:
+            query = f"""
+                SELECT urn
+                FROM urn_aspects
+                WHERE {where_clause} AND subTypes = ?
+                limit {self._samples_to_add}
+            """
+
+            for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+        """Helper method to collect samples for entities that have any of the given aspects."""
+        if not aspects:
+            return
+
+        where_clause = self._build_aspects_where_clause(aspects)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_by_lineage_aspects(
+        self, aspects: List[str], sample_key: str
+    ) -> None:
+        """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+        Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+        """
+        if not aspects:
+            return
+
+        lineage_conditions = []
+        for aspect in aspects:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+        where_clause = " OR ".join(lineage_conditions)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+        """
+        Collect samples for entities that have lineage, profiling, and usage aspects.
+        These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+        """
+        if not self._lineage_aspects_seen:
+            return
+        assert self._file_based_dict is not None
+
+        # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+        lineage_conditions = []
+        for aspect in self._lineage_aspects_seen:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+        lineage_where_clause = " OR ".join(lineage_conditions)
+
+        # Build profiling conditions using the same logic as _collect_samples_by_aspects
+        profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+        # Build usage conditions using the same logic as _collect_samples_by_aspects
+        usage_where_clause = self._build_aspects_where_clause(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ]
+        )
+
+        query = f"""
+            SELECT urn, subTypes
+            FROM urn_aspects
+            WHERE ({lineage_where_clause})
+            AND ({profiling_where_clause})
+            AND ({usage_where_clause})
+            limit {self._samples_to_add}
+        """
+
+        for row in self._file_based_dict.sql_query(query):
+            sub_type = row["subTypes"] or "unknown"
+            self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _has_fine_grained_lineage(
+        self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+    ) -> bool:
+        if isinstance(mcp.aspect, UpstreamLineageClass):
+            upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+            if upstream_lineage.fineGrainedLineages:
+                return True
+        return False
+
+    def _update_file_based_dict(
+        self,
+        urn: str,
+        entityType: str,
+        aspectName: str,
+        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+    ) -> None:
+        if is_lineage_aspect(entityType, aspectName):
+            self._lineage_aspects_seen.add(aspectName)
+        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+        sub_type = "unknown"
+        if isinstance(mcp.aspect, SubTypesClass):
+            sub_type = mcp.aspect.typeNames[0]
+
+        assert self._file_based_dict is not None
+        if urn in self._file_based_dict:
+            if sub_type != "unknown":
+                self._file_based_dict[urn].subType = sub_type
+            aspects_dict = self._file_based_dict[urn].aspects
+            if aspectName in aspects_dict:
+                aspects_dict[aspectName] += 1
+            else:
+                aspects_dict[aspectName] = 1
+            if has_fine_grained_lineage:
+                if self._fine_grained_lineage_special_case_name in aspects_dict:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                else:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict.mark_dirty(urn)
+        else:
+            aspects_dict = {aspectName: 1}
+            if has_fine_grained_lineage:
+                aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict[urn] = SourceReportSubtypes(
+                urn=urn,
+                entity_type=entityType,
+                subType=sub_type,
+                aspects=aspects_dict,
+            )
+
+    def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+        urn = wu.get_urn()
+
+        if not isinstance(wu.metadata, MetadataChangeEvent):
+            mcps = [wu.metadata]
+        else:
+            mcps = list(mcps_from_mce(wu.metadata))
+
+        for mcp in mcps:
+            entityType = mcp.entityType
+            aspectName = mcp.aspectName
+
+            if aspectName is None:
+                continue
+
+            self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+    def compute_stats(self) -> None:
+        if self._file_based_dict is None:
+            return
+
+        query = """
+            SELECT entityType, subTypes, aspects, count(*) as count
+            FROM urn_aspects
+            group by entityType, subTypes, aspects
+        """
+
+        entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
+        for row in self._file_based_dict.sql_query(query):
+            entity_type = row["entityType"]
+            sub_type = row["subTypes"]
+            count = row["count"]
+            aspects_raw = row["aspects"] or "[]"
+
+            aspects = json.loads(aspects_raw)
+            for aspect, aspect_count in aspects.items():
+                entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                    aspect_count * count
+                )
+
+        self.aspects.clear()
+        self.aspects_by_subtypes.clear()
+        _aspects_seen: Set[str] = set()
+        for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+            for sub_type, aspect_counts in subtype_counts.items():
+                for aspect, count in aspect_counts.items():
+                    self.aspects[entity_type][aspect] += count
+                    _aspects_seen.add(aspect)
+                self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+        self.samples.clear()
+        self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+        self._collect_samples_by_aspects(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ],
+            "usage",
+        )
+        self._collect_samples_by_lineage_aspects(
+            list(self._lineage_aspects_seen), "lineage"
+        )
+        self._collect_samples_with_all_conditions("all_3")
+
+
 class EntityFilterReport(ReportAttribute):
     type: str
 
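
To get a feel for the new `as_string()` output, here is a standalone sketch of the same `tabulate` call used by `_format_aspects_by_subtypes_table`, fed with invented counts shaped like `aspects_by_subtypes` ({entity_type: {subtype: {aspect: count}}}):

    from tabulate import tabulate

    headers = ["Aspect", "container (Schema)", "dataset (Table)"]
    rows = [
        ["schemaMetadata", 0, 42],
        ["status", 5, 42],
    ]
    # Renders an ASCII grid with aspects as rows and entity (subtype) columns.
    print(tabulate(rows, headers=headers, tablefmt="grid"))
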
datahub/ingestion/api/sink.py CHANGED
@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     def close(self) -> None:
         pass
 
+    def flush(self) -> None:
+        pass
+
     def configured(self) -> str:
         """Override this method to output a human-readable and scrubbed version of the configured sink"""
         return ""
datahub/ingestion/api/source.py CHANGED
@@ -2,7 +2,6 @@ import contextlib
 import datetime
 import logging
 from abc import ABCMeta, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
 from functools import partial
@@ -15,7 +14,6 @@ from typing import (
     List,
     Optional,
     Sequence,
-    Set,
     Type,
     TypeVar,
     Union,
@@ -28,7 +26,6 @@ from typing_extensions import LiteralString, Self
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -37,7 +34,7 @@ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
-from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.report import ExamplesReport, Report
 from datahub.ingestion.api.source_helpers import (
     AutoSystemMetadata,
     auto_browse_path_v2,
@@ -50,9 +47,8 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.schema_classes import UpstreamLineageClass
 from datahub.sdk.entity import Entity
+from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
 
@@ -76,6 +72,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"
 
 
 class StructuredLogLevel(Enum):
@@ -190,20 +187,11 @@ class StructuredLogs(Report):
 
 
 @dataclass
-class SourceReport(Report):
+class SourceReport(ExamplesReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
-    _urns_seen: Set[str] = field(default_factory=set)
-    entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
-    aspects: Dict[str, Dict[str, int]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(int))
-    )
-    aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
-    )
-
     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
 
     @property
@@ -220,33 +208,10 @@ class SourceReport(Report):
 
     def report_workunit(self, wu: WorkUnit) -> None:
         self.events_produced += 1
+        if not isinstance(wu, MetadataWorkUnit):
+            return
 
-        if isinstance(wu, MetadataWorkUnit):
-            urn = wu.get_urn()
-
-            # Specialized entity reporting.
-            if not isinstance(wu.metadata, MetadataChangeEvent):
-                mcps = [wu.metadata]
-            else:
-                mcps = list(mcps_from_mce(wu.metadata))
-
-            for mcp in mcps:
-                entityType = mcp.entityType
-                aspectName = mcp.aspectName
-
-                if urn not in self._urns_seen:
-                    self._urns_seen.add(urn)
-                    self.entities[entityType].append(urn)
-
-                if aspectName is not None:  # usually true
-                    self.aspects[entityType][aspectName] += 1
-                    self.aspect_urn_samples[entityType][aspectName].append(urn)
-                    if isinstance(mcp.aspect, UpstreamLineageClass):
-                        upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
-                        if upstream_lineage.fineGrainedLineages:
-                            self.aspect_urn_samples[entityType][
-                                "fineGrainedLineages"
-                            ].append(urn)
+        super()._store_workunit_data(wu)
 
     def report_warning(
         self,
@@ -265,9 +230,10 @@ class SourceReport(Report):
         context: Optional[str] = None,
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
+        log: bool = True,
     ) -> None:
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN, message, title, context, exc, log=True
+            StructuredLogLevel.WARN, message, title, context, exc, log=log
        )
 
     def report_failure(
@@ -325,6 +291,7 @@ class SourceReport(Report):
         )
 
     def __post_init__(self) -> None:
+        super().__post_init__()
         self.start_time = datetime.datetime.now()
         self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
 
@@ -337,6 +304,43 @@ class SourceReport(Report):
             "infos": Report.to_pure_python_obj(self.infos),
         }
 
+    @staticmethod
+    def _discretize_dict_values(
+        nested_dict: Dict[str, Dict[str, int]],
+    ) -> Dict[str, Dict[str, int]]:
+        """Helper method to discretize values in a nested dictionary structure."""
+        result = {}
+        for outer_key, inner_dict in nested_dict.items():
+            discretized_dict: Dict[str, int] = {}
+            for inner_key, count in inner_dict.items():
+                discretized_dict[inner_key] = stats.discretize(count)
+            result[outer_key] = discretized_dict
+        return result
+
+    def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
+        """Convert the nested defaultdict aspects to a regular dict for serialization."""
+        return self._discretize_dict_values(self.aspects)
+
+    def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Get aspect counts grouped by entity type and subtype."""
+        return self._discretize_dict_values_nested(self.aspects_by_subtypes)
+
+    @staticmethod
+    def _discretize_dict_values_nested(
+        nested_dict: Dict[str, Dict[str, Dict[str, int]]],
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Helper method to discretize values in a nested dictionary structure with three levels."""
+        result = {}
+        for outer_key, middle_dict in nested_dict.items():
+            discretized_middle_dict: Dict[str, Dict[str, int]] = {}
+            for middle_key, inner_dict in middle_dict.items():
+                discretized_inner_dict: Dict[str, int] = {}
+                for inner_key, count in inner_dict.items():
+                    discretized_inner_dict[inner_key] = stats.discretize(count)
+                discretized_middle_dict[middle_key] = discretized_inner_dict
+            result[outer_key] = discretized_middle_dict
+        return result
+
     def compute_stats(self) -> None:
         super().compute_stats()
 
@@ -503,7 +507,7 @@ class Source(Closeable, metaclass=ABCMeta):
         pass
 
     def close(self) -> None:
-        pass
+        self.get_report().close()
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
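
Two behavioral notes on this file: `report_warning()` now forwards a `log` flag, so a source can record a structured warning without also emitting a log line, and the default `Source.close()` now closes the report, which flushes and releases the `FileBackedDict` behind the new subtype statistics. A small sketch of the quiet-warning pattern (the message and context values are invented):

    from datahub.ingestion.api.source import SourceReport

    def warn_quietly(report: SourceReport, table_name: str) -> None:
        # Recorded in the structured report, but no log line is emitted.
        report.report_warning(
            message="Table skipped due to unsupported format",
            context=table_name,
            log=False,
        )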