acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (74)
  1. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2609 -2608
  2. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +74 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/external_tag.py +6 -4
  6. datahub/api/entities/external/lake_formation_external_entites.py +50 -49
  7. datahub/api/entities/external/restricted_text.py +105 -180
  8. datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
  9. datahub/api/entities/forms/forms.py +3 -3
  10. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  11. datahub/cli/quickstart_versioning.py +1 -1
  12. datahub/cli/specific/assertions_cli.py +37 -2
  13. datahub/cli/specific/datacontract_cli.py +54 -4
  14. datahub/emitter/rest_emitter.py +18 -5
  15. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  16. datahub/ingestion/api/report.py +21 -2
  17. datahub/ingestion/api/source.py +81 -7
  18. datahub/ingestion/autogenerated/capability_summary.json +47 -19
  19. datahub/ingestion/graph/client.py +19 -3
  20. datahub/ingestion/sink/datahub_rest.py +2 -0
  21. datahub/ingestion/source/abs/config.py +1 -1
  22. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  23. datahub/ingestion/source/abs/source.py +9 -0
  24. datahub/ingestion/source/aws/glue.py +18 -2
  25. datahub/ingestion/source/aws/tag_entities.py +4 -4
  26. datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
  27. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  28. datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
  29. datahub/ingestion/source/dbt/dbt_common.py +10 -0
  30. datahub/ingestion/source/delta_lake/source.py +8 -1
  31. datahub/ingestion/source/dremio/dremio_source.py +19 -2
  32. datahub/ingestion/source/fivetran/fivetran.py +9 -3
  33. datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
  34. datahub/ingestion/source/ge_data_profiler.py +8 -0
  35. datahub/ingestion/source/grafana/models.py +6 -0
  36. datahub/ingestion/source/hex/hex.py +1 -1
  37. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  38. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  39. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  40. datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
  41. datahub/ingestion/source/powerbi/powerbi.py +4 -1
  42. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  43. datahub/ingestion/source/redshift/datashares.py +1 -1
  44. datahub/ingestion/source/redshift/redshift.py +1 -0
  45. datahub/ingestion/source/salesforce.py +8 -0
  46. datahub/ingestion/source/slack/slack.py +7 -14
  47. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  48. datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
  49. datahub/ingestion/source/sql/hive_metastore.py +8 -0
  50. datahub/ingestion/source/sql/teradata.py +8 -1
  51. datahub/ingestion/source/sql/trino.py +9 -0
  52. datahub/ingestion/source/tableau/tableau.py +1 -1
  53. datahub/ingestion/source/unity/config.py +36 -1
  54. datahub/ingestion/source/unity/proxy.py +332 -46
  55. datahub/ingestion/source/unity/proxy_types.py +12 -2
  56. datahub/ingestion/source/unity/source.py +91 -34
  57. datahub/ingestion/source/unity/tag_entities.py +5 -5
  58. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  59. datahub/ingestion/transformer/base_transformer.py +8 -5
  60. datahub/metadata/_internal_schema_classes.py +513 -513
  61. datahub/metadata/_urns/urn_defs.py +1684 -1684
  62. datahub/metadata/schema.avsc +16745 -16348
  63. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  64. datahub/sdk/entity_client.py +22 -7
  65. datahub/sdk/search_client.py +3 -0
  66. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  67. datahub/specific/datajob.py +15 -1
  68. datahub/specific/dataset.py +37 -59
  69. datahub/utilities/mapping.py +29 -2
  70. datahub/utilities/server_config_util.py +2 -1
  71. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
  72. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
  73. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
  74. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
@@ -44,7 +44,7 @@ def get_minimum_supported_version_message(version: str) -> str:
 class QuickstartExecutionPlan(BaseModel):
     composefile_git_ref: str
     docker_tag: str
-    mysql_tag: Optional[str]
+    mysql_tag: Optional[str] = None
 
 
 def _is_it_a_version(version: str) -> bool:
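Context for the quickstart_versioning change above: under pydantic v2, an `Optional[...]` annotation alone no longer implies a default, so the field needs an explicit `= None` to stay optional. A minimal, illustrative sketch (not the real QuickstartExecutionPlan):

from typing import Optional

from pydantic import BaseModel, ValidationError


class PlanWithoutDefault(BaseModel):
    mysql_tag: Optional[str]  # required under pydantic v2: may be None, but must be provided


class PlanWithDefault(BaseModel):
    mysql_tag: Optional[str] = None  # truly optional: may be omitted entirely


try:
    PlanWithoutDefault()  # raises ValidationError under pydantic v2
except ValidationError as e:
    print(e)

print(PlanWithDefault())  # mysql_tag=None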
@@ -1,3 +1,8 @@
+"""
+DEPRECATED: This assertions CLI is no longer supported and will be removed in a future version.
+Please use alternative methods for managing assertions in DataHub.
+"""
+
 import logging
 import os
 from pathlib import Path
@@ -26,7 +31,18 @@ REPORT_FILE_NAME = "compile_report.json"
 
 @click.group(cls=DefaultGroup, default="upsert")
 def assertions() -> None:
-    """A group of commands to interact with the Assertion entity in DataHub."""
+    """A group of commands to interact with the Assertion entity in DataHub.
+
+    ⚠️ DEPRECATED: This assertions CLI is no longer supported and will be removed
+    in a future version. Please use alternative methods for managing assertions in DataHub.
+    """
+    click.secho(
+        "⚠️ WARNING: The assertions CLI is deprecated and no longer supported. "
+        "It may be removed in a future version. Please use alternative methods for managing assertions in DataHub.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
     pass
 
 
@@ -34,7 +50,16 @@ def assertions() -> None:
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @upgrade.check_upgrade
 def upsert(file: str) -> None:
-    """Upsert (create or update) a set of assertions in DataHub."""
+    """Upsert (create or update) a set of assertions in DataHub.
+
+    ⚠️ DEPRECATED: This command is deprecated and no longer supported.
+    """
+    click.secho(
+        "⚠️ WARNING: The 'upsert' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
 
     assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file)
 
@@ -78,7 +103,15 @@ def compile(
     In future, we may introduce separate command to automatically apply these compiled changes
     in assertion platform. Currently, generated result artifacts are stored in target folder
     unless another folder is specified using option `--output-to <folder>`.
+
+    ⚠️ DEPRECATED: This command is deprecated and no longer supported.
     """
+    click.secho(
+        "⚠️ WARNING: The 'compile' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
 
     if platform not in ASSERTION_PLATFORMS:
         click.secho(
@@ -146,3 +179,5 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
 # Later:
 # 3. execute compiled assertions on assertion platform (Later, requires connection details to platform),
 # 4. cleanup assertions from assertion platform (generate artifacts. optionally execute)
+#
+# NOTE: This entire assertions CLI is deprecated and these TODOs will not be implemented.
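A side note on the banner pattern used throughout these assertions hunks: `err=True` routes the warning to stderr so stdout stays clean for anything scripts might parse. A small self-contained sketch of that pattern (illustrative, not taken from the package):

import click


@click.command()
def demo() -> None:
    # The deprecation banner goes to stderr; regular command output stays on stdout.
    click.secho("WARNING: this command is deprecated.", fg="yellow", bold=True, err=True)
    click.echo("normal output")


if __name__ == "__main__":
    demo()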
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from typing import Optional
 
 import click
@@ -14,15 +15,52 @@ logger = logging.getLogger(__name__)
 
 @click.group(cls=DefaultGroup, default="upsert")
 def datacontract() -> None:
-    """A group of commands to interact with the DataContract entity in DataHub."""
-    pass
+    """
+    A group of commands to interact with the DataContract entity in DataHub.
+
+    WARNING: This CLI is DEPRECATED and no longer supported.
+    Please migrate to alternative data contract solutions.
+    """
+    # Issue deprecation warning
+    warnings.warn(
+        "The datacontract CLI is deprecated and no longer supported. "
+        "Please migrate to alternative data contract solutions.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
+    # Log deprecation message for runtime visibility
+    logger.warning(
+        "DEPRECATED: The datacontract CLI is no longer supported and will be removed in a future version. "
+        "Please migrate to alternative data contract solutions."
+    )
+
+    # Display deprecation message to user
+    click.secho(
+        "⚠️ WARNING: This datacontract CLI is DEPRECATED and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+    click.secho("Please migrate to alternative data contract solutions.", fg="yellow")
 
 
 @datacontract.command()
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @upgrade.check_upgrade
 def upsert(file: str) -> None:
-    """Upsert (create or update) a Data Contract in DataHub."""
+    """
+    Upsert (create or update) a Data Contract in DataHub.
+
+    WARNING: This command is DEPRECATED and no longer supported.
+    """
+
+    click.secho(
+        "⚠️ WARNING: The 'upsert' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+
+    logger.warning("DEPRECATED: datacontract upsert command is no longer supported")
 
     data_contract: DataContract = DataContract.from_yaml(file)
     urn = data_contract.urn
@@ -59,7 +97,19 @@ def upsert(file: str) -> None:
 @click.option("--hard/--soft", required=False, is_flag=True, default=False)
 @upgrade.check_upgrade
 def delete(urn: Optional[str], file: Optional[str], hard: bool) -> None:
-    """Delete a Data Contract in DataHub. Defaults to a soft-delete. Use --hard to completely erase metadata."""
+    """
+    Delete a Data Contract in DataHub. Defaults to a soft-delete. Use --hard to completely erase metadata.
+
+    WARNING: This command is DEPRECATED and no longer supported.
+    """
+
+    click.secho(
+        "⚠️ WARNING: The 'delete' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+
+    logger.warning("DEPRECATED: datacontract delete command is no longer supported")
 
     if not urn:
         if not file:
@@ -95,7 +95,7 @@ TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
 TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
 TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
 
-# The limit is 16mb. We will use a max of 15mb to have some space
+# The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
 INGEST_MAX_PAYLOAD_BYTES = int(
@@ -586,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
             "systemMetadata": system_metadata_obj,
         }
         payload = json.dumps(snapshot)
+        if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
+            logger.warning(
+                f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                "so this metadata will likely fail to be emitted."
+            )
 
         self._emit_generic(url, payload)
 
@@ -764,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
 
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
+        if len(mcp_objs) == 0:
+            return 0
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
-        mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
+        mcp_obj_chunks: List[List[str]] = [[]]
+        current_chunk_size = 0
         for mcp_obj in mcp_objs:
+            mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
             mcp_obj_size = len(json.dumps(mcp_obj))
             if _DATAHUB_EMITTER_TRACE:
                 logger.debug(
-                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                    f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
+                )
+            if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
+                logger.warning(
+                    f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                    "so this metadata will likely fail to be emitted."
                 )
 
             if (
@@ -786,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
-        if len(mcp_obj_chunks) > 0:
+        if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
             logger.debug(
                 f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
             )
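The chunking change above starts the batch with one open, empty chunk and a zero size counter (the previous initialization leaned on the overflow branch to open the first chunk) and returns early for empty input. A self-contained sketch of the same greedy, size-based chunking idea; the constant and function names here are illustrative, not the emitter's actual API:

import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # illustrative cap, standing in for INGEST_MAX_PAYLOAD_BYTES


def chunk_by_payload_size(
    objs: List[Dict[str, Any]], max_bytes: int = MAX_PAYLOAD_BYTES
) -> List[List[Dict[str, Any]]]:
    if not objs:
        return []  # nothing to emit, mirroring the early return added above
    chunks: List[List[Dict[str, Any]]] = [[]]  # start with one open, empty chunk
    current_size = 0
    for obj in objs:
        size = len(json.dumps(obj))
        if size > max_bytes:
            # A single oversized object can never fit and will likely be rejected server-side.
            print(f"warning: object of size {size} exceeds {max_bytes} bytes")
        if current_size + size > max_bytes and chunks[-1]:
            chunks.append([])  # close the full chunk and open a new one
            current_size = 0
        chunks[-1].append(obj)
        current_size += size
    return chunks


print(chunk_by_payload_size([{"a": 1}, {"b": 2}], max_bytes=10))  # two chunks of one object each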
@@ -90,7 +90,7 @@ class EnsureAspectSizeProcessor:
         on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
         """
         for wu in stream:
-            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            # logger.debug(f"Ensuring size of workunit: {wu.id}")
 
             if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
@@ -186,11 +186,19 @@ class ExamplesReport(Report, Closeable):
     aspects: Dict[str, Dict[str, int]] = field(
         default_factory=lambda: defaultdict(lambda: defaultdict(int))
     )
+    # This counts existence of aspects for each entity/subtype
+    # This is used for the UI to calculate %age of entities with the aspect
     aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
         default_factory=lambda: defaultdict(
             lambda: defaultdict(lambda: defaultdict(int))
         )
     )
+    # This counts all aspects for each entity/subtype
+    aspects_by_subtypes_full_count: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
     samples: Dict[str, Dict[str, List[str]]] = field(
         default_factory=lambda: defaultdict(lambda: defaultdict(list))
     )
@@ -399,6 +407,9 @@ class ExamplesReport(Report, Closeable):
         entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
             defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
         )
+        entity_subtype_aspect_counts_exist: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
         for row in self._file_based_dict.sql_query(query):
             entity_type = row["entityType"]
             sub_type = row["subTypes"]
@@ -410,15 +421,23 @@ class ExamplesReport(Report, Closeable):
             entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
                 aspect_count * count
             )
+            entity_subtype_aspect_counts_exist[entity_type][sub_type][aspect] += (
+                count
+            )
 
         self.aspects.clear()
         self.aspects_by_subtypes.clear()
-        _aspects_seen: Set[str] = set()
+        self.aspects_by_subtypes_full_count.clear()
         for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
             for sub_type, aspect_counts in subtype_counts.items():
                 for aspect, count in aspect_counts.items():
                     self.aspects[entity_type][aspect] += count
-                    _aspects_seen.add(aspect)
+                self.aspects_by_subtypes_full_count[entity_type][sub_type] = dict(
+                    aspect_counts
+                )
+
+        for entity_type, subtype_counts in entity_subtype_aspect_counts_exist.items():
+            for sub_type, aspect_counts in subtype_counts.items():
                 self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
 
         self.samples.clear()
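The report now keeps two parallel counters per entity type and subtype: `aspects_by_subtypes` counts entities that have an aspect at all (used by the UI to compute the percentage of entities carrying it), while `aspects_by_subtypes_full_count` counts every emitted aspect. A simplified, self-contained sketch of the distinction (the bookkeeping is reduced, not the report's actual code):

from collections import defaultdict

aspects_by_subtypes = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
aspects_by_subtypes_full_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Suppose 3 dataset/Table entities each emitted the "schemaMetadata" aspect twice.
for _entity in range(3):
    aspects_by_subtypes["dataset"]["Table"]["schemaMetadata"] += 1         # entities having the aspect
    aspects_by_subtypes_full_count["dataset"]["Table"]["schemaMetadata"] += 2  # every emitted aspect

print(aspects_by_subtypes["dataset"]["Table"]["schemaMetadata"])             # 3
print(aspects_by_subtypes_full_count["dataset"]["Table"]["schemaMetadata"])  # 6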
@@ -81,11 +81,24 @@ class StructuredLogLevel(Enum):
     ERROR = logging.ERROR
 
 
+class StructuredLogCategory(Enum):
+    """
+    This is used to categorise the errors mainly based on the biggest impact area
+    This is to be used to help in self-serve understand the impact of any log entry
+    More enums to be added as logs are updated to be self-serve
+    """
+
+    LINEAGE = "LINEAGE"
+    USAGE = "USAGE"
+    PROFILING = "PROFILING"
+
+
 @dataclass
 class StructuredLogEntry(Report):
     title: Optional[str]
     message: str
     context: LossyList[str]
+    log_category: Optional[StructuredLogCategory] = None
 
 
 @dataclass
@@ -108,9 +121,10 @@ class StructuredLogs(Report):
         exc: Optional[BaseException] = None,
         log: bool = False,
         stacklevel: int = 1,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
         """
-        Report a user-facing warning for the ingestion run.
+        Report a user-facing log for the ingestion run.
 
         Args:
             level: The level of the log entry.
@@ -118,6 +132,9 @@ class StructuredLogs(Report):
             title: The category / heading to present on for this message in the UI.
             context: Additional context (e.g. where, how) for the log entry.
             exc: The exception associated with the event. We'll show the stack trace when in debug mode.
+            log_category: The type of the log entry. This is used to categorise the log entry.
+            log: Whether to log the entry to the console.
+            stacklevel: The stack level to use for the log entry.
         """
 
         # One for this method, and one for the containing report_* call.
@@ -160,6 +177,7 @@ class StructuredLogs(Report):
                 title=title,
                 message=message,
                 context=context_list,
+                log_category=log_category,
             )
         else:
             if context is not None:
@@ -219,9 +237,19 @@ class SourceReport(ExamplesReport):
         context: Optional[str] = None,
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN, message, title, context, exc, log=False
+            StructuredLogLevel.WARN,
+            message,
+            title,
+            context,
+            exc,
+            log=False,
+            log_category=log_category,
         )
 
     def warning(
@@ -231,9 +259,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN, message, title, context, exc, log=log
+            StructuredLogLevel.WARN,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     def report_failure(
@@ -243,9 +281,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.ERROR, message, title, context, exc, log=log
+            StructuredLogLevel.ERROR,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
        )
 
     def failure(
@@ -255,9 +303,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.ERROR, message, title, context, exc, log=log
+            StructuredLogLevel.ERROR,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     def info(
@@ -267,9 +325,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.INFO, message, title, context, exc, log=log
+            StructuredLogLevel.INFO,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
        )
 
     @contextlib.contextmanager
@@ -279,6 +347,7 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         context: Optional[str] = None,
         level: StructuredLogLevel = StructuredLogLevel.ERROR,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> Iterator[None]:
         # Convenience method that helps avoid boilerplate try/except blocks.
         # TODO: I'm not super happy with the naming here - it's not obvious that this
@@ -287,7 +356,12 @@ class SourceReport(ExamplesReport):
             yield
         except Exception as exc:
             self._structured_logs.report_log(
-                level, message=message, title=title, context=context, exc=exc
+                level,
+                message=message,
+                title=title,
+                context=context,
+                exc=exc,
+                log_category=log_category,
             )
 
     def __post_init__(self) -> None:
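A hedged usage sketch of the new `log_category` parameter threaded through the SourceReport methods above. It assumes both names are importable from datahub.ingestion.api.source (where these hunks place them); the message, title, and context values are invented:

from datahub.ingestion.api.source import SourceReport, StructuredLogCategory


def warn_unparseable_view(report: SourceReport, view_name: str) -> None:
    # Attributes the warning to lineage extraction so the UI can group it by impact area.
    report.warning(
        title="Failed to parse view definition",
        message="Could not extract lineage from the view SQL",
        context=view_name,
        log_category=StructuredLogCategory.LINEAGE,
    )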
@@ -1,9 +1,18 @@
 {
-  "generated_at": "2025-07-24T13:24:05.751563+00:00",
+  "generated_at": "2025-07-31T12:54:30.557618+00:00",
   "generated_by": "metadata-ingestion/scripts/capability_summary.py",
   "plugin_details": {
     "abs": {
       "capabilities": [
+        {
+          "capability": "CONTAINERS",
+          "description": "Extract ABS containers and folders",
+          "subtype_modifier": [
+            "Folder",
+            "ABS container"
+          ],
+          "supported": true
+        },
         {
           "capability": "DATA_PROFILING",
           "description": "Optionally enabled via configuration",
@@ -468,7 +477,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier": null,
+          "subtype_modifier": [
+            "Database"
+          ],
           "supported": true
         },
         {
@@ -531,13 +542,6 @@
       "platform_name": "File Based Lineage",
       "support_status": "CERTIFIED"
     },
-    "datahub-mock-data": {
-      "capabilities": [],
-      "classname": "datahub.ingestion.source.mock_data.datahub_mock_data.DataHubMockDataSource",
-      "platform_id": "datahubmockdata",
-      "platform_name": "DataHubMockData",
-      "support_status": "TESTING"
-    },
     "dbt": {
       "capabilities": [
         {
@@ -607,7 +611,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier": null,
+          "subtype_modifier": [
+            "Folder"
+          ],
           "supported": true
         },
         {
@@ -643,6 +649,14 @@
           "subtype_modifier": null,
           "supported": true
         },
+        {
+          "capability": "LINEAGE_FINE",
+          "description": "Extract column-level lineage",
+          "subtype_modifier": [
+            "Table"
+          ],
+          "supported": true
+        },
         {
           "capability": "DATA_PROFILING",
           "description": "Optionally enabled via configuration",
@@ -688,7 +702,9 @@
         {
           "capability": "LINEAGE_COARSE",
           "description": "Enabled by default",
-          "subtype_modifier": null,
+          "subtype_modifier": [
+            "Table"
+          ],
           "supported": true
         }
       ],
@@ -1229,8 +1245,7 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "Database",
-            "Schema"
+            "Catalog"
           ],
           "supported": true
         },
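These capability_summary.json hunks adjust the autogenerated capability matrix (new CONTAINERS and lineage entries, refreshed subtype modifiers, and removal of the datahub-mock-data entry). A hedged sketch of how the shipped JSON could be inspected, assuming it is importable package data under datahub/ingestion/autogenerated/ as the file list above suggests:

import json
from importlib.resources import files

# Load the capability summary bundled with the wheel and list plugins that
# advertise fine-grained (column-level) lineage support.
data = json.loads(
    files("datahub.ingestion.autogenerated").joinpath("capability_summary.json").read_text()
)
for plugin, details in data["plugin_details"].items():
    for cap in details.get("capabilities", []):
        if cap["capability"] == "LINEAGE_FINE" and cap["supported"]:
            print(plugin, cap.get("subtype_modifier"))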
@@ -2387,8 +2402,9 @@
         },
         {
           "capability": "LINEAGE_COARSE",
-          "description": "Enabled by default to get lineage for views via `include_view_lineage`",
+          "description": "Extract table-level lineage",
           "subtype_modifier": [
+            "Table",
             "View"
           ],
           "supported": true
@@ -2411,8 +2427,7 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "Database",
-            "Schema"
+            "Catalog"
           ],
           "supported": true
         },
@@ -2598,7 +2613,8 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "Database"
+            "Database",
+            "Schema"
           ],
           "supported": true
         },
@@ -2812,6 +2828,15 @@
           "description": "Enabled by default",
           "subtype_modifier": null,
           "supported": true
+        },
+        {
+          "capability": "LINEAGE_COARSE",
+          "description": "Extract table-level lineage for Salesforce objects",
+          "subtype_modifier": [
+            "Custom Object",
+            "Object"
+          ],
+          "supported": true
         }
       ],
       "classname": "datahub.ingestion.source.salesforce.SalesforceSource",
@@ -3207,7 +3232,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier": null,
+          "subtype_modifier": [
+            "Database"
+          ],
           "supported": true
         },
         {
@@ -3339,8 +3366,9 @@
         },
         {
           "capability": "LINEAGE_COARSE",
-          "description": "Enabled by default to get lineage for views via `include_view_lineage`",
+          "description": "Extract table-level lineage",
           "subtype_modifier": [
+            "Table",
             "View"
           ],
           "supported": true
@@ -76,7 +76,15 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
-from datahub.metadata.urns import CorpUserUrn, Urn
+from datahub.metadata.urns import (
+    CorpUserUrn,
+    MlFeatureTableUrn,
+    MlFeatureUrn,
+    MlModelGroupUrn,
+    MlModelUrn,
+    MlPrimaryKeyUrn,
+    Urn,
+)
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
@@ -118,8 +126,16 @@ def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""
 
     # Hard-coded special cases.
-    if entity_type == CorpUserUrn.ENTITY_TYPE:
-        return "CORP_USER"
+    special_cases = {
+        CorpUserUrn.ENTITY_TYPE: "CORP_USER",
+        MlModelUrn.ENTITY_TYPE: "MLMODEL",
+        MlModelGroupUrn.ENTITY_TYPE: "MLMODEL_GROUP",
+        MlFeatureTableUrn.ENTITY_TYPE: "MLFEATURE_TABLE",
+        MlFeatureUrn.ENTITY_TYPE: "MLFEATURE",
+        MlPrimaryKeyUrn.ENTITY_TYPE: "MLPRIMARY_KEY",
+    }
+    if entity_type in special_cases:
+        return special_cases[entity_type]
 
     # Convert camelCase to UPPER_UNDERSCORE.
     entity_type = (
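Why the special-case table in the hunk above matters: the ML entity types do not follow the camelCase-to-UPPER_UNDERSCORE convention, so a naive split of `mlModel` would yield `ML_MODEL` instead of the GraphQL enum value `MLMODEL`. An illustrative sketch; the dictionary keys are written as plain entity-type strings rather than read from the Urn classes' ENTITY_TYPE constants, and the fallback regex is an assumption, not the function's actual code:

import re

SPECIAL_CASES = {
    "corpuser": "CORP_USER",
    "mlModel": "MLMODEL",
    "mlModelGroup": "MLMODEL_GROUP",
    "mlFeatureTable": "MLFEATURE_TABLE",
    "mlFeature": "MLFEATURE",
    "mlPrimaryKey": "MLPRIMARY_KEY",
}


def entity_type_to_graphql(entity_type: str) -> str:
    if entity_type in SPECIAL_CASES:
        return SPECIAL_CASES[entity_type]
    # Fallback: split camelCase into UPPER_UNDERSCORE (illustrative regex).
    return re.sub(r"(?<!^)(?=[A-Z])", "_", entity_type).upper()


print(entity_type_to_graphql("mlModel"))              # MLMODEL (a naive split would give ML_MODEL)
print(entity_type_to_graphql("dataProcessInstance"))  # DATA_PROCESS_INSTANCE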
@@ -92,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
 @dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
     mode: Optional[RestSinkMode] = None
+    endpoint: Optional[RestSinkEndpoint] = None
     max_threads: Optional[int] = None
     gms_version: Optional[str] = None
     pending_requests: int = 0
@@ -142,6 +143,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
 
         self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
+        self.report.endpoint = self.config.endpoint
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
         logger.debug("Setting gms config")
@@ -151,7 +151,7 @@ class DataLakeSourceConfig(
             raise ValueError("platform must not be empty")
         return platform
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_profiling_pattern_is_passed_to_profiling(
         cls, values: Dict[str, Any]
     ) -> Dict[str, Any]:
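For context on the last hunk: with pydantic v1-style root validators, `skip_on_failure=True` skips the validator when individual field validation has already failed, so the `values` dict is guaranteed complete when it runs (pydantic v2's compatibility shim also requires the flag for post-validators). A minimal, illustrative sketch with invented model and field names:

from typing import Any, Dict, List

import pydantic


class ProfilingConfig(pydantic.BaseModel):
    platform: str
    profile_patterns: List[str] = []

    @pydantic.root_validator(skip_on_failure=True)
    def check_patterns(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # Safe to index into values: this only runs when all field validators passed.
        if not values["platform"]:
            raise ValueError("platform must not be empty")
        return values


print(ProfilingConfig(platform="s3"))  # platform='s3' profile_patterns=[]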