acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (87)
  1. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2532 -2530
  2. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
  3. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +9 -8
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/restricted_text.py +247 -0
  10. datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
  11. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  12. datahub/cli/delete_cli.py +4 -4
  13. datahub/cli/ingest_cli.py +9 -1
  14. datahub/emitter/mce_builder.py +3 -1
  15. datahub/emitter/response_helper.py +86 -1
  16. datahub/emitter/rest_emitter.py +1 -1
  17. datahub/ingestion/graph/client.py +3 -3
  18. datahub/ingestion/source/apply/datahub_apply.py +4 -4
  19. datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
  20. datahub/ingestion/source/data_lake_common/object_store.py +644 -0
  21. datahub/ingestion/source/datahub/config.py +11 -0
  22. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  23. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  24. datahub/ingestion/source/dbt/dbt_common.py +30 -11
  25. datahub/ingestion/source/gcs/gcs_source.py +22 -7
  26. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  27. datahub/ingestion/source/hex/query_fetcher.py +9 -3
  28. datahub/ingestion/source/openapi.py +12 -0
  29. datahub/ingestion/source/openapi_parser.py +56 -37
  30. datahub/ingestion/source/s3/source.py +65 -6
  31. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
  33. datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  35. datahub/ingestion/source/sql/athena.py +1 -0
  36. datahub/ingestion/source/sql/hive.py +2 -3
  37. datahub/ingestion/source/sql/sql_common.py +98 -34
  38. datahub/ingestion/source/sql/sql_types.py +5 -2
  39. datahub/ingestion/source/unity/config.py +5 -0
  40. datahub/ingestion/source/unity/proxy.py +117 -0
  41. datahub/ingestion/source/unity/source.py +167 -15
  42. datahub/ingestion/source/unity/tag_entities.py +295 -0
  43. datahub/metadata/_internal_schema_classes.py +667 -522
  44. datahub/metadata/_urns/urn_defs.py +1804 -1748
  45. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  46. datahub/metadata/schema.avsc +17358 -17584
  47. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  48. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  49. datahub/metadata/schemas/Applications.avsc +38 -0
  50. datahub/metadata/schemas/ChartKey.avsc +1 -0
  51. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  52. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  53. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  54. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  55. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  56. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  57. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  58. datahub/metadata/schemas/DatasetKey.avsc +1 -0
  59. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  60. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  61. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  62. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  63. datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
  64. datahub/metadata/schemas/MLModelKey.avsc +1 -0
  65. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  66. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  67. datahub/metadata/schemas/__init__.py +3 -3
  68. datahub/sdk/__init__.py +6 -0
  69. datahub/sdk/_all_entities.py +11 -0
  70. datahub/sdk/_shared.py +118 -1
  71. datahub/sdk/chart.py +315 -0
  72. datahub/sdk/container.py +7 -0
  73. datahub/sdk/dashboard.py +432 -0
  74. datahub/sdk/dataflow.py +309 -0
  75. datahub/sdk/datajob.py +342 -0
  76. datahub/sdk/dataset.py +8 -2
  77. datahub/sdk/entity_client.py +90 -2
  78. datahub/sdk/lineage_client.py +681 -82
  79. datahub/sdk/main_client.py +27 -8
  80. datahub/sdk/mlmodel.py +101 -38
  81. datahub/sdk/mlmodelgroup.py +7 -0
  82. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  83. datahub/testing/mce_helpers.py +421 -0
  84. datahub/testing/sdk_v2_helpers.py +18 -0
  85. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
datahub/api/entities/external/unity_catalog_external_entites.py ADDED
@@ -0,0 +1,170 @@
+ # Import RestrictedText from your existing module
+ # Uncomment and adjust the import path as needed:
+ # from your_restricted_text_module import RestrictedText
+ # The following is a list of tag constraints:
+ # You can assign a maximum of 50 tags to a single securable object.
+ # The maximum length of a tag key is 255 characters.
+ # The maximum length of a tag value is 1000 characters.
+ # The following characters are not allowed in tag keys:
+ # . , - = / :
+ # Tag search using the workspace search UI is supported only for tables, views, and table columns.
+ # Tag search requires exact term matching.
+ # https://learn.microsoft.com/en-us/azure/databricks/database-objects/tags#constraint
+ from typing import Any, Dict, Optional, Set, Union
+
+ from typing_extensions import ClassVar
+
+ from datahub.api.entities.external.external_tag import ExternalTag
+ from datahub.api.entities.external.restricted_text import RestrictedText
+
+
+ class UnityCatalogTagKeyText(RestrictedText):
+     """RestrictedText configured for Unity Catalog tag keys."""
+
+     _default_max_length: ClassVar[int] = 255
+     # Characters that Unity Catalog disallows in tag keys (see constraints above)
+     _default_replace_chars: ClassVar[Set[str]] = {
+         "\t",
+         "\n",
+         "\r",
+         ".",
+         ",",
+         "-",
+         "=",
+         "/",
+         ":",
+     }
+     _default_replacement_char: ClassVar[str] = "_"
+     _default_truncation_suffix: ClassVar[str] = ""  # No suffix for clean identifiers
+
+
+ class UnityCatalogTagValueText(RestrictedText):
+     """RestrictedText configured for Unity Catalog tag values."""
+
+     _default_max_length: ClassVar[int] = 1000
+     # Unity Catalog tag values are more permissive but still have some restrictions
+     _default_replace_chars: ClassVar[Set[str]] = {"\t", "\n", "\r"}
+     _default_replacement_char: ClassVar[str] = " "
+     _default_truncation_suffix: ClassVar[str] = "..."
+
+
+ class UnityCatalogTag(ExternalTag):
+     """
+     A tag type specifically designed for Unity Catalog tag restrictions.
+
+     Unity Catalog tag restrictions:
+     - Key: max 255 characters; the characters . , - = / : are not allowed
+     - Value: max 1000 characters; more permissive, but no control characters
+     """
+
+     key: UnityCatalogTagKeyText
+     value: Optional[UnityCatalogTagValueText] = None
+
+     def __init__(
+         self,
+         key: Optional[Union[str, UnityCatalogTagKeyText]] = None,
+         value: Optional[Union[str, UnityCatalogTagValueText]] = None,
+         **data: Any,
+     ) -> None:
+         """
+         Initialize UnityCatalogTag from either a DataHub Tag URN or explicit key/value.
+
+         Args:
+             key: Explicit key value (optional for Pydantic initialization)
+             value: Explicit value (optional)
+             **data: Additional Pydantic data
+         """
+         if key is not None:
+             # Direct initialization with key/value
+             processed_key = (
+                 UnityCatalogTagKeyText(key)
+                 if not isinstance(key, UnityCatalogTagKeyText)
+                 else key
+             )
+             processed_value = None
+             if value is not None:
+                 processed_value = (
+                     UnityCatalogTagValueText(value)
+                     if not isinstance(value, UnityCatalogTagValueText)
+                     else value
+                 )
+
+             super().__init__(
+                 key=processed_key,
+                 value=processed_value,
+                 **data,
+             )
+         else:
+             # Standard pydantic initialization
+             super().__init__(**data)
+
+     def __eq__(self, other: object) -> bool:
+         """Check equality based on key and value."""
+         if not isinstance(other, UnityCatalogTag):
+             return False
+         return str(self.key) == str(other.key) and (
+             str(self.value) if self.value else None
+         ) == (str(other.value) if other.value else None)
+
+     def __hash__(self) -> int:
+         """Make UnityCatalogTag hashable based on key and value."""
+         return hash((str(self.key), str(self.value) if self.value else None))
+
+     @classmethod
+     def from_dict(cls, tag_dict: Dict[str, Any]) -> "UnityCatalogTag":
+         """
+         Create a UnityCatalogTag from a dictionary with 'key' and optional 'value'.
+
+         Args:
+             tag_dict: Dictionary with 'key' and optional 'value' keys
+
+         Returns:
+             UnityCatalogTag instance
+         """
+         return cls(key=tag_dict["key"], value=tag_dict.get("value"))
+
+     @classmethod
+     def from_key_value(cls, key: str, value: Optional[str] = None) -> "UnityCatalogTag":
+         """
+         Create a UnityCatalogTag from explicit key and value.
+
+         Overrides the parent method to return the correct type.
+
+         Args:
+             key: Tag key
+             value: Optional tag value
+
+         Returns:
+             UnityCatalogTag instance
+         """
+         return cls(key=key, value=value)
+
+     def to_dict(self) -> Dict[str, str]:
+         """
+         Convert to dictionary format suitable for Unity Catalog API.
+
+         Returns:
+             Dictionary with 'key' and optionally 'value'
+         """
+         result: Dict[str, str] = {"key": self.key.original}
+         if self.value is not None:
+             result["value"] = self.value.original
+         return result
+
+     def to_display_dict(self) -> Dict[str, str]:
+         """
+         Convert to dictionary format showing processed values.
+
+         Returns:
+             Dictionary with processed 'key' and optional 'value'
+         """
+         result: Dict[str, str] = {"key": str(self.key)}
+         if self.value is not None:
+             result["value"] = str(self.value)
+         return result
+
+     def __repr__(self) -> str:
+         if self.value:
+             return f"UnityCatalogTag(key={self.key!r}, value={self.value!r})"
+         else:
+             return f"UnityCatalogTag(key={self.key!r})"
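Taken together, the new classes turn arbitrary strings into Databricks-legal tag keys and values. A minimal usage sketch (the inputs are hypothetical, and the sanitized outputs assume RestrictedText substitutes and truncates exactly as configured above):

    from datahub.api.entities.external.unity_catalog_external_entites import (
        UnityCatalogTag,
    )

    # "-" and ":" are disallowed in tag keys, so they should be replaced with "_";
    # the tab in the value should become a space.
    tag = UnityCatalogTag(key="cost-center:finance", value="EMEA\tregion")
    print(tag.to_dict())          # raw inputs for the Unity Catalog API, via .original
    print(tag.to_display_dict())  # e.g. {'key': 'cost_center_finance', 'value': 'EMEA region'}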
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
  from typing import Iterable, List, Optional, Union
 
  import yaml
- from pydantic import validator
+ from pydantic import StrictStr, validator
  from ruamel.yaml import YAML
 
  from datahub.configuration.common import ConfigModel
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):
 
 
  class AllowedValue(ConfigModel):
-     value: Union[int, float, str]
+     value: Union[StrictStr, float]
      description: Optional[str] = None
 
 
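The switch from Union[int, float, str] to Union[StrictStr, float] is behavioral, not cosmetic: pydantic v1 (which ConfigModel builds on) tries union members left to right with coercion, so a numeric-looking string such as "123" used to be swallowed by the int branch. A small sketch of the intended effect, assuming plain pydantic v1 semantics:

    from typing import Union
    from pydantic import BaseModel, StrictStr

    class OldAllowedValue(BaseModel):
        value: Union[int, float, str]

    class NewAllowedValue(BaseModel):
        value: Union[StrictStr, float]

    print(OldAllowedValue(value="123").value)  # 123 (an int!) because the string was coerced
    print(NewAllowedValue(value="123").value)  # '123' since StrictStr matches first and preserves it
    print(NewAllowedValue(value=42).value)     # 42.0 as non-strings fall through to the float branch

One side effect worth noting: integer allowed values are now represented as floats.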
datahub/cli/delete_cli.py CHANGED
@@ -231,7 +231,7 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
      default=3000,
      type=int,
      help="Batch size when querying for entities to un-soft delete."
-     "Maximum 10000. Large batch sizes may cause timeouts.",
+     "Maximum 5000. Large batch sizes may cause timeouts.",
  )
  def undo_by_filter(
      urn: Optional[str], platform: Optional[str], batch_size: int
@@ -336,7 +336,7 @@ def undo_by_filter(
      default=3000,
      type=int,
      help="Batch size when querying for entities to delete."
-     "Maximum 10000. Large batch sizes may cause timeouts.",
+     "Maximum 5000. Large batch sizes may cause timeouts.",
  )
  @click.option(
      "-n",
@@ -654,8 +654,8 @@ def _validate_user_aspect_flags(
  def _validate_batch_size(batch_size: int) -> None:
      if batch_size <= 0:
          raise click.UsageError("Batch size must be a positive integer.")
-     elif batch_size > 10000:
-         raise click.UsageError("Batch size cannot exceed 10,000.")
+     elif batch_size > 5000:
+         raise click.UsageError("Batch size cannot exceed 5,000.")
 
 
  def _delete_one_urn(
datahub/cli/ingest_cli.py CHANGED
@@ -388,7 +388,10 @@ def mcps(path: str) -> None:
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
  def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
-     """List ingestion source runs with their details, optionally filtered by URN or source."""
+     """
+     List ingestion source runs with their details, optionally filtered by URN or source.
+     Requires the Manage Metadata Ingestion permission.
+     """
 
      query = """
      query listIngestionRuns($input: ListIngestionSourcesInput!) {
@@ -446,6 +449,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
      if not data:
          click.echo("No response received from the server.")
          return
+     if "errors" in data:
+         click.echo("Errors in response:")
+         for error in data["errors"]:
+             click.echo(f"- {error.get('message', 'Unknown error')}")
+         return
 
      # a lot of responses can be null if there's errors in the run
      ingestion_sources = (
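For context, the new branch reports the standard GraphQL error envelope instead of crashing on a missing data key. A sketch of a payload it would handle (the message is hypothetical):

    data = {
        "errors": [
            {"message": "Unauthorized to perform this action."},
        ]
    }
    # list_source_runs now echoes each message and returns early:
    #   Errors in response:
    #   - Unauthorized to perform this action.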
datahub/emitter/mce_builder.py CHANGED
@@ -59,6 +59,7 @@ from datahub.metadata.urns import (
      DataJobUrn,
      DataPlatformUrn,
      DatasetUrn,
+     OwnershipTypeUrn,
      TagUrn,
  )
  from datahub.utilities.urn_encoder import UrnEncoder
@@ -406,7 +407,8 @@ def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
 
  def validate_ownership_type(ownership_type: str) -> Tuple[str, Optional[str]]:
      if ownership_type.startswith("urn:li:"):
-         return OwnershipTypeClass.CUSTOM, ownership_type
+         ownership_type_urn = OwnershipTypeUrn.from_string(ownership_type)
+         return OwnershipTypeClass.CUSTOM, ownership_type_urn.urn()
      ownership_type = ownership_type.upper()
      if ownership_type in get_enum_options(OwnershipTypeClass):
          return ownership_type, None
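The net effect is that custom ownership-type URNs are now parsed and re-serialized through OwnershipTypeUrn, so malformed URNs fail fast instead of being passed through verbatim. A minimal sketch (the URN below is a hypothetical example):

    from datahub.emitter.mce_builder import validate_ownership_type

    # A well-formed custom ownership-type URN is returned as CUSTOM plus the URN itself.
    print(validate_ownership_type("urn:li:ownershipType:architect"))
    # ('CUSTOM', 'urn:li:ownershipType:architect')

    # A plain string is upper-cased and matched against OwnershipTypeClass options.
    print(validate_ownership_type("technical_owner"))
    # ('TECHNICAL_OWNER', None)

    # A string starting with "urn:li:" that is not a valid ownership-type URN now
    # raises from OwnershipTypeUrn.from_string instead of slipping through.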
datahub/emitter/response_helper.py CHANGED
@@ -1,7 +1,9 @@
  import json
  import logging
+ import re
  import warnings
  from dataclasses import dataclass
+ from datetime import datetime, timezone
  from typing import Dict, List, Optional, Sequence, Union
 
  from requests import Response
@@ -22,12 +24,95 @@ class TraceData:
      trace_id: str
      data: Dict[str, List[str]]
 
+     @staticmethod
+     def extract_trace_id(input_str: Optional[str]) -> Optional[str]:
+         """
+         Extract the trace ID from various input formats.
+
+         Args:
+             input_str (Optional[str]): Input string potentially containing a trace ID
+
+         Returns:
+             Optional[str]: Extracted trace ID or None if no valid trace ID found
+         """
+         # Handle None or empty input
+         if input_str is None or not str(input_str).strip():
+             return None
+
+         # Convert to string and clean
+         input_str = str(input_str).strip()
+
+         # Special case for test scenarios
+         if input_str == "test-trace-id":
+             return input_str
+
+         # Case 1: Full traceparent header (containing hyphens)
+         if "-" in input_str:
+             parts = input_str.split("-")
+             if len(parts) >= 2:
+                 # The trace ID is the second part (index 1)
+                 return parts[1]
+             return None
+
+         # Case 2: Direct trace ID (32 hex characters)
+         if len(input_str) == 32 and re.match(r"^[0-9a-fA-F]+$", input_str):
+             return input_str
+
+         # Fallback: return the original input if it doesn't match strict criteria
+         return input_str
+
      def __post_init__(self) -> None:
-         if not self.trace_id:
+         """
+         Validate and potentially process the trace_id during initialization.
+         """
+         # Explicitly check for None or empty string
+         if self.trace_id is None or self.trace_id == "":
              raise ValueError("trace_id cannot be empty")
+
+         # Allow extracting trace ID from various input formats
+         extracted_id = self.extract_trace_id(self.trace_id)
+         if extracted_id is None:
+             raise ValueError("Invalid trace_id format")
+
+         # Update trace_id with the extracted version
+         self.trace_id = extracted_id
+
+         # Validate data
          if not isinstance(self.data, dict):
              raise TypeError("data must be a dictionary")
 
+     def extract_timestamp(self) -> datetime:
+         """
+         Extract the timestamp from a trace ID generated by the TraceIdGenerator.
+
+         Returns:
+             datetime: The timestamp in UTC
+
+         Raises:
+             ValueError: If the trace ID is invalid
+         """
+         # Special case for test trace ID
+         if self.trace_id == "test-trace-id":
+             return datetime.fromtimestamp(0, tz=timezone.utc)
+
+         # Validate trace ID length for hex-based trace IDs
+         if len(self.trace_id) < 16 or not re.match(
+             r"^[0-9a-fA-F]+$", self.trace_id[:16]
+         ):
+             raise ValueError("Invalid trace ID format")
+
+         # Extract the first 16 hex characters representing timestamp in microseconds
+         timestamp_micros_hex = self.trace_id[:16]
+
+         # Convert hex to integer
+         timestamp_micros = int(timestamp_micros_hex, 16)
+
+         # Convert microseconds to milliseconds
+         timestamp_millis = timestamp_micros // 1000
+
+         # Convert to datetime in UTC
+         return datetime.fromtimestamp(timestamp_millis / 1000, tz=timezone.utc)
+
 
  def _extract_trace_id(response: Response) -> Optional[str]:
      """
datahub/emitter/rest_emitter.py CHANGED
@@ -852,7 +852,7 @@ class DataHubRestEmitter(Closeable, Emitter):
          for aspect_name, aspect_status in aspects.items():
              if not aspect_status["success"]:
                  error_msg = (
-                     f"Unable to validate async write to DataHub GMS: "
+                     f"Unable to validate async write {trace.trace_id} ({trace.extract_timestamp()}) to DataHub GMS: "
                      f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
                      f"Status: {aspect_status}"
                  )
datahub/ingestion/graph/client.py CHANGED
@@ -806,7 +806,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
              "input": search_query,
              "entity": "container",
              "start": 0,
-             "count": 10000,
+             "count": 5000,
              "filter": {"or": container_filters},
          }
          results: Dict = self._post_generic(url, search_body)
@@ -901,7 +901,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
          query: Optional[str] = None,
          container: Optional[str] = None,
          status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
-         batch_size: int = 10000,
+         batch_size: int = 5000,
          extraFilters: Optional[List[RawSearchFilterRule]] = None,
          extra_or_filters: Optional[RawSearchFilter] = None,
      ) -> Iterable[str]:
@@ -993,7 +993,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
          query: Optional[str] = None,
          container: Optional[str] = None,
          status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
-         batch_size: int = 10000,
+         batch_size: int = 5000,
          extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
          extra_or_filters: Optional[RawSearchFilter] = None,
          extra_source_fields: Optional[List[str]] = None,
datahub/ingestion/source/apply/datahub_apply.py CHANGED
@@ -96,7 +96,7 @@ def apply_association_to_container(
  class DomainApplyConfig(ConfigModel):
      assets: List[str] = Field(
          default_factory=list,
-         description="List of assets to apply domain hierarchichaly. Currently only containers and datasets are supported",
+         description="List of assets to apply domain hierarchically. Currently only containers and datasets are supported",
      )
      domain_urn: str = Field(default="")
 
@@ -104,7 +104,7 @@ class DomainApplyConfig(ConfigModel):
  class TagApplyConfig(ConfigModel):
      assets: List[str] = Field(
          default_factory=list,
-         description="List of assets to apply tag hierarchichaly. Currently only containers and datasets are supported",
+         description="List of assets to apply tag hierarchically. Currently only containers and datasets are supported",
      )
      tag_urn: str = Field(default="")
 
@@ -112,7 +112,7 @@ class TagApplyConfig(ConfigModel):
  class TermApplyConfig(ConfigModel):
      assets: List[str] = Field(
          default_factory=list,
-         description="List of assets to apply term hierarchichaly. Currently only containers and datasets are supported",
+         description="List of assets to apply term hierarchically. Currently only containers and datasets are supported",
      )
      term_urn: str = Field(default="")
 
@@ -120,7 +120,7 @@ class TermApplyConfig(ConfigModel):
  class OwnerApplyConfig(ConfigModel):
      assets: List[str] = Field(
          default_factory=list,
-         description="List of assets to apply owner hierarchichaly. Currently only containers and datasets are supported",
+         description="List of assets to apply owner hierarchically. Currently only containers and datasets are supported",
      )
      owner_urn: str = Field(default="")
 
datahub/ingestion/source/data_lake_common/data_lake_utils.py CHANGED
@@ -11,20 +11,21 @@ from datahub.emitter.mcp_builder import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_util import (
-     get_bucket_name,
      get_bucket_relative_path,
      get_s3_prefix,
      is_s3_uri,
  )
  from datahub.ingestion.source.azure.abs_utils import (
      get_abs_prefix,
-     get_container_name,
      get_container_relative_path,
      is_abs_uri,
  )
  from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+ from datahub.ingestion.source.data_lake_common.object_store import (
+     get_object_store_bucket_name,
+     get_object_store_for_uri,
+ )
  from datahub.ingestion.source.gcs.gcs_utils import (
-     get_gcs_bucket_name,
      get_gcs_prefix,
      is_gcs_uri,
  )
@@ -87,6 +88,13 @@ class ContainerWUCreator:
 
      @staticmethod
      def get_protocol(path: str) -> str:
+         object_store = get_object_store_for_uri(path)
+         if object_store:
+             prefix = object_store.get_prefix(path)
+             if prefix:
+                 return prefix
+
+         # Legacy fallback
          protocol: Optional[str] = None
          if is_s3_uri(path):
              protocol = get_s3_prefix(path)
@@ -104,13 +112,12 @@ class ContainerWUCreator:
 
      @staticmethod
      def get_bucket_name(path: str) -> str:
-         if is_s3_uri(path):
-             return get_bucket_name(path)
-         elif is_gcs_uri(path):
-             return get_gcs_bucket_name(path)
-         elif is_abs_uri(path):
-             return get_container_name(path)
-         raise ValueError(f"Unable to get bucket name from path: {path}")
+         """
+         Get the bucket/container name from any supported object store URI.
+
+         Delegates to the abstract get_object_store_bucket_name function.
+         """
+         return get_object_store_bucket_name(path)
 
      def get_sub_types(self) -> str:
          if self.platform == PLATFORM_S3:
@@ -122,6 +129,11 @@ class ContainerWUCreator:
          raise ValueError(f"Unable to sub type for platform: {self.platform}")
 
      def get_base_full_path(self, path: str) -> str:
+         object_store = get_object_store_for_uri(path)
+         if object_store:
+             return object_store.get_object_key(path)
+
+         # Legacy fallback
          if self.platform == "s3" or self.platform == "gcs":
              return get_bucket_relative_path(path)
          elif self.platform == "abs":
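A sketch of how the refactored helpers are meant to behave, assuming the new object_store.py registers handlers for the S3, GCS, and Azure Blob schemes as the imports suggest (the URIs and expected outputs are illustrative):

    from datahub.ingestion.source.data_lake_common.data_lake_utils import (
        ContainerWUCreator,
    )

    # All supported schemes now route through the object-store registry first,
    # with the old per-platform if/elif chains kept only as a legacy fallback.
    print(ContainerWUCreator.get_bucket_name("s3://my-bucket/dir/file.parquet"))  # my-bucket
    print(ContainerWUCreator.get_bucket_name("gs://my-bucket/dir/file.parquet"))  # my-bucket
    print(ContainerWUCreator.get_protocol("s3://my-bucket/dir/file.parquet"))     # s3://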