acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2532 -2530
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
datahub/api/entities/external/unity_catalog_external_entites.py
ADDED
@@ -0,0 +1,170 @@
+# Import RestrictedText from your existing module
+# Uncomment and adjust the import path as needed:
+# from your_restricted_text_module import RestrictedText
+# The following is a list of tag constraints:
+# You can assign a maximum of 50 tags to a single securable object.
+# The maximum length of a tag key is 255 characters.
+# The maximum length of a tag value is 1000 characters.
+# The following characters are not allowed in tag keys:
+# . , - = / :
+# Tag search using the workspace search UI is supported only for tables, views, and table columns.
+# Tag search requires exact term matching.
+# https://learn.microsoft.com/en-us/azure/databricks/database-objects/tags#constraint
+from typing import Any, Dict, Optional, Set, Union
+
+from typing_extensions import ClassVar
+
+from datahub.api.entities.external.external_tag import ExternalTag
+from datahub.api.entities.external.restricted_text import RestrictedText
+
+
+class UnityCatalogTagKeyText(RestrictedText):
+    """RestrictedText configured for Unity Catalog tag keys."""
+
+    _default_max_length: ClassVar[int] = 255
+    # Unity Catalog tag keys: alphanumeric, hyphens, underscores, periods only
+    _default_replace_chars: ClassVar[Set[str]] = {
+        "\t",
+        "\n",
+        "\r",
+        ".",
+        ",",
+        "-",
+        "=",
+        "/",
+        ":",
+    }
+    _default_replacement_char: ClassVar[str] = "_"
+    _default_truncation_suffix: ClassVar[str] = ""  # No suffix for clean identifiers
+
+
+class UnityCatalogTagValueText(RestrictedText):
+    """RestrictedText configured for Unity Catalog tag values."""
+
+    _default_max_length: ClassVar[int] = 1000
+    # Unity Catalog tag values are more permissive but still have some restrictions
+    _default_replace_chars: ClassVar[Set[str]] = {"\t", "\n", "\r"}
+    _default_replacement_char: ClassVar[str] = " "
+    _default_truncation_suffix: ClassVar[str] = "..."
+
+
+class UnityCatalogTag(ExternalTag):
+    """
+    A tag type specifically designed for Unity Catalog tag restrictions.
+
+    Unity Catalog Tag Restrictions:
+    - Key: Max 127 characters, alphanumeric + hyphens, underscores, periods only
+    - Value: Max 256 characters, more permissive but no control characters
+    """
+
+    key: UnityCatalogTagKeyText
+    value: Optional[UnityCatalogTagValueText] = None
+
+    def __init__(
+        self,
+        key: Optional[Union[str, UnityCatalogTagKeyText]] = None,
+        value: Optional[Union[str, UnityCatalogTagValueText]] = None,
+        **data: Any,
+    ) -> None:
+        """
+        Initialize UnityCatalogTag from either a DataHub Tag URN or explicit key/value.
+
+        Args:
+            key: Explicit key value (optional for Pydantic initialization)
+            value: Explicit value (optional)
+            **data: Additional Pydantic data
+        """
+        if key is not None:
+            # Direct initialization with key/value
+            processed_key = (
+                UnityCatalogTagKeyText(key)
+                if not isinstance(key, UnityCatalogTagKeyText)
+                else key
+            )
+            processed_value = None
+            if value is not None:
+                processed_value = (
+                    UnityCatalogTagValueText(value)
+                    if not isinstance(value, UnityCatalogTagValueText)
+                    else value
+                )
+
+            super().__init__(
+                key=processed_key,
+                value=processed_value,
+                **data,
+            )
+        else:
+            # Standard pydantic initialization
+            super().__init__(**data)
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality based on key and value."""
+        if not isinstance(other, UnityCatalogTag):
+            return False
+        return str(self.key) == str(other.key) and (
+            str(self.value) if self.value else None
+        ) == (str(other.value) if other.value else None)
+
+    def __hash__(self) -> int:
+        """Make UnityCatalogTag hashable based on key and value."""
+        return hash((str(self.key), str(self.value) if self.value else None))
+
+    @classmethod
+    def from_dict(cls, tag_dict: Dict[str, Any]) -> "UnityCatalogTag":
+        """
+        Create a UnityCatalogTag from a dictionary with 'key' and optional 'value'.
+
+        Args:
+            tag_dict: Dictionary with 'key' and optional 'value' keys
+
+        Returns:
+            UnityCatalogTag instance
+        """
+        return cls(key=tag_dict["key"], value=tag_dict.get("value"))
+
+    @classmethod
+    def from_key_value(cls, key: str, value: Optional[str] = None) -> "UnityCatalogTag":
+        """
+        Create a UnityCatalogTag from explicit key and value.
+
+        Overrides the parent method to return the correct type.
+
+        Args:
+            key: Tag key
+            value: Optional tag value
+
+        Returns:
+            UnityCatalogTag instance
+        """
+        return cls(key=key, value=value)
+
+    def to_dict(self) -> Dict[str, str]:
+        """
+        Convert to dictionary format suitable for Unity Catalog API.
+
+        Returns:
+            Dictionary with 'key' and optionally 'value'
+        """
+        result: Dict[str, str] = {"key": self.key.original}
+        if self.value is not None:
+            result["value"] = self.value.original
+        return result
+
+    def to_display_dict(self) -> Dict[str, str]:
+        """
+        Convert to dictionary format showing processed values.
+
+        Returns:
+            Dictionary with processed 'key' and optional 'value'
+        """
+        result: Dict[str, str] = {"key": str(self.key)}
+        if self.value is not None:
+            result["value"] = str(self.value)
+        return result
+
+    def __repr__(self) -> str:
+        if self.value:
+            return f"UnityCatalogTag(key={self.key!r}, value={self.value!r})"
+        else:
+            return f"UnityCatalogTag(key={self.key!r})"
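A minimal usage sketch (not part of the diff): the sanitized output shown in the comments is an assumption about how RestrictedText applies the replacement rules configured above.

# Hypothetical usage; import path taken from the new module above.
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag

tag = UnityCatalogTag(key="cost-center/finance", value="quarterly\nbudget")
print(tag.to_dict())          # original key/value, as sent to the Unity Catalog API
print(tag.to_display_dict())  # sanitized values, e.g. key -> "cost_center_finance"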
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union
 
 import yaml
-from pydantic import validator
+from pydantic import StrictStr, validator
 from ruamel.yaml import YAML
 
 from datahub.configuration.common import ConfigModel
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):
 
 
 class AllowedValue(ConfigModel):
-    value: Union[
+    value: Union[StrictStr, float]
     description: Optional[str] = None
 
 
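The move to StrictStr in AllowedValue matters because pydantic unions otherwise coerce values between types; a standalone sketch with plain pydantic (illustrative model name, not DataHub code) of the intended behaviour:

from typing import Optional, Union
from pydantic import BaseModel, StrictStr

class AllowedValueSketch(BaseModel):
    # Mirrors the changed field: strings are kept as-is, everything else must be a float.
    value: Union[StrictStr, float]
    description: Optional[str] = None

print(AllowedValueSketch(value="42").value)  # stays the string "42"
print(AllowedValueSketch(value=42).value)    # not a str, so it is validated as 42.0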
datahub/cli/delete_cli.py
CHANGED
@@ -231,7 +231,7 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
     default=3000,
     type=int,
     help="Batch size when querying for entities to un-soft delete."
-    "Maximum
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
 def undo_by_filter(
     urn: Optional[str], platform: Optional[str], batch_size: int
@@ -336,7 +336,7 @@ def undo_by_filter(
     default=3000,
     type=int,
     help="Batch size when querying for entities to delete."
-    "Maximum
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
 @click.option(
     "-n",
@@ -654,8 +654,8 @@ def _validate_user_aspect_flags(
 def _validate_batch_size(batch_size: int) -> None:
     if batch_size <= 0:
         raise click.UsageError("Batch size must be a positive integer.")
-    elif batch_size >
-        raise click.UsageError("Batch size cannot exceed
+    elif batch_size > 5000:
+        raise click.UsageError("Batch size cannot exceed 5,000.")
 
 
 def _delete_one_urn(
datahub/cli/ingest_cli.py
CHANGED
@@ -388,7 +388,10 @@ def mcps(path: str) -> None:
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
 def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
-    """
+    """
+    List ingestion source runs with their details, optionally filtered by URN or source.
+    Required the Manage Metadata Ingestion permission.
+    """
 
     query = """
     query listIngestionRuns($input: ListIngestionSourcesInput!) {
@@ -446,6 +449,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
     if not data:
         click.echo("No response received from the server.")
         return
+    if "errors" in data:
+        click.echo("Errors in response:")
+        for error in data["errors"]:
+            click.echo(f"- {error.get('message', 'Unknown error')}")
+        return
 
     # a lot of responses can be null if there's errors in the run
     ingestion_sources = (
datahub/emitter/mce_builder.py
CHANGED
@@ -59,6 +59,7 @@ from datahub.metadata.urns import (
     DataJobUrn,
     DataPlatformUrn,
     DatasetUrn,
+    OwnershipTypeUrn,
     TagUrn,
 )
 from datahub.utilities.urn_encoder import UrnEncoder
@@ -406,7 +407,8 @@ def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
 
 def validate_ownership_type(ownership_type: str) -> Tuple[str, Optional[str]]:
     if ownership_type.startswith("urn:li:"):
-
+        ownership_type_urn = OwnershipTypeUrn.from_string(ownership_type)
+        return OwnershipTypeClass.CUSTOM, ownership_type_urn.urn()
     ownership_type = ownership_type.upper()
     if ownership_type in get_enum_options(OwnershipTypeClass):
         return ownership_type, None
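A short sketch (not part of the diff) of what the new branch is expected to return; the example URN is an assumption about a valid ownershipType URN:

from datahub.emitter.mce_builder import validate_ownership_type

print(validate_ownership_type("TECHNICAL_OWNER"))
# expected: ("TECHNICAL_OWNER", None)
print(validate_ownership_type("urn:li:ownershipType:__system__technical_owner"))
# expected: ("CUSTOM", "urn:li:ownershipType:__system__technical_owner")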
datahub/emitter/response_helper.py
CHANGED
@@ -1,7 +1,9 @@
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
+from datetime import datetime, timezone
 from typing import Dict, List, Optional, Sequence, Union
 
 from requests import Response
@@ -22,12 +24,95 @@ class TraceData:
     trace_id: str
     data: Dict[str, List[str]]
 
+    @staticmethod
+    def extract_trace_id(input_str: Optional[str]) -> Optional[str]:
+        """
+        Extract the trace ID from various input formats.
+
+        Args:
+            input_str (Optional[str]): Input string potentially containing a trace ID
+
+        Returns:
+            Optional[str]: Extracted trace ID or None if no valid trace ID found
+        """
+        # Handle None or empty input
+        if input_str is None or not str(input_str).strip():
+            return None
+
+        # Convert to string and clean
+        input_str = str(input_str).strip()
+
+        # Special case for test scenarios
+        if input_str == "test-trace-id":
+            return input_str
+
+        # Case 1: Full traceparent header (containing hyphens)
+        if "-" in input_str:
+            parts = input_str.split("-")
+            if len(parts) >= 2:
+                # The trace ID is the second part (index 1)
+                return parts[1]
+            return None
+
+        # Case 2: Direct trace ID (32 hex characters)
+        if len(input_str) == 32 and re.match(r"^[0-9a-fA-F]+$", input_str):
+            return input_str
+
+        # Fallback: return the original input if it doesn't match strict criteria
+        return input_str
+
     def __post_init__(self) -> None:
-
+        """
+        Validate and potentially process the trace_id during initialization.
+        """
+        # Explicitly check for None or empty string
+        if self.trace_id is None or self.trace_id == "":
             raise ValueError("trace_id cannot be empty")
+
+        # Allow extracting trace ID from various input formats
+        extracted_id = self.extract_trace_id(self.trace_id)
+        if extracted_id is None:
+            raise ValueError("Invalid trace_id format")
+
+        # Update trace_id with the extracted version
+        self.trace_id = extracted_id
+
+        # Validate data
         if not isinstance(self.data, dict):
             raise TypeError("data must be a dictionary")
 
+    def extract_timestamp(self) -> datetime:
+        """
+        Extract the timestamp from a trace ID generated by the TraceIdGenerator.
+
+        Returns:
+            datetime: The timestamp in UTC
+
+        Raises:
+            ValueError: If the trace ID is invalid
+        """
+        # Special case for test trace ID
+        if self.trace_id == "test-trace-id":
+            return datetime.fromtimestamp(0, tz=timezone.utc)
+
+        # Validate trace ID length for hex-based trace IDs
+        if len(self.trace_id) < 16 or not re.match(
+            r"^[0-9a-fA-F]+$", self.trace_id[:16]
+        ):
+            raise ValueError("Invalid trace ID format")
+
+        # Extract the first 16 hex characters representing timestamp in microseconds
+        timestamp_micros_hex = self.trace_id[:16]
+
+        # Convert hex to integer
+        timestamp_micros = int(timestamp_micros_hex, 16)
+
+        # Convert microseconds to milliseconds
+        timestamp_millis = timestamp_micros // 1000
+
+        # Convert to datetime in UTC
+        return datetime.fromtimestamp(timestamp_millis / 1000, tz=timezone.utc)
+
 
 def _extract_trace_id(response: Response) -> Optional[str]:
     """
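The new extract_timestamp relies on the first 16 hex characters of a trace ID encoding microseconds since the epoch; a standalone sketch of that arithmetic with an illustrative, made-up trace ID:

from datetime import datetime, timezone

# Encode 2024-01-01T00:00:00 UTC in microseconds as the first 16 hex chars,
# then pad to a 32-character trace ID, mirroring TraceData.extract_timestamp.
micros = 1_704_067_200_000_000
trace_id = f"{micros:016x}" + "0" * 16

decoded_millis = int(trace_id[:16], 16) // 1000
print(datetime.fromtimestamp(decoded_millis / 1000, tz=timezone.utc))
# 2024-01-01 00:00:00+00:00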
datahub/emitter/rest_emitter.py
CHANGED
@@ -852,7 +852,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         for aspect_name, aspect_status in aspects.items():
             if not aspect_status["success"]:
                 error_msg = (
-                    f"Unable to validate async write to DataHub GMS: "
+                    f"Unable to validate async write {trace.trace_id} ({trace.extract_timestamp()}) to DataHub GMS: "
                     f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
                     f"Status: {aspect_status}"
                 )
datahub/ingestion/graph/client.py
CHANGED
@@ -806,7 +806,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "input": search_query,
             "entity": "container",
             "start": 0,
-            "count":
+            "count": 5000,
             "filter": {"or": container_filters},
         }
         results: Dict = self._post_generic(url, search_body)
@@ -901,7 +901,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         query: Optional[str] = None,
         container: Optional[str] = None,
         status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
-        batch_size: int =
+        batch_size: int = 5000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
     ) -> Iterable[str]:
@@ -993,7 +993,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         query: Optional[str] = None,
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
-        batch_size: int =
+        batch_size: int = 5000,
         extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
         extra_source_fields: Optional[List[str]] = None,
datahub/ingestion/source/apply/datahub_apply.py
CHANGED
@@ -96,7 +96,7 @@ def apply_association_to_container(
 class DomainApplyConfig(ConfigModel):
     assets: List[str] = Field(
         default_factory=list,
-        description="List of assets to apply domain
+        description="List of assets to apply domain hierarchically. Currently only containers and datasets are supported",
     )
     domain_urn: str = Field(default="")
 
@@ -104,7 +104,7 @@ class DomainApplyConfig(ConfigModel):
 class TagApplyConfig(ConfigModel):
     assets: List[str] = Field(
         default_factory=list,
-        description="List of assets to apply tag
+        description="List of assets to apply tag hierarchically. Currently only containers and datasets are supported",
     )
     tag_urn: str = Field(default="")
 
@@ -112,7 +112,7 @@ class TagApplyConfig(ConfigModel):
 class TermApplyConfig(ConfigModel):
     assets: List[str] = Field(
         default_factory=list,
-        description="List of assets to apply term
+        description="List of assets to apply term hierarchically. Currently only containers and datasets are supported",
     )
     term_urn: str = Field(default="")
 
@@ -120,7 +120,7 @@ class TermApplyConfig(ConfigModel):
 class OwnerApplyConfig(ConfigModel):
     assets: List[str] = Field(
         default_factory=list,
-        description="List of assets to apply owner
+        description="List of assets to apply owner hierarchically. Currently only containers and datasets are supported",
     )
     owner_urn: str = Field(default="")
 
datahub/ingestion/source/data_lake_common/data_lake_utils.py
CHANGED
@@ -11,20 +11,21 @@ from datahub.emitter.mcp_builder import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import (
-    get_bucket_name,
     get_bucket_relative_path,
     get_s3_prefix,
     is_s3_uri,
 )
 from datahub.ingestion.source.azure.abs_utils import (
     get_abs_prefix,
-    get_container_name,
     get_container_relative_path,
     is_abs_uri,
 )
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.data_lake_common.object_store import (
+    get_object_store_bucket_name,
+    get_object_store_for_uri,
+)
 from datahub.ingestion.source.gcs.gcs_utils import (
-    get_gcs_bucket_name,
     get_gcs_prefix,
     is_gcs_uri,
 )
@@ -87,6 +88,13 @@ class ContainerWUCreator:
 
     @staticmethod
     def get_protocol(path: str) -> str:
+        object_store = get_object_store_for_uri(path)
+        if object_store:
+            prefix = object_store.get_prefix(path)
+            if prefix:
+                return prefix
+
+        # Legacy fallback
         protocol: Optional[str] = None
         if is_s3_uri(path):
             protocol = get_s3_prefix(path)
@@ -104,13 +112,12 @@ class ContainerWUCreator:
 
     @staticmethod
     def get_bucket_name(path: str) -> str:
-
-
-
-
-
-
-        raise ValueError(f"Unable to get bucket name from path: {path}")
+        """
+        Get the bucket/container name from any supported object store URI.
+
+        Delegates to the abstract get_object_store_bucket_name function.
+        """
+        return get_object_store_bucket_name(path)
 
     def get_sub_types(self) -> str:
         if self.platform == PLATFORM_S3:
@@ -122,6 +129,11 @@ class ContainerWUCreator:
         raise ValueError(f"Unable to sub type for platform: {self.platform}")
 
     def get_base_full_path(self, path: str) -> str:
+        object_store = get_object_store_for_uri(path)
+        if object_store:
+            return object_store.get_object_key(path)
+
+        # Legacy fallback
         if self.platform == "s3" or self.platform == "gcs":
             return get_bucket_relative_path(path)
         elif self.platform == "abs":