acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2414 -2412
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Iterable, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import cachetools
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
11
|
+
PlatformResource,
|
|
12
|
+
PlatformResourceKey,
|
|
13
|
+
)
|
|
14
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
15
|
+
from datahub.metadata.urns import PlatformResourceUrn, Urn
|
|
16
|
+
from datahub.utilities.search_utils import ElasticDocumentQuery
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PlatformResourceRepository:
    """Thin caching wrapper for PlatformResource CRUD against a DataHubGraph."""

    def __init__(self, graph: DataHubGraph):
        self.graph = graph
        # TTL cache: at most 1000 entries, each expiring after 5 minutes.
        self.cache: cachetools.TTLCache = cachetools.TTLCache(maxsize=1000, ttl=60 * 5)

    def search_by_filter(
        self, filter: ElasticDocumentQuery, add_to_cache: bool = True
    ) -> Iterable[PlatformResource]:
        """Stream platform resources matching the filter, optionally caching each hit."""
        for resource in PlatformResource.search_by_filters(self.graph, filter):
            if add_to_cache:
                self.cache[resource.id] = resource
            yield resource

    def create(self, platform_resource: PlatformResource) -> None:
        """Persist the resource to DataHub and remember it in the cache."""
        platform_resource.to_datahub(self.graph)
        self.cache[platform_resource.id] = platform_resource

    def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
        """Cache-only lookup; returns None when the key is absent or expired."""
        return self.cache.get(key.id)

    def delete(self, key: PlatformResourceKey) -> None:
        """Hard-delete the resource from DataHub, then drop it from the cache.

        Raises KeyError if the key is not currently cached (same as the
        original `del` semantics).
        """
        self.graph.delete_entity(urn=PlatformResourceUrn(key.id).urn(), hard=True)
        del self.cache[key.id]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ExternalEntityId:
    """
    Unique identifier for an ExternalEntity.

    Concrete subclasses define how the identifier maps onto a
    PlatformResourceKey so the entity can be persisted in DataHub.
    """

    @abstractmethod
    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Convert this ExternalEntityId into a PlatformResourceKey."""
        pass
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CaseSensitivity(Enum):
    UPPER = "upper"
    LOWER = "lower"
    MIXED = "mixed"

    @staticmethod
    def detect_case_sensitivity(value: str) -> "CaseSensitivity":
        """Classify a single string as UPPER, LOWER, or MIXED.

        Strings with no cased characters (e.g. digits only) report neither
        isupper() nor islower() and therefore classify as MIXED.
        """
        if value.isupper():
            return CaseSensitivity.UPPER
        if value.islower():
            return CaseSensitivity.LOWER
        return CaseSensitivity.MIXED

    @staticmethod
    def detect_for_many(values: List[str]) -> "CaseSensitivity":
        """
        Detects the case sensitivity for a list of strings.
        Returns CaseSensitivity.MIXED if the case sensitivity is mixed
        (or if the list is empty).
        """
        if not values:
            return CaseSensitivity.MIXED

        # Collapse per-string classifications into a set: a uniform list
        # yields a single-element set.
        detected = {CaseSensitivity.detect_case_sensitivity(v) for v in values}
        if detected == {CaseSensitivity.UPPER}:
            return CaseSensitivity.UPPER
        if detected == {CaseSensitivity.LOWER}:
            return CaseSensitivity.LOWER
        return CaseSensitivity.MIXED
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class LinkedResourceSet(BaseModel):
    """
    A LinkedResourceSet is a set of DataHub URNs that are linked to an ExternalEntity.
    """

    urns: List[str]

    def _has_conflict(self, urn: Urn) -> bool:
        """
        Detects if the urn is safe to add into the set.

        This is used to detect conflicts between DataHub URNs that are linked
        to the same ExternalEntity, e.g. mixing URNs of different entity types
        (tags vs. terms) in the same set.

        Returns True if the urn is NOT safe to add into the set, else False.
        If the urn is already in the set we don't need to add it again, but
        that is not a conflict.
        """
        if urn.urn() in self.urns:
            return False

        # Detect the entity_type shared by the urns already in the set.
        detected_entity_type = None
        for existing_urn in self.urns:
            try:
                parsed_urn = Urn.from_string(existing_urn)
                entity_type = parsed_urn.entity_type
                if detected_entity_type is None:
                    detected_entity_type = entity_type
                elif detected_entity_type != entity_type:
                    # The existing set is already inconsistent; refuse to grow it.
                    logger.warning(
                        f"Detected entity_type {detected_entity_type} is not equals to {entity_type}"
                    )
                    return True
            except ValueError:
                # Not a valid URN
                logger.warning(f"Invalid URN {existing_urn} in LinkedResourceSet")
                return True

        # `urn` is already a parsed Urn instance, so no ValueError can occur
        # here (the previous version wrapped this check in an unreachable
        # try/except).
        if detected_entity_type is not None and urn.entity_type != detected_entity_type:
            logger.warning(
                f"Detected entity_type {detected_entity_type} is not equals to parsed_urn's entity_type: {urn.entity_type}"
            )
            return True
        return False

    def add(self, urn: Union[str, Urn]) -> bool:
        """
        Adds a URN to the set.
        Returns True if the URN was added, False if it was already in the set.
        Raises a ValueError if the URN is in conflict with the existing set.
        """
        # Deduplicate the URNs if we have somehow duplicate items from concurrent runs.
        # NOTE: this rebuilds the list through a set, so ordering is not preserved.
        self.urns = list(set(self.urns))
        if isinstance(urn, str):
            urn = Urn.from_string(urn)
        if self._has_conflict(urn):
            raise ValueError(f"Conflict detected when adding URN {urn} to the set")
        if urn.urn() not in self.urns:
            self.urns.append(urn.urn())
            return True
        return False
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class ExternalEntity:
    """
    Representation of an entity that is external to DataHub but may be
    linked to one or more DataHub entities.
    """

    @abstractmethod
    def is_managed_by_datahub(self) -> bool:
        """Returns whether the entity is managed by DataHub."""
        pass

    @abstractmethod
    def datahub_linked_resources(self) -> LinkedResourceSet:
        """
        Returns the URNs of the DataHub entities linked to the external entity.
        Empty list if no linked entities.
        """
        pass

    @abstractmethod
    def as_platform_resource(self) -> PlatformResource:
        """Converts the ExternalEntity to a PlatformResource."""
        pass

    @abstractmethod
    def get_id(self) -> ExternalEntityId:
        """Returns the ExternalEntityId for the ExternalEntity."""
        pass
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@dataclass
class MissingExternalEntity(ExternalEntity):
    """Placeholder for an ExternalEntity that could not be resolved."""

    id: ExternalEntityId

    def is_managed_by_datahub(self) -> bool:
        # A missing entity is never DataHub-managed.
        return False

    def datahub_linked_resources(self) -> LinkedResourceSet:
        # Nothing can be linked to an entity that does not exist.
        return LinkedResourceSet(urns=[])

    def as_platform_resource(self) -> Optional[PlatformResource]:  # type: ignore[override]
        # There is nothing to persist for a missing entity.
        return None

    def get_id(self) -> ExternalEntityId:
        return self.id
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class ExternalSystem:
    """Interface for resolving ExternalEntityIds against an external system."""

    @abstractmethod
    def exists(self, external_entity_id: ExternalEntityId) -> bool:
        """Returns whether the ExternalEntityId exists in the external system."""
        pass

    @abstractmethod
    def get(
        self,
        external_entity_id: ExternalEntityId,
        platform_resource_repository: PlatformResourceRepository,
    ) -> Optional[ExternalEntity]:
        """
        Returns the ExternalEntity for the ExternalEntityId.
        Uses the platform resource repository to enrich the ExternalEntity with DataHub URNs.
        """
        pass
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
External Tags Module
|
|
3
|
+
|
|
4
|
+
This module provides tag types that integrate with external systems like DataHub and Unity Catalog.
|
|
5
|
+
It builds on top of RestrictedText to provide sanitized, truncated tag handling with original value preservation.
|
|
6
|
+
|
|
7
|
+
Classes:
|
|
8
|
+
- ExternalTag: DataHub-compatible tag with key/value parsing from URNs
|
|
9
|
+
|
|
10
|
+
Example Usage:
|
|
11
|
+
# DataHub Tags
|
|
12
|
+
tag = ExternalTag.from_urn("urn:li:tag:environment:production")
|
|
13
|
+
    datahub_urn = tag.to_datahub_tag_urn()  # Returns a TagUrn object
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import Any, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
from pydantic import BaseModel
|
|
22
|
+
|
|
23
|
+
from datahub.api.entities.external.restricted_text import RestrictedText
|
|
24
|
+
from datahub.metadata.urns import TagUrn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExternalTag(BaseModel):
    """A tag type that parses DataHub Tag URNs into key-value pairs with RestrictedText properties."""

    key: RestrictedText
    value: Optional[RestrictedText] = None

    def __init__(
        self,
        key: Optional[Union[str, RestrictedText]] = None,
        value: Optional[Union[str, RestrictedText]] = None,
        **data: Any,
    ) -> None:
        """
        Initialize ExternalTag either from an explicit key/value pair or via
        standard pydantic keyword data.

        Args:
            key: Explicit key value (optional for Pydantic initialization)
            value: Explicit value (optional; only honored when key is given)
            **data: Additional Pydantic data
        """
        if key is None:
            # Standard pydantic initialization path (fields arrive in **data).
            super().__init__(**data)
            return

        def _as_restricted(v: Union[str, RestrictedText]) -> RestrictedText:
            # Coerce plain strings; pass RestrictedText instances through.
            return v if isinstance(v, RestrictedText) else RestrictedText(v)

        super().__init__(
            key=_as_restricted(key),
            value=None if value is None else _as_restricted(value),
            **data,
        )

    @staticmethod
    def _parse_tag_name(tag_name: str) -> Tuple[str, Optional[str]]:
        """
        Split a tag name into (key, value).

        A name containing ':' is split on the first ':' into key and value;
        otherwise the whole name is the key and the value is None.

        Args:
            tag_name: The tag name portion from the URN

        Returns:
            Tuple of (key, value) where value may be None
        """
        head, sep, tail = tag_name.partition(":")
        return (head, tail) if sep else (tag_name, None)

    def to_datahub_tag_urn(self) -> TagUrn:
        """
        Generate a DataHub Tag URN from the key and value, using the
        original (unprocessed) values.

        Returns:
            'urn:li:tag:key:value' if value exists, otherwise 'urn:li:tag:key'
        """
        if self.value is None:
            return TagUrn(name=self.key.original)
        return TagUrn(name=f"{self.key.original}:{self.value.original}")

    @classmethod
    def from_urn(cls, tag_urn: Union[str, "TagUrn"]) -> "ExternalTag":
        """
        Create an ExternalTag from a DataHub Tag URN.

        Args:
            tag_urn: DataHub Tag URN string or TagUrn object

        Returns:
            ExternalTag instance
        """
        parsed = TagUrn.from_string(tag_urn) if isinstance(tag_urn, str) else tag_urn
        key, value = cls._parse_tag_name(parsed.name)
        return cls(key=key, value=value)

    @classmethod
    def from_key_value(cls, key: str, value: Optional[str] = None) -> "ExternalTag":
        """
        Create an ExternalTag from an explicit key and optional value.

        Returns:
            ExternalTag instance
        """
        return cls(key=key, value=value)

    def __str__(self) -> str:
        """String representation of the tag (processed key/value)."""
        return str(self.key) if self.value is None else f"{self.key}:{self.value}"

    def __repr__(self) -> str:
        if self.value is None:
            return f"ExternalTag(key={self.key!r})"
        return f"ExternalTag(key={self.key!r}, value={self.value!r})"
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""The `RestrictedText` module provides a custom Pydantic type that stores the original
|
|
2
|
+
value but returns a truncated and sanitized version when accessed.
|
|
3
|
+
|
|
4
|
+
Features:
|
|
5
|
+
- Configurable maximum length with truncation
|
|
6
|
+
- Character replacement (default replaces with underscore)
|
|
7
|
+
- Preserves original value internally
|
|
8
|
+
- Customizable truncation suffix
|
|
9
|
+
- Compatible with both Pydantic v1 and v2
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import Any, ClassVar, Optional, Set, Union
|
|
15
|
+
|
|
16
|
+
# Check Pydantic version and import accordingly
try:
    from pydantic import VERSION

    # VERSION is a string like "2.7.1"; only the major component matters here.
    PYDANTIC_V2 = int(VERSION.split(".")[0]) >= 2
except (ImportError, AttributeError):
    # Fallback for older versions that don't have VERSION
    PYDANTIC_V2 = False

if PYDANTIC_V2:
    # v2 validation hooks are built on pydantic-core schemas.
    from pydantic import GetCoreSchemaHandler  # type: ignore[attr-defined]
    from pydantic_core import core_schema
else:
    # v1 exposes its low-level validators directly.
    from pydantic.validators import str_validator
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RestrictedTextConfig:
    """Plain value-holder describing how a RestrictedText should be processed.

    A None for any field means "keep the RestrictedText class default".
    Instances are placed as pydantic field defaults and picked up by
    RestrictedText's validator.
    """

    def __init__(
        self,
        max_length: Optional[int] = None,
        replace_chars: Optional[Set[str]] = None,
        replacement_char: Optional[str] = None,
        truncation_suffix: Optional[str] = None,
    ):
        self.max_length = max_length
        self.replace_chars = replace_chars
        self.replacement_char = replacement_char
        self.truncation_suffix = truncation_suffix
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class RestrictedText(str):
    """A string type that stores the original value but returns a truncated and sanitized version.

    This type allows you to:
    - Set a maximum length for the displayed value
    - Replace specific characters with a replacement character
    - Access both the original and processed values

    ```python
    from pydantic import BaseModel

    class TestModel(BaseModel):
        # Basic usage with default settings
        name: RestrictedText

        # Custom max length and character replacement using Field
        custom_field: RestrictedText = RestrictedText.with_config(
            max_length=10,
            forbidden_chars={' ', '-', '.'},
            replacement_char='_'
        )

    # Usage example
    model = TestModel(
        name="This is a very long string with special characters!",
        custom_field="hello-world.test"
    )

    print(model.name)  # Truncated and sanitized version
    print(model.name.original)  # Original value
    print(model.custom_field)  # "hello_worl..."
    ```
    """

    # Default configuration (class-level; per-instance overrides via _configure)
    _default_max_length: ClassVar[Optional[int]] = 50
    _default_replace_chars: ClassVar[Set[str]] = {" ", "\t", "\n", "\r"}
    _default_replacement_char: ClassVar[str] = "_"
    _default_truncation_suffix: ClassVar[str] = "..."

    def __new__(cls, value: str = "") -> "RestrictedText":
        """Create a new string instance."""
        # NOTE(review): the underlying str value is always "" — str-level
        # equality, hashing and `in` comparisons therefore behave as the empty
        # string; callers are expected to go through __str__/.processed/.original.
        instance = str.__new__(cls, "")  # We'll set the display value later
        return instance

    def __init__(self, value: str = ""):
        """Initialize the RestrictedText with a value.

        Keeps the raw input in `self.original` and computes the sanitized,
        possibly truncated form using the class-level defaults.
        """
        self.original: str = value
        self.max_length = self._default_max_length
        self.replace_chars = self._default_replace_chars
        self.replacement_char = self._default_replacement_char
        self.truncation_suffix = self._default_truncation_suffix

        # Process the value
        self._processed_value = self._process_value(value)

    def _configure(
        self,
        max_length: Optional[int] = None,
        replace_chars: Optional[Set[str]] = None,
        replacement_char: Optional[str] = None,
        truncation_suffix: Optional[str] = None,
    ) -> "RestrictedText":
        """Configure this instance with custom settings.

        Any argument left as None keeps the current setting. Returns self so
        the call can be chained. The processed value is recomputed from
        `self.original` after the settings change.
        """
        if max_length is not None:
            self.max_length = max_length
        if replace_chars is not None:
            self.replace_chars = replace_chars
        if replacement_char is not None:
            self.replacement_char = replacement_char
        if truncation_suffix is not None:
            self.truncation_suffix = truncation_suffix

        # Reprocess the value with new configuration
        self._processed_value = self._process_value(self.original)
        return self

    def _process_value(self, value: str) -> str:
        """Process the value by replacing characters and truncating."""
        # Replace specified characters
        processed = value
        for char in self.replace_chars:
            processed = processed.replace(char, self.replacement_char)

        # Truncate if necessary
        if self.max_length is not None and len(processed) > self.max_length:
            if len(self.truncation_suffix) >= self.max_length:
                # If suffix is too long, just truncate without suffix
                processed = processed[: self.max_length]
            else:
                # Truncate and add suffix
                truncate_length = self.max_length - len(self.truncation_suffix)
                processed = processed[:truncate_length] + self.truncation_suffix

        return processed

    def __str__(self) -> str:
        """Return the processed (truncated and sanitized) value."""
        return self._processed_value

    def __repr__(self) -> str:
        # Shows the processed form, not the original.
        return f"{self.__class__.__name__}({self._processed_value!r})"

    @property
    def processed(self) -> str:
        """Get the processed (truncated and sanitized) value."""
        return self._processed_value

    @classmethod
    def with_config(
        cls,
        max_length: Optional[int] = None,
        forbidden_chars: Optional[Set[str]] = None,
        replacement_char: Optional[str] = None,
        truncation_suffix: Optional[str] = None,
    ) -> RestrictedTextConfig:
        """Create a configuration object for use as field default.

        Args:
            max_length: Maximum length of the processed string
            forbidden_chars: Set of characters to replace
            replacement_char: Character to use as replacement
            truncation_suffix: Suffix to add when truncating

        Returns:
            A configuration object that can be used as field default
        """
        return RestrictedTextConfig(
            max_length=max_length,
            replace_chars=forbidden_chars,
            replacement_char=replacement_char,
            truncation_suffix=truncation_suffix,
        )

    # Pydantic v2 methods — only defined when running under pydantic v2, so
    # the class body branches on the module-level PYDANTIC_V2 flag.
    if PYDANTIC_V2:

        @classmethod
        def _validate(
            cls,
            __input_value: Union[str, "RestrictedText"],
            _: core_schema.ValidationInfo,
        ) -> "RestrictedText":
            """Validate and create a RestrictedText instance."""
            if isinstance(__input_value, RestrictedText):
                return __input_value
            return cls(__input_value)

        @classmethod
        def __get_pydantic_core_schema__(
            cls, source: type[Any], handler: GetCoreSchemaHandler
        ) -> core_schema.CoreSchema:
            """Get the Pydantic core schema for this type."""
            # NOTE(review): unlike the v1 path, this branch does not apply a
            # RestrictedTextConfig field default — verify v2 fields configured
            # via with_config() behave as intended.
            return core_schema.with_info_after_validator_function(
                cls._validate,
                core_schema.str_schema(),
                field_name=cls.__name__,
            )

    # Pydantic v1 methods
    else:

        @classmethod
        def __get_validators__(cls):
            """Pydantic v1 validator method."""
            yield cls.validate

        @classmethod
        def validate(cls, v, field=None):
            """Validate and create a RestrictedText instance for Pydantic v1."""
            if isinstance(v, RestrictedText):
                return v

            if not isinstance(v, str):
                # Let pydantic handle the string validation
                v = str_validator(v)

            # Create instance
            instance = cls(v)

            # Check if there's a field default that contains configuration
            if (
                field
                and hasattr(field, "default")
                and isinstance(field.default, RestrictedTextConfig)
            ):
                config = field.default
                instance._configure(
                    max_length=config.max_length,
                    replace_chars=config.replace_chars,
                    replacement_char=config.replacement_char,
                    truncation_suffix=config.truncation_suffix,
                )

            return instance

        @classmethod
        def __modify_schema__(cls, field_schema):
            """Modify the JSON schema for Pydantic v1."""
            field_schema.update(type="string", examples=["example string"])