acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc10__py3-none-any.whl
This diff shows the changes between the two package versions as they were published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic (see the registry listing for details).
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/METADATA +2623 -2624
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/RECORD +53 -49
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/emitter/mce_builder.py +28 -13
- datahub/ingestion/graph/client.py +15 -11
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/subtypes.py +7 -0
- datahub/ingestion/source/identity/okta.py +22 -0
- datahub/ingestion/source/metabase.py +3 -3
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +10 -4
- datahub/ingestion/source/superset.py +158 -24
- datahub/metadata/_schema_classes.py +157 -14
- datahub/metadata/_urns/urn_defs.py +82 -58
- datahub/metadata/schema.avsc +23 -10
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_shared.py +88 -3
- datahub/sdk/container.py +7 -1
- datahub/sdk/dataset.py +7 -1
- datahub/sdk/{_entity.py → entity.py} +4 -0
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +7 -1
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py +731 -42

@@ -2,11 +2,24 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import
+from typing import (
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    get_args,
+)
 
-
+import avro
+import yaml
+from pydantic import BaseModel, Field, root_validator, validator
 from ruamel.yaml import YAML
+from typing_extensions import TypeAlias
 
+import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import (
@@ -40,6 +53,16 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     UpstreamClass,
 )
+from datahub.metadata.urns import (
+    DataPlatformUrn,
+    GlossaryTermUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+    TagUrn,
+)
+from datahub.pydantic.compat import (
+    PYDANTIC_VERSION,
+)
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
@@ -47,35 +70,103 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class
+class StrictModel(BaseModel):
+    """
+    Base model with strict validation.
+    Compatible with both Pydantic v1 and v2.
+    """
+
+    if PYDANTIC_VERSION >= 2:
+        # Pydantic v2 config
+        model_config = {
+            "validate_assignment": True,
+            "extra": "forbid",
+        }
+    else:
+        # Pydantic v1 config
+        class Config:
+            validate_assignment = True
+            extra = "forbid"
+
+
+# Define type aliases for the complex types
+PropertyValue: TypeAlias = Union[float, str]
+PropertyValueList: TypeAlias = List[PropertyValue]
+StructuredProperties: TypeAlias = Dict[str, Union[PropertyValue, PropertyValueList]]
+
+
+class StructuredPropertiesHelper:
+    @staticmethod
+    def simplify_structured_properties_list(
+        structured_properties: Optional[StructuredProperties],
+    ) -> Optional[StructuredProperties]:
+        def urn_strip(urn: str) -> str:
+            if urn.startswith("urn:li:structuredProperty:"):
+                return urn[len("urn:li:structuredProperty:") :]
+            return urn
+
+        if structured_properties:
+            simplified_structured_properties = (
+                {urn_strip(k): v for k, v in structured_properties.items()}
+                if structured_properties
+                else None
+            )
+            if simplified_structured_properties:
+                # convert lists to single values if possible
+                for k, v in simplified_structured_properties.items():
+                    if isinstance(v, list):
+                        if len(v) == 1:
+                            simplified_structured_properties[k] = v[0]
+                        else:
+                            simplified_structured_properties[k] = v
+                    else:
+                        simplified_structured_properties[k] = v
+
+            return simplified_structured_properties
+        return None
+
+
+class SchemaFieldSpecification(StrictModel):
     id: Optional[str] = None
     urn: Optional[str] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     type: Optional[str] = None
     nativeDataType: Optional[str] = None
     jsonPath: Union[None, str] = None
-    nullable:
+    nullable: bool = False
     description: Union[None, str] = None
+    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
    lastModified: Optional[dict] = None
-    recursive:
+    recursive: bool = False
     globalTags: Optional[List[str]] = None
     glossaryTerms: Optional[List[str]] = None
     isPartOfKey: Optional[bool] = None
     isPartitioningKey: Optional[bool] = None
     jsonProps: Optional[dict] = None
 
+    def remove_type_metadata(self) -> "SchemaFieldSpecification":
+        """
+        Removes type metadata from the schema field specification.
+        This is useful when syncing field metadata back to yaml when
+        the type information is already present in the schema file.
+        """
+        self.type = None
+        self.nativeDataType = None
+        self.jsonPath = None
+        self.isPartitioningKey = None
+        self.isPartOfKey = None
+        self.jsonProps = None
+        return self
+
     def with_structured_properties(
-        self,
-        structured_properties: Optional[Dict[str, List[Union[str, float]]]],
+        self, structured_properties: Optional[StructuredProperties]
     ) -> "SchemaFieldSpecification":
         self.structured_properties = (
-
-
-
+            StructuredPropertiesHelper.simplify_structured_properties_list(
+                structured_properties
+            )
         )
         return self
 
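As an aside, a minimal sketch of what the StructuredPropertiesHelper added above does at runtime (the property keys and values below are invented for illustration): URN prefixes are stripped from keys and single-element value lists collapse to scalars.

```python
# Illustrative only; assumes acryl-datahub >= 1.0.0rc10 is installed.
from datahub.api.entities.dataset.dataset import StructuredPropertiesHelper

raw = {
    # hypothetical structured property keys/values
    "urn:li:structuredProperty:io.acryl.privacy.retentionTime": [90.0],
    "io.acryl.dataManagement.certifier": ["urn:li:corpuser:jdoe", "urn:li:corpuser:asmith"],
}

simplified = StructuredPropertiesHelper.simplify_structured_properties_list(raw)
# Expected, per the helper above:
#   "io.acryl.privacy.retentionTime": 90.0            (prefix stripped, singleton collapsed)
#   "io.acryl.dataManagement.certifier": [... both certifiers ...]  (multi-value list kept)
print(simplified)
```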
@@ -85,10 +176,10 @@ class SchemaFieldSpecification(BaseModel):
     ) -> "SchemaFieldSpecification":
         return SchemaFieldSpecification(
             id=Dataset._simplify_field_path(schema_field.fieldPath),
-            urn=make_schema_field_urn(
-
+            urn=make_schema_field_urn(parent_urn, schema_field.fieldPath),
+            type=SchemaFieldSpecification._from_datahub_type(
+                schema_field.type, schema_field.nativeDataType, allow_complex=True
             ),
-            type=str(schema_field.type),
             nativeDataType=schema_field.nativeDataType,
             nullable=schema_field.nullable,
             description=schema_field.description,
@@ -100,14 +191,15 @@ class SchemaFieldSpecification(BaseModel):
                 else None
             ),
             recursive=schema_field.recursive,
-            globalTags=(
-
-
-            glossaryTerms=
-
-
-
-
+            globalTags=[TagUrn(tag.tag).name for tag in schema_field.globalTags.tags]
+            if schema_field.globalTags
+            else None,
+            glossaryTerms=[
+                GlossaryTermUrn(term.urn).name
+                for term in schema_field.glossaryTerms.terms
+            ]
+            if schema_field.glossaryTerms
+            else None,
             isPartitioningKey=schema_field.isPartitioningKey,
             jsonProps=(
                 json.loads(schema_field.jsonProps) if schema_field.jsonProps else None
@@ -120,10 +212,142 @@ class SchemaFieldSpecification(BaseModel):
             raise ValueError("Either id or urn must be present")
         return v
 
+    @root_validator(pre=True)
+    def sync_description_and_doc(cls, values: Dict) -> Dict:
+        """Synchronize doc and description fields if one is provided but not the other."""
+        description = values.get("description")
+        doc = values.get("doc")
+
+        if description is not None and doc is None:
+            values["doc"] = description
+        elif doc is not None and description is None:
+            values["description"] = doc
+
+        return values
+
+    def get_datahub_type(self) -> models.SchemaFieldDataTypeClass:
+        PrimitiveType = Literal[
+            "string",
+            "number",
+            "int",
+            "long",
+            "float",
+            "double",
+            "boolean",
+            "bytes",
+            "fixed",
+        ]
+        type = self.type.lower() if self.type else self.type
+        if type not in set(get_args(PrimitiveType)):
+            raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+        if type == "string":
+            return models.SchemaFieldDataTypeClass(type=models.StringTypeClass())
+        elif type in ["number", "long", "float", "double", "int"]:
+            return models.SchemaFieldDataTypeClass(type=models.NumberTypeClass())
+        elif type == "fixed":
+            return models.SchemaFieldDataTypeClass(type=models.FixedTypeClass())
+        elif type == "bytes":
+            return models.SchemaFieldDataTypeClass(type=models.BytesTypeClass())
+        elif type == "boolean":
+            return models.SchemaFieldDataTypeClass(type=models.BooleanTypeClass())
+
+        raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+    @staticmethod
+    def _from_datahub_type(
+        input_type: models.SchemaFieldDataTypeClass,
+        native_data_type: str,
+        allow_complex: bool = False,
+    ) -> str:
+        if isinstance(input_type.type, models.StringTypeClass):
+            return "string"
+        elif isinstance(input_type.type, models.NumberTypeClass):
+            if native_data_type in ["long", "float", "double", "int"]:
+                return native_data_type
+            return "number"
+        elif isinstance(input_type.type, models.FixedTypeClass):
+            return "fixed"
+        elif isinstance(input_type.type, models.BytesTypeClass):
+            return "bytes"
+        elif isinstance(input_type.type, models.BooleanTypeClass):
+            return "boolean"
+        elif allow_complex and isinstance(input_type.type, models.ArrayTypeClass):
+            return "array"
+        elif allow_complex and isinstance(input_type.type, models.MapTypeClass):
+            return "map"
+        elif allow_complex and isinstance(input_type.type, models.UnionTypeClass):
+            return "union"
+        elif allow_complex:
+            return "record"
+        raise ValueError(f"Type {input_type} is not a valid primitive type")
+
+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            kwargs.pop("exclude_defaults", None)
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+
+            return super().dict(exclude=exclude, exclude_defaults=True, **kwargs)
+
+    else:
+        # For v2, implement model_dump with similar logic as dict
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+            if hasattr(super(), "model_dump"):
+                return super().model_dump(  # type: ignore
+                    exclude=exclude, exclude_defaults=True, **kwargs
+                )
+
 
 class SchemaSpecification(BaseModel):
     file: Optional[str] = None
     fields: Optional[List[SchemaFieldSpecification]] = None
+    raw_schema: Optional[str] = None
 
     @validator("file")
     def file_must_be_avsc(cls, v):
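A short, hypothetical illustration of two of the SchemaFieldSpecification additions above: the pre-validator that keeps doc and description in sync, and get_datahub_type(), which maps the YAML-level primitive names onto DataHub's type classes. The field id and doc text are made up.

```python
from datahub.api.entities.dataset.dataset import SchemaFieldSpecification

field = SchemaFieldSpecification(id="event_ts", type="long", doc="Event time in epoch millis")

# The root validator mirrors doc into description (and vice versa) when only one is set.
assert field.description == "Event time in epoch millis"

# "long" (like int/float/double/number) resolves to a NumberTypeClass-backed
# SchemaFieldDataTypeClass; "string", "boolean", "bytes" and "fixed" map analogously.
print(field.get_datahub_type())
```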
@@ -143,12 +367,16 @@ class Ownership(ConfigModel):
 
 
 class StructuredPropertyValue(ConfigModel):
-    value: Union[str, float, List[str], List[float]]
+    value: Union[str, int, float, List[str], List[int], List[float]]
     created: Optional[str] = None
     lastModified: Optional[str] = None
 
 
-class
+class DatasetRetrievalConfig(BaseModel):
+    include_downstreams: Optional[bool] = False
+
+
+class Dataset(StrictModel):
     id: Optional[str] = None
     platform: Optional[str] = None
     env: str = "PROD"
@@ -163,9 +391,7 @@ class Dataset(BaseModel):
     tags: Optional[List[str]] = None
     glossary_terms: Optional[List[str]] = None
     owners: Optional[List[Union[str, Ownership]]] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     external_url: Optional[str] = None
 
     @property
@@ -199,6 +425,10 @@ class Dataset(BaseModel):
             return v[len("urn:li:dataPlatform:") :]
         return v
 
+    @validator("structured_properties")
+    def simplify_structured_properties(cls, v):
+        return StructuredPropertiesHelper.simplify_structured_properties_list(v)
+
     def _mint_auditstamp(self, message: str) -> AuditStampClass:
         return AuditStampClass(
             time=int(time.time() * 1000.0),
@@ -221,6 +451,14 @@ class Dataset(BaseModel):
             typeUrn=ownership_type_urn,
         )
 
+    @staticmethod
+    def get_patch_builder(urn: str) -> DatasetPatchBuilder:
+        return DatasetPatchBuilder(urn)
+
+    def patch_builder(self) -> DatasetPatchBuilder:
+        assert self.urn
+        return DatasetPatchBuilder(self.urn)
+
     @classmethod
     def from_yaml(cls, file: str) -> Iterable["Dataset"]:
         with open(file) as fp:
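For orientation, a hedged sketch of how the new patch-builder accessors might be used. It assumes DatasetPatchBuilder's existing add_tag()/build() API from datahub.specific.dataset; the dataset urn and tag below are placeholders.

```python
from datahub.api.entities.dataset.dataset import Dataset
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.metadata.schema_classes import TagAssociationClass

urn = make_dataset_urn(platform="hive", name="logging_events", env="PROD")

# Static form: build a patch from a bare urn, no Dataset instance required.
patch_builder = Dataset.get_patch_builder(urn)
patch_builder.add_tag(TagAssociationClass(tag=make_tag_urn("tier-gold")))

for mcp in patch_builder.build():
    print(mcp.entityUrn, mcp.aspectName)

# Instance form: Dataset(...).patch_builder() asserts that the urn has already been minted.
```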
@@ -230,9 +468,45 @@ class Dataset(BaseModel):
                 datasets = [datasets]
             for dataset_raw in datasets:
                 dataset = Dataset.parse_obj(dataset_raw)
+                # dataset = Dataset.model_validate(dataset_raw, strict=True)
                 yield dataset
 
-    def
+    def entity_references(self) -> List[str]:
+        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
+        references = []
+        if self.schema_metadata:
+            if self.schema_metadata.fields:
+                for field in self.schema_metadata.fields:
+                    if field.structured_properties:
+                        references.extend(
+                            [
+                                f"{urn_prefix}:{prop_key}"
+                                if not prop_key.startswith(urn_prefix)
+                                else prop_key
+                                for prop_key in field.structured_properties.keys()
+                            ]
+                        )
+                    if field.glossaryTerms:
+                        references.extend(
+                            [make_term_urn(term) for term in field.glossaryTerms]
+                        )
+                    # We don't check references for tags
+        if self.structured_properties:
+            references.extend(
+                [
+                    f"{urn_prefix}:{prop_key}"
+                    if not prop_key.startswith(urn_prefix)
+                    else prop_key
+                    for prop_key in self.structured_properties.keys()
+                ]
+            )
+        if self.glossary_terms:
+            references.extend([make_term_urn(term) for term in self.glossary_terms])
+
+        # We don't check references for tags
+        return list(set(references))
+
+    def generate_mcp(  # noqa: C901
         self,
     ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]:
         mcp = MetadataChangeProposalWrapper(
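A small, self-contained illustration of the new entity_references() helper added above (the dataset, term and property names are invented). It collects the glossary-term and structured-property URNs a spec refers to, expanding bare property keys to full urn:li:structuredProperty:... form.

```python
from datahub.api.entities.dataset.dataset import Dataset

dataset = Dataset(
    id="pet_profiles",
    platform="snowflake",
    glossary_terms=["Classification.PII"],
    structured_properties={"io.acryl.privacy.retentionTime": 90},
)

for ref in sorted(dataset.entity_references()):
    print(ref)
# urn:li:glossaryTerm:Classification.PII
# urn:li:structuredProperty:io.acryl.privacy.retentionTime
```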
@@ -247,9 +521,12 @@ class Dataset(BaseModel):
             yield mcp
 
         if self.schema_metadata:
+            schema_fields = set()
             if self.schema_metadata.file:
                 with open(self.schema_metadata.file) as schema_fp:
                     schema_string = schema_fp.read()
+                    schema_fields_list = avro_schema_to_mce_fields(schema_string)
+                    schema_fields = {field.fieldPath for field in schema_fields_list}
                     schema_metadata = SchemaMetadataClass(
                         schemaName=self.name or self.id or self.urn or "",
                         platform=self.platform_urn,
@@ -264,7 +541,102 @@ class Dataset(BaseModel):
                 yield mcp
 
             if self.schema_metadata.fields:
+                field_type_info_present = any(
+                    field.type for field in self.schema_metadata.fields
+                )
+                all_fields_type_info_present = all(
+                    field.type for field in self.schema_metadata.fields
+                )
+                if field_type_info_present and not all_fields_type_info_present:
+                    raise ValueError(
+                        "Either all fields must have type information or none of them should"
+                    )
+
+                if all_fields_type_info_present:
+                    update_technical_schema = True
+                else:
+                    update_technical_schema = False
+                if update_technical_schema and not self.schema_metadata.file:
+                    # We produce a schema metadata aspect only if we have type information
+                    # and a schema file is not provided.
+                    schema_metadata = SchemaMetadataClass(
+                        schemaName=self.name or self.id or self.urn or "",
+                        platform=self.platform_urn,
+                        version=0,
+                        hash="",
+                        fields=[
+                            SchemaFieldClass(
+                                fieldPath=field.id,  # type: ignore[arg-type]
+                                type=field.get_datahub_type(),
+                                nativeDataType=field.nativeDataType or field.type,  # type: ignore[arg-type]
+                                nullable=field.nullable,
+                                description=field.description,
+                                label=field.label,
+                                created=None,  # This should be auto-populated.
+                                lastModified=None,  # This should be auto-populated.
+                                recursive=field.recursive,
+                                globalTags=GlobalTagsClass(
+                                    tags=[
+                                        TagAssociationClass(tag=make_tag_urn(tag))
+                                        for tag in field.globalTags
+                                    ]
+                                )
+                                if field.globalTags is not None
+                                else None,
+                                glossaryTerms=GlossaryTermsClass(
+                                    terms=[
+                                        GlossaryTermAssociationClass(
+                                            urn=make_term_urn(term)
+                                        )
+                                        for term in field.glossaryTerms
+                                    ],
+                                    auditStamp=self._mint_auditstamp("yaml"),
+                                )
+                                if field.glossaryTerms is not None
+                                else None,
+                                isPartOfKey=field.isPartOfKey,
+                                isPartitioningKey=field.isPartitioningKey,
+                                jsonProps=json.dumps(field.jsonProps)
+                                if field.jsonProps is not None
+                                else None,
+                            )
+                            for field in self.schema_metadata.fields
+                        ],
+                        platformSchema=OtherSchemaClass(
+                            rawSchema=yaml.dump(
+                                self.schema_metadata.dict(
+                                    exclude_none=True, exclude_unset=True
+                                )
+                            )
+                        ),
+                    )
+                    mcp = MetadataChangeProposalWrapper(
+                        entityUrn=self.urn, aspect=schema_metadata
+                    )
+                    yield mcp
+
                 for field in self.schema_metadata.fields:
+                    if schema_fields:
+                        # search for the field in the schema fields set
+                        matched_fields = [
+                            schema_field
+                            for schema_field in schema_fields
+                            if field.id == schema_field
+                            or field.id == Dataset._simplify_field_path(schema_field)
+                        ]
+                        if not matched_fields:
+                            raise ValueError(
+                                f"Field {field.id} not found in the schema file"
+                            )
+                        if len(matched_fields) > 1:
+                            raise ValueError(
+                                f"Field {field.id} matches multiple entries {matched_fields}in the schema file. Use the fully qualified field path."
+                            )
+                        assert len(matched_fields) == 1
+                        assert (
+                            self.urn is not None
+                        )  # validator should have filled this in
+                        field.urn = make_schema_field_urn(self.urn, matched_fields[0])
                     field_urn = field.urn or make_schema_field_urn(
                         self.urn,  # type: ignore[arg-type]
                         field.id,  # type: ignore[arg-type]
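To make the behaviour added above concrete, here is a hedged sketch with invented identifiers. It also assumes the model exposes schema_metadata under the `schema` alias, which the serialization code elsewhere in this file relies on. When every field carries a type and no schema file is referenced, generate_mcp() now emits a schemaMetadata aspect built from the YAML-level field specs.

```python
from datahub.api.entities.dataset.dataset import (
    Dataset,
    SchemaFieldSpecification,
    SchemaSpecification,
)

dataset = Dataset(
    id="user_clicks",
    platform="hive",
    schema=SchemaSpecification(
        fields=[
            SchemaFieldSpecification(id="user_id", type="string", doc="Clicking user"),
            SchemaFieldSpecification(id="ts", type="long", doc="Click time, epoch millis"),
        ]
    ),
)

aspect_names = [mcp.aspectName for mcp in dataset.generate_mcp()]
print(aspect_names)  # expected to include "schemaMetadata" alongside the other aspects
```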
@@ -299,12 +671,15 @@ class Dataset(BaseModel):
                         yield mcp
 
                     if field.structured_properties:
+                        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
                         mcp = MetadataChangeProposalWrapper(
                             entityUrn=field_urn,
                             aspect=StructuredPropertiesClass(
                                 properties=[
                                     StructuredPropertyValueAssignmentClass(
-                                        propertyUrn=f"
+                                        propertyUrn=f"{urn_prefix}:{prop_key}"
+                                        if not prop_key.startswith(urn_prefix)
+                                        else prop_key,
                                         values=(
                                             prop_value
                                             if isinstance(prop_value, list)
@@ -403,6 +778,10 @@ class Dataset(BaseModel):
 
     @staticmethod
     def _simplify_field_path(field_path: str) -> str:
+        # field paths with [type=array] or [type=map] or [type=union] should never be simplified
+        for type in ["array", "map", "union"]:
+            if f"[type={type}]" in field_path:
+                return field_path
         if field_path.startswith("[version=2.0]"):
             # v2 field path
             field_components = []
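A tiny illustration of the guard added above (expected values are shown as comments; the exact simplification of plain v2 paths is handled by the rest of this method): field paths that contain array, map or union markers are now returned unchanged.

```python
from datahub.api.entities.dataset.dataset import Dataset

# A complex v2 path is left untouched because it contains [type=array].
print(
    Dataset._simplify_field_path(
        "[version=2.0].[type=struct].[type=array].[type=string].emails"
    )
)

# A simple v2 path is still reduced by the existing logic (expected: "address").
print(Dataset._simplify_field_path("[version=2.0].[type=struct].[type=string].address"))
```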
@@ -434,7 +813,26 @@ class Dataset(BaseModel):
         )
 
         if schema_metadata:
+            # If the schema is built off of an avro schema, we only extract the fields if they have structured properties
+            # Otherwise, we extract all fields
+            if (
+                schema_metadata.platformSchema
+                and isinstance(schema_metadata.platformSchema, models.OtherSchemaClass)
+                and schema_metadata.platformSchema.rawSchema
+            ):
+                try:
+                    maybe_avro_schema = avro.schema.parse(
+                        schema_metadata.platformSchema.rawSchema
+                    )
+                    schema_fields = avro_schema_to_mce_fields(maybe_avro_schema)
+                except Exception as e:
+                    logger.debug("Failed to parse avro schema: %s", e)
+                    schema_fields = []
+
             schema_specification = SchemaSpecification(
+                raw_schema=schema_metadata.platformSchema.rawSchema
+                if hasattr(schema_metadata.platformSchema, "rawSchema")
+                else None,
                 fields=[
                     SchemaFieldSpecification.from_schema_field(
                         field, urn
@@ -462,8 +860,21 @@ class Dataset(BaseModel):
                     )
                     for field in schema_metadata.fields
                 ]
-            ]
+                ],
             )
+            if schema_fields and schema_specification.fields:
+                # Source was an avro schema, so we only include fields with structured properties, tags or glossary terms
+                schema_specification.fields = [
+                    field.remove_type_metadata()
+                    for field in schema_specification.fields
+                    if field.structured_properties
+                    or field.globalTags
+                    or field.glossaryTerms
+                ]
+                if (
+                    not schema_specification.fields
+                ):  # set fields to None if there are no fields after filtering
+                    schema_specification.fields = None
             return schema_specification
         else:
             return None
@@ -485,7 +896,14 @@ class Dataset(BaseModel):
         return yaml_owners
 
     @classmethod
-    def from_datahub(
+    def from_datahub(
+        cls,
+        graph: DataHubGraph,
+        urn: str,
+        config: DatasetRetrievalConfig = DatasetRetrievalConfig(),
+    ) -> "Dataset":
+        dataset_urn = DatasetUrn.from_string(urn)
+        platform_urn = DataPlatformUrn.from_string(dataset_urn.platform)
         dataset_properties: Optional[DatasetPropertiesClass] = graph.get_aspect(
             urn, DatasetPropertiesClass
         )
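A hedged usage sketch for the new from_datahub() signature and DatasetRetrievalConfig. The server address, dataset urn and the assumption of a reachable DataHub instance are placeholders for illustration.

```python
from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"

dataset = Dataset.from_datahub(
    graph,
    urn,
    config=DatasetRetrievalConfig(include_downstreams=True),
)
print(dataset.downstreams)  # urns of datasets with a DownstreamOf edge pointing here
```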
@@ -500,7 +918,7 @@ class Dataset(BaseModel):
             urn, StructuredPropertiesClass
         )
         if structured_properties:
-            structured_properties_map:
+            structured_properties_map: StructuredProperties = {}
             for sp in structured_properties.properties:
                 if sp.propertyUrn in structured_properties_map:
                     assert isinstance(structured_properties_map[sp.propertyUrn], list)
@@ -508,7 +926,19 @@ class Dataset(BaseModel):
                 else:
                     structured_properties_map[sp.propertyUrn] = sp.values
 
-
+        if config.include_downstreams:
+            related_downstreams = graph.get_related_entities(
+                urn,
+                relationship_types=[
+                    "DownstreamOf",
+                ],
+                direction=DataHubGraph.RelationshipDirection.INCOMING,
+            )
+            downstreams = [r.urn for r in related_downstreams]
+
+        return Dataset(  # type: ignore[arg-type]
+            id=dataset_urn.name,
+            platform=platform_urn.platform_name,
             urn=urn,
             description=(
                 dataset_properties.description
@@ -521,9 +951,11 @@ class Dataset(BaseModel):
                 else None
             ),
             schema=Dataset._schema_from_schema_metadata(graph, urn),
-            tags=[tag.tag for tag in tags.tags] if tags else None,
+            tags=[TagUrn(tag.tag).name for tag in tags.tags] if tags else None,
             glossary_terms=(
-                [term.urn for term in glossary_terms.terms]
+                [GlossaryTermUrn(term.urn).name for term in glossary_terms.terms]
+                if glossary_terms
+                else None
             ),
             owners=yaml_owners,
             properties=(
@@ -533,14 +965,271 @@ class Dataset(BaseModel):
             structured_properties=(
                 structured_properties_map if structured_properties else None
             ),
+            downstreams=downstreams if config.include_downstreams else None,
         )
 
+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom dict method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                # Use dict method for Pydantic v1
+                                processed_field = field.dict(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+    else:
+
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            if hasattr(super(), "model_dump"):
+                result = super().model_dump(exclude=exclude, **kwargs)  # type: ignore
+            else:
+                result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom model_dump method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                processed_field = field.model_dump(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+
     def to_yaml(
         self,
         file: Path,
-    ) ->
+    ) -> bool:
+        """
+        Write model to YAML file only if content has changed.
+        Preserves comments and structure of the existing YAML file.
+        Returns True if file was written, False if no changes were detected.
+        """
+        # Create new model data
+        # Create new model data - choose dict() or model_dump() based on Pydantic version
+        if PYDANTIC_VERSION >= 2:
+            new_data = self.model_dump(
+                exclude_none=True, exclude_unset=True, by_alias=True
+            )
+        else:
+            new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True)
+
+        # Set up ruamel.yaml for preserving comments
+        yaml_handler = YAML(typ="rt")  # round-trip mode
+        yaml_handler.default_flow_style = False
+        yaml_handler.preserve_quotes = True  # type: ignore[assignment]
+        yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+        if file.exists():
+            try:
+                # Load existing data with comments preserved
+                with open(file, "r") as fp:
+                    existing_data = yaml_handler.load(fp)
+
+                # Determine if the file contains a list or a single document
+                if isinstance(existing_data, dict):
+                    existing_data = [existing_data]
+                    is_original_list = False
+                else:
+                    is_original_list = True
+                if isinstance(existing_data, list):
+                    # Handle list case
+                    updated = False
+                    identifier = "urn"
+                    model_id = self.urn
+
+                    if model_id is not None:
+                        # Try to find and update existing item
+                        for item in existing_data:
+                            existing_dataset = Dataset(**item)
+                            item_identifier = item.get(identifier, existing_dataset.urn)
+                            if item_identifier == model_id:
+                                # Found the item to update - preserve structure while updating values
+                                updated = True
+                                if (
+                                    existing_dataset.schema_metadata
+                                    and existing_dataset.schema_metadata.file
+                                ):
+                                    # Preserve the existing schema file path
+                                    new_data["schema"]["file"] = (
+                                        existing_dataset.schema_metadata.file
+                                    )
+                                    # Check if the content of the schema file has changed
+                                    with open(
+                                        existing_dataset.schema_metadata.file
+                                    ) as schema_fp:
+                                        schema_fp_content = schema_fp.read()
+
+                                    if (
+                                        schema_fp_content
+                                        != new_data["schema"]["raw_schema"]
+                                    ):
+                                        # If the content has changed, update the schema file
+                                        schema_file_path = Path(
+                                            existing_dataset.schema_metadata.file
+                                        )
+                                        schema_file_path.write_text(
+                                            new_data["schema"]["raw_schema"]
+                                        )
+                                # Remove raw_schema from the schema aspect before updating
+                                if "schema" in new_data:
+                                    new_data["schema"].pop("raw_schema")
+
+                                _update_dict_preserving_comments(
+                                    item, new_data, ["urn", "properties", "raw_schema"]
+                                )
+                                break
+
+                    if not updated:
+                        # Item not found, append to the list
+                        existing_data.append(new_data)
+                        updated = True
+
+                    # If no update was needed, return early
+                    if not updated:
+                        return False
+
+                    # Write the updated data back
+                    with open(file, "w") as fp:
+                        if not is_original_list:
+                            existing_data = existing_data[0]
+                        yaml_handler.dump(existing_data, fp)
+
+                    return True
+
+            except Exception as e:
+                # If there's any error, we'll create a new file
+                print(
+                    f"Error processing existing file {file}: {e}. Will create a new one."
+                )
+        else:
+            # File doesn't exist or had errors - create a new one with default settings
+            yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+            file.parent.mkdir(parents=True, exist_ok=True)
+
         with open(file, "w") as fp:
-
-
-
-
+            yaml_handler.dump(new_data, fp)
+
+        return True
+
+
+def _update_dict_preserving_comments(
+    target: Dict, source: Dict, optional_fields: Optional[List[str]] = None
+) -> None:
+    """
+    Updates a target dictionary with values from source, preserving comments and structure.
+    This modifies the target dictionary in-place.
+    """
+    if optional_fields is None:
+        optional_fields = ["urn"]
+    # For each key in the source dict
+    for key, value in source.items():
+        if key in target:
+            if isinstance(value, dict) and isinstance(target[key], dict):
+                # Recursively update nested dictionaries
+                _update_dict_preserving_comments(target[key], value)
+            else:
+                # Update scalar or list values
+                # If target value is an int, and source value is a float that is equal to the int, convert to int
+                if isinstance(value, float) and int(value) == value:
+                    target[key] = int(value)
+                else:
+                    target[key] = value
+        elif key not in optional_fields:
+            # Add new keys
+            target[key] = value
+
+    # Remove keys that are in target but not in source
+    keys_to_remove = [k for k in target if k not in source]
+    for key in keys_to_remove:
+        del target[key]
+
+
+def _dict_equal(dict1: Dict, dict2: Dict, optional_keys: List[str]) -> bool:
+    """
+    Compare two dictionaries for equality, ignoring ruamel.yaml's metadata.
+    """
+
+    if len(dict1) != len(dict2):
+        # Check if the difference is only in optional keys
+        if len(dict1) > len(dict2):
+            for key in optional_keys:
+                if key in dict1 and key not in dict2:
+                    del dict1[key]
+        elif len(dict2) > len(dict1):
+            for key in optional_keys:
+                if key in dict2 and key not in dict1:
+                    del dict2[key]
+        if len(dict1) != len(dict2):
+            return False
+
+    for key, value in dict1.items():
+        if key not in dict2:
+            return False
+
+        if isinstance(value, dict) and isinstance(dict2[key], dict):
+            if not _dict_equal(value, dict2[key], optional_keys):
+                return False
+        elif isinstance(value, list) and isinstance(dict2[key], list):
+            if len(value) != len(dict2[key]):
+                return False
+
+            # Check list items (simplified for brevity)
+            for i in range(len(value)):
+                if isinstance(value[i], dict) and isinstance(dict2[key][i], dict):
+                    if not _dict_equal(value[i], dict2[key][i], optional_keys):
+                        return False
+                elif value[i] != dict2[key][i]:
+                    return False
+        elif value != dict2[key]:
+            return False
+
+    return True