acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (88)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/configuration/git.py +1 -3
  8. datahub/ingestion/glossary/classification_mixin.py +1 -1
  9. datahub/ingestion/graph/client.py +16 -12
  10. datahub/ingestion/graph/filters.py +64 -37
  11. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  12. datahub/ingestion/source/abs/config.py +2 -4
  13. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  14. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  15. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  16. datahub/ingestion/source/csv_enricher.py +1 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  18. datahub/ingestion/source/file.py +5 -2
  19. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  20. datahub/ingestion/source/ge_data_profiler.py +11 -14
  21. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  22. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  23. datahub/ingestion/source/identity/okta.py +1 -3
  24. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  25. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  26. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  27. datahub/ingestion/source/looker/lookml_source.py +2 -1
  28. datahub/ingestion/source/metadata/lineage.py +2 -2
  29. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  30. datahub/ingestion/source/nifi.py +6 -3
  31. datahub/ingestion/source/openapi_parser.py +2 -2
  32. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  33. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  34. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  37. datahub/ingestion/source/preset.py +7 -4
  38. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  39. datahub/ingestion/source/redash.py +2 -1
  40. datahub/ingestion/source/s3/config.py +2 -4
  41. datahub/ingestion/source/s3/source.py +20 -41
  42. datahub/ingestion/source/salesforce.py +1 -1
  43. datahub/ingestion/source/schema_inference/object.py +1 -1
  44. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  46. datahub/ingestion/source/sql/athena.py +2 -2
  47. datahub/ingestion/source/sql/sql_common.py +2 -2
  48. datahub/ingestion/source/sql/sql_types.py +2 -2
  49. datahub/ingestion/source/sql/teradata.py +4 -2
  50. datahub/ingestion/source/sql/trino.py +2 -2
  51. datahub/ingestion/source/superset.py +218 -56
  52. datahub/ingestion/source/tableau/tableau.py +1 -5
  53. datahub/lite/duckdb_lite.py +3 -9
  54. datahub/metadata/_schema_classes.py +157 -14
  55. datahub/metadata/_urns/urn_defs.py +58 -58
  56. datahub/metadata/schema.avsc +23 -10
  57. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  58. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  59. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  60. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  61. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  62. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  63. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  64. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  65. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  66. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  67. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  68. datahub/metadata/schemas/PostKey.avsc +2 -1
  69. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  70. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  71. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  72. datahub/pydantic/__init__.py +0 -0
  73. datahub/pydantic/compat.py +58 -0
  74. datahub/sdk/__init__.py +1 -0
  75. datahub/sdk/_all_entities.py +1 -1
  76. datahub/sdk/_shared.py +88 -3
  77. datahub/sdk/container.py +7 -1
  78. datahub/sdk/dataset.py +10 -4
  79. datahub/sdk/{_entity.py → entity.py} +4 -0
  80. datahub/sdk/entity_client.py +1 -1
  81. datahub/sdk/main_client.py +7 -1
  82. datahub/sdk/resolver_client.py +17 -29
  83. datahub/sdk/search_client.py +50 -0
  84. datahub/sdk/search_filters.py +374 -0
  85. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
  86. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
  87. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
  88. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py
@@ -2,11 +2,24 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import (
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    get_args,
+)

-from pydantic import BaseModel, Field, validator
+import avro
+import yaml
+from pydantic import BaseModel, Field, root_validator, validator
 from ruamel.yaml import YAML
+from typing_extensions import TypeAlias

+import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import (
@@ -40,6 +53,16 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     UpstreamClass,
 )
+from datahub.metadata.urns import (
+    DataPlatformUrn,
+    GlossaryTermUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+    TagUrn,
+)
+from datahub.pydantic.compat import (
+    PYDANTIC_VERSION,
+)
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.utilities.urns.dataset_urn import DatasetUrn

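The new datahub.pydantic.compat module (datahub/pydantic/compat.py, +58 lines in the file list above) supplies the PYDANTIC_VERSION constant that the rest of this file branches on. A rough, hypothetical sketch of what such a constant involves, assuming it is derived from the installed pydantic release (the wheel's actual implementation may differ):

# Hypothetical sketch only; the real datahub/pydantic/compat.py may differ.
import pydantic

# pydantic.VERSION is a string such as "1.10.14" or "2.6.4".
PYDANTIC_VERSION = int(pydantic.VERSION.split(".")[0])

if PYDANTIC_VERSION >= 2:
    pass  # v2-style model_config dicts, model_dump(), model_validate()
else:
    pass  # v1-style Config classes, dict(), parse_obj()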
@@ -47,35 +70,103 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)


-class SchemaFieldSpecification(BaseModel):
+class StrictModel(BaseModel):
+    """
+    Base model with strict validation.
+    Compatible with both Pydantic v1 and v2.
+    """
+
+    if PYDANTIC_VERSION >= 2:
+        # Pydantic v2 config
+        model_config = {
+            "validate_assignment": True,
+            "extra": "forbid",
+        }
+    else:
+        # Pydantic v1 config
+        class Config:
+            validate_assignment = True
+            extra = "forbid"
+
+
+# Define type aliases for the complex types
+PropertyValue: TypeAlias = Union[float, str]
+PropertyValueList: TypeAlias = List[PropertyValue]
+StructuredProperties: TypeAlias = Dict[str, Union[PropertyValue, PropertyValueList]]
+
+
+class StructuredPropertiesHelper:
+    @staticmethod
+    def simplify_structured_properties_list(
+        structured_properties: Optional[StructuredProperties],
+    ) -> Optional[StructuredProperties]:
+        def urn_strip(urn: str) -> str:
+            if urn.startswith("urn:li:structuredProperty:"):
+                return urn[len("urn:li:structuredProperty:") :]
+            return urn
+
+        if structured_properties:
+            simplified_structured_properties = (
+                {urn_strip(k): v for k, v in structured_properties.items()}
+                if structured_properties
+                else None
+            )
+            if simplified_structured_properties:
+                # convert lists to single values if possible
+                for k, v in simplified_structured_properties.items():
+                    if isinstance(v, list):
+                        if len(v) == 1:
+                            simplified_structured_properties[k] = v[0]
+                        else:
+                            simplified_structured_properties[k] = v
+                    else:
+                        simplified_structured_properties[k] = v
+
+            return simplified_structured_properties
+        return None
+
+
+class SchemaFieldSpecification(StrictModel):
     id: Optional[str] = None
     urn: Optional[str] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     type: Optional[str] = None
     nativeDataType: Optional[str] = None
     jsonPath: Union[None, str] = None
-    nullable: Optional[bool] = None
+    nullable: bool = False
     description: Union[None, str] = None
+    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
     lastModified: Optional[dict] = None
-    recursive: Optional[bool] = None
+    recursive: bool = False
     globalTags: Optional[List[str]] = None
     glossaryTerms: Optional[List[str]] = None
     isPartOfKey: Optional[bool] = None
     isPartitioningKey: Optional[bool] = None
     jsonProps: Optional[dict] = None

+    def remove_type_metadata(self) -> "SchemaFieldSpecification":
+        """
+        Removes type metadata from the schema field specification.
+        This is useful when syncing field metadata back to yaml when
+        the type information is already present in the schema file.
+        """
+        self.type = None
+        self.nativeDataType = None
+        self.jsonPath = None
+        self.isPartitioningKey = None
+        self.isPartOfKey = None
+        self.jsonProps = None
+        return self
+
     def with_structured_properties(
-        self,
-        structured_properties: Optional[Dict[str, List[Union[str, float]]]],
+        self, structured_properties: Optional[StructuredProperties]
     ) -> "SchemaFieldSpecification":
         self.structured_properties = (
-            {k: v for k, v in structured_properties.items()}
-            if structured_properties
-            else None
+            StructuredPropertiesHelper.simplify_structured_properties_list(
+                structured_properties
+            )
         )
         return self

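A small illustration of what the new helper does, derived only from the code above (the property names are invented for the example): keys lose the urn:li:structuredProperty: prefix and single-element lists collapse to scalars.

from datahub.api.entities.dataset.dataset import StructuredPropertiesHelper

props = {
    "urn:li:structuredProperty:io.acryl.privacy.retentionTime": [30.0],
    "io.acryl.dataManagement.steward": ["alice", "bob"],
}
simplified = StructuredPropertiesHelper.simplify_structured_properties_list(props)
# Expected, per the logic above:
#   {"io.acryl.privacy.retentionTime": 30.0,               # prefix stripped, singleton collapsed
#    "io.acryl.dataManagement.steward": ["alice", "bob"]}  # multi-value list kept as-is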
@@ -85,10 +176,10 @@ class SchemaFieldSpecification(BaseModel):
     ) -> "SchemaFieldSpecification":
         return SchemaFieldSpecification(
             id=Dataset._simplify_field_path(schema_field.fieldPath),
-            urn=make_schema_field_urn(
-                parent_urn, Dataset._simplify_field_path(schema_field.fieldPath)
+            urn=make_schema_field_urn(parent_urn, schema_field.fieldPath),
+            type=SchemaFieldSpecification._from_datahub_type(
+                schema_field.type, schema_field.nativeDataType, allow_complex=True
             ),
-            type=str(schema_field.type),
             nativeDataType=schema_field.nativeDataType,
             nullable=schema_field.nullable,
             description=schema_field.description,
@@ -100,14 +191,15 @@ class SchemaFieldSpecification(BaseModel):
                 else None
             ),
             recursive=schema_field.recursive,
-            globalTags=(
-                schema_field.globalTags.__dict__ if schema_field.globalTags else None
-            ),
-            glossaryTerms=(
-                schema_field.glossaryTerms.__dict__
-                if schema_field.glossaryTerms
-                else None
-            ),
+            globalTags=[TagUrn(tag.tag).name for tag in schema_field.globalTags.tags]
+            if schema_field.globalTags
+            else None,
+            glossaryTerms=[
+                GlossaryTermUrn(term.urn).name
+                for term in schema_field.glossaryTerms.terms
+            ]
+            if schema_field.glossaryTerms
+            else None,
             isPartitioningKey=schema_field.isPartitioningKey,
             jsonProps=(
                 json.loads(schema_field.jsonProps) if schema_field.jsonProps else None
@@ -120,10 +212,142 @@ class SchemaFieldSpecification(BaseModel):
             raise ValueError("Either id or urn must be present")
         return v

+    @root_validator(pre=True)
+    def sync_description_and_doc(cls, values: Dict) -> Dict:
+        """Synchronize doc and description fields if one is provided but not the other."""
+        description = values.get("description")
+        doc = values.get("doc")
+
+        if description is not None and doc is None:
+            values["doc"] = description
+        elif doc is not None and description is None:
+            values["description"] = doc
+
+        return values
+
+    def get_datahub_type(self) -> models.SchemaFieldDataTypeClass:
+        PrimitiveType = Literal[
+            "string",
+            "number",
+            "int",
+            "long",
+            "float",
+            "double",
+            "boolean",
+            "bytes",
+            "fixed",
+        ]
+        type = self.type.lower() if self.type else self.type
+        if type not in set(get_args(PrimitiveType)):
+            raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+        if type == "string":
+            return models.SchemaFieldDataTypeClass(type=models.StringTypeClass())
+        elif type in ["number", "long", "float", "double", "int"]:
+            return models.SchemaFieldDataTypeClass(type=models.NumberTypeClass())
+        elif type == "fixed":
+            return models.SchemaFieldDataTypeClass(type=models.FixedTypeClass())
+        elif type == "bytes":
+            return models.SchemaFieldDataTypeClass(type=models.BytesTypeClass())
+        elif type == "boolean":
+            return models.SchemaFieldDataTypeClass(type=models.BooleanTypeClass())
+
+        raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+    @staticmethod
+    def _from_datahub_type(
+        input_type: models.SchemaFieldDataTypeClass,
+        native_data_type: str,
+        allow_complex: bool = False,
+    ) -> str:
+        if isinstance(input_type.type, models.StringTypeClass):
+            return "string"
+        elif isinstance(input_type.type, models.NumberTypeClass):
+            if native_data_type in ["long", "float", "double", "int"]:
+                return native_data_type
+            return "number"
+        elif isinstance(input_type.type, models.FixedTypeClass):
+            return "fixed"
+        elif isinstance(input_type.type, models.BytesTypeClass):
+            return "bytes"
+        elif isinstance(input_type.type, models.BooleanTypeClass):
+            return "boolean"
+        elif allow_complex and isinstance(input_type.type, models.ArrayTypeClass):
+            return "array"
+        elif allow_complex and isinstance(input_type.type, models.MapTypeClass):
+            return "map"
+        elif allow_complex and isinstance(input_type.type, models.UnionTypeClass):
+            return "union"
+        elif allow_complex:
+            return "record"
+        raise ValueError(f"Type {input_type} is not a valid primitive type")
+
+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            kwargs.pop("exclude_defaults", None)
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+
+            return super().dict(exclude=exclude, exclude_defaults=True, **kwargs)
+
+    else:
+        # For v2, implement model_dump with similar logic as dict
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+            if hasattr(super(), "model_dump"):
+                return super().model_dump(  # type: ignore
+                    exclude=exclude, exclude_defaults=True, **kwargs
+                )
+

 class SchemaSpecification(BaseModel):
     file: Optional[str] = None
     fields: Optional[List[SchemaFieldSpecification]] = None
+    raw_schema: Optional[str] = None

     @validator("file")
     def file_must_be_avsc(cls, v):
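The primitive-type handling added above can be exercised roughly as follows; the field names are illustrative and the behaviour is read straight off the branches in get_datahub_type:

from datahub.api.entities.dataset.dataset import SchemaFieldSpecification

field = SchemaFieldSpecification(id="user_id", type="long")
dtype = field.get_datahub_type()
# "long" takes the numeric branch, so dtype wraps models.NumberTypeClass().

field = SchemaFieldSpecification(id="payload", type="struct")
# field.get_datahub_type() would raise ValueError here: "struct" is not one of the
# primitive literals ("string", "number", "int", "long", "float", "double",
# "boolean", "bytes", "fixed").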
@@ -143,12 +367,16 @@ class Ownership(ConfigModel):


 class StructuredPropertyValue(ConfigModel):
-    value: Union[str, float, List[str], List[float]]
+    value: Union[str, int, float, List[str], List[int], List[float]]
     created: Optional[str] = None
     lastModified: Optional[str] = None


-class Dataset(BaseModel):
+class DatasetRetrievalConfig(BaseModel):
+    include_downstreams: Optional[bool] = False
+
+
+class Dataset(StrictModel):
     id: Optional[str] = None
     platform: Optional[str] = None
     env: str = "PROD"
@@ -163,9 +391,7 @@ class Dataset(BaseModel):
     tags: Optional[List[str]] = None
     glossary_terms: Optional[List[str]] = None
     owners: Optional[List[Union[str, Ownership]]] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     external_url: Optional[str] = None

     @property
@@ -199,6 +425,10 @@ class Dataset(BaseModel):
             return v[len("urn:li:dataPlatform:") :]
         return v

+    @validator("structured_properties")
+    def simplify_structured_properties(cls, v):
+        return StructuredPropertiesHelper.simplify_structured_properties_list(v)
+
     def _mint_auditstamp(self, message: str) -> AuditStampClass:
         return AuditStampClass(
             time=int(time.time() * 1000.0),
@@ -221,6 +451,14 @@ class Dataset(BaseModel):
             typeUrn=ownership_type_urn,
         )

+    @staticmethod
+    def get_patch_builder(urn: str) -> DatasetPatchBuilder:
+        return DatasetPatchBuilder(urn)
+
+    def patch_builder(self) -> DatasetPatchBuilder:
+        assert self.urn
+        return DatasetPatchBuilder(self.urn)
+
     @classmethod
     def from_yaml(cls, file: str) -> Iterable["Dataset"]:
         with open(file) as fp:
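The new patch_builder helpers simply return a DatasetPatchBuilder bound to the dataset's URN. A hedged usage sketch follows: the URN, server address, and the set_description call are illustrative and not part of this diff.

from datahub.api.entities.dataset.dataset import Dataset
from datahub.emitter.rest_emitter import DatahubRestEmitter

urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"  # placeholder
patch_builder = Dataset.get_patch_builder(urn)
patch_builder.set_description("Description applied as a patch")  # illustrative call

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
for mcp in patch_builder.build():
    emitter.emit(mcp)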
@@ -230,9 +468,45 @@ class Dataset(BaseModel):
                 datasets = [datasets]
             for dataset_raw in datasets:
                 dataset = Dataset.parse_obj(dataset_raw)
+                # dataset = Dataset.model_validate(dataset_raw, strict=True)
                 yield dataset

-    def generate_mcp(
+    def entity_references(self) -> List[str]:
+        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
+        references = []
+        if self.schema_metadata:
+            if self.schema_metadata.fields:
+                for field in self.schema_metadata.fields:
+                    if field.structured_properties:
+                        references.extend(
+                            [
+                                f"{urn_prefix}:{prop_key}"
+                                if not prop_key.startswith(urn_prefix)
+                                else prop_key
+                                for prop_key in field.structured_properties.keys()
+                            ]
+                        )
+                    if field.glossaryTerms:
+                        references.extend(
+                            [make_term_urn(term) for term in field.glossaryTerms]
+                        )
+                    # We don't check references for tags
+        if self.structured_properties:
+            references.extend(
+                [
+                    f"{urn_prefix}:{prop_key}"
+                    if not prop_key.startswith(urn_prefix)
+                    else prop_key
+                    for prop_key in self.structured_properties.keys()
+                ]
+            )
+        if self.glossary_terms:
+            references.extend([make_term_urn(term) for term in self.glossary_terms])
+
+        # We don't check references for tags
+        return list(set(references))
+
+    def generate_mcp(  # noqa: C901
         self,
     ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]:
         mcp = MetadataChangeProposalWrapper(
@@ -247,9 +521,12 @@ class Dataset(BaseModel):
             yield mcp

         if self.schema_metadata:
+            schema_fields = set()
             if self.schema_metadata.file:
                 with open(self.schema_metadata.file) as schema_fp:
                     schema_string = schema_fp.read()
+                    schema_fields_list = avro_schema_to_mce_fields(schema_string)
+                    schema_fields = {field.fieldPath for field in schema_fields_list}
                 schema_metadata = SchemaMetadataClass(
                     schemaName=self.name or self.id or self.urn or "",
                     platform=self.platform_urn,
@@ -264,7 +541,102 @@ class Dataset(BaseModel):
                 yield mcp

             if self.schema_metadata.fields:
+                field_type_info_present = any(
+                    field.type for field in self.schema_metadata.fields
+                )
+                all_fields_type_info_present = all(
+                    field.type for field in self.schema_metadata.fields
+                )
+                if field_type_info_present and not all_fields_type_info_present:
+                    raise ValueError(
+                        "Either all fields must have type information or none of them should"
+                    )
+
+                if all_fields_type_info_present:
+                    update_technical_schema = True
+                else:
+                    update_technical_schema = False
+                if update_technical_schema and not self.schema_metadata.file:
+                    # We produce a schema metadata aspect only if we have type information
+                    # and a schema file is not provided.
+                    schema_metadata = SchemaMetadataClass(
+                        schemaName=self.name or self.id or self.urn or "",
+                        platform=self.platform_urn,
+                        version=0,
+                        hash="",
+                        fields=[
+                            SchemaFieldClass(
+                                fieldPath=field.id,  # type: ignore[arg-type]
+                                type=field.get_datahub_type(),
+                                nativeDataType=field.nativeDataType or field.type,  # type: ignore[arg-type]
+                                nullable=field.nullable,
+                                description=field.description,
+                                label=field.label,
+                                created=None,  # This should be auto-populated.
+                                lastModified=None,  # This should be auto-populated.
+                                recursive=field.recursive,
+                                globalTags=GlobalTagsClass(
+                                    tags=[
+                                        TagAssociationClass(tag=make_tag_urn(tag))
+                                        for tag in field.globalTags
+                                    ]
+                                )
+                                if field.globalTags is not None
+                                else None,
+                                glossaryTerms=GlossaryTermsClass(
+                                    terms=[
+                                        GlossaryTermAssociationClass(
+                                            urn=make_term_urn(term)
+                                        )
+                                        for term in field.glossaryTerms
+                                    ],
+                                    auditStamp=self._mint_auditstamp("yaml"),
+                                )
+                                if field.glossaryTerms is not None
+                                else None,
+                                isPartOfKey=field.isPartOfKey,
+                                isPartitioningKey=field.isPartitioningKey,
+                                jsonProps=json.dumps(field.jsonProps)
+                                if field.jsonProps is not None
+                                else None,
+                            )
+                            for field in self.schema_metadata.fields
+                        ],
+                        platformSchema=OtherSchemaClass(
+                            rawSchema=yaml.dump(
+                                self.schema_metadata.dict(
+                                    exclude_none=True, exclude_unset=True
+                                )
+                            )
+                        ),
+                    )
+                    mcp = MetadataChangeProposalWrapper(
+                        entityUrn=self.urn, aspect=schema_metadata
+                    )
+                    yield mcp
+
                 for field in self.schema_metadata.fields:
+                    if schema_fields:
+                        # search for the field in the schema fields set
+                        matched_fields = [
+                            schema_field
+                            for schema_field in schema_fields
+                            if field.id == schema_field
+                            or field.id == Dataset._simplify_field_path(schema_field)
+                        ]
+                        if not matched_fields:
+                            raise ValueError(
+                                f"Field {field.id} not found in the schema file"
+                            )
+                        if len(matched_fields) > 1:
+                            raise ValueError(
+                                f"Field {field.id} matches multiple entries {matched_fields}in the schema file. Use the fully qualified field path."
+                            )
+                        assert len(matched_fields) == 1
+                        assert (
+                            self.urn is not None
+                        )  # validator should have filled this in
+                        field.urn = make_schema_field_urn(self.urn, matched_fields[0])
                     field_urn = field.urn or make_schema_field_urn(
                         self.urn,  # type: ignore[arg-type]
                         field.id,  # type: ignore[arg-type]
@@ -299,12 +671,15 @@ class Dataset(BaseModel):
                         yield mcp

                     if field.structured_properties:
+                        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
                         mcp = MetadataChangeProposalWrapper(
                             entityUrn=field_urn,
                             aspect=StructuredPropertiesClass(
                                 properties=[
                                     StructuredPropertyValueAssignmentClass(
-                                        propertyUrn=f"urn:li:structuredProperty:{prop_key}",
+                                        propertyUrn=f"{urn_prefix}:{prop_key}"
+                                        if not prop_key.startswith(urn_prefix)
+                                        else prop_key,
                                         values=(
                                             prop_value
                                             if isinstance(prop_value, list)
@@ -403,6 +778,10 @@ class Dataset(BaseModel):

     @staticmethod
     def _simplify_field_path(field_path: str) -> str:
+        # field paths with [type=array] or [type=map] or [type=union] should never be simplified
+        for type in ["array", "map", "union"]:
+            if f"[type={type}]" in field_path:
+                return field_path
         if field_path.startswith("[version=2.0]"):
             # v2 field path
             field_components = []
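To illustrate the guard added above (the field paths are made up): a plain v2 field path still collapses to its leaf name, while any path containing an array, map, or union token is now returned untouched.

from datahub.api.entities.dataset.dataset import Dataset

simple = "[version=2.0].[type=Record].[type=string].user_id"             # illustrative
nested = "[version=2.0].[type=Record].[type=array].[type=string].emails"  # illustrative

Dataset._simplify_field_path(simple)  # -> "user_id"
Dataset._simplify_field_path(nested)  # -> returned as-is because "[type=array]" is present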
@@ -434,7 +813,26 @@ class Dataset(BaseModel):
         )

         if schema_metadata:
+            # If the schema is built off of an avro schema, we only extract the fields if they have structured properties
+            # Otherwise, we extract all fields
+            if (
+                schema_metadata.platformSchema
+                and isinstance(schema_metadata.platformSchema, models.OtherSchemaClass)
+                and schema_metadata.platformSchema.rawSchema
+            ):
+                try:
+                    maybe_avro_schema = avro.schema.parse(
+                        schema_metadata.platformSchema.rawSchema
+                    )
+                    schema_fields = avro_schema_to_mce_fields(maybe_avro_schema)
+                except Exception as e:
+                    logger.debug("Failed to parse avro schema: %s", e)
+                    schema_fields = []
+
             schema_specification = SchemaSpecification(
+                raw_schema=schema_metadata.platformSchema.rawSchema
+                if hasattr(schema_metadata.platformSchema, "rawSchema")
+                else None,
                 fields=[
                     SchemaFieldSpecification.from_schema_field(
                         field, urn
@@ -462,8 +860,21 @@ class Dataset(BaseModel):
                         )
                         for field in schema_metadata.fields
                     ]
-                ]
+                ],
             )
+            if schema_fields and schema_specification.fields:
+                # Source was an avro schema, so we only include fields with structured properties, tags or glossary terms
+                schema_specification.fields = [
+                    field.remove_type_metadata()
+                    for field in schema_specification.fields
+                    if field.structured_properties
+                    or field.globalTags
+                    or field.glossaryTerms
+                ]
+                if (
+                    not schema_specification.fields
+                ):  # set fields to None if there are no fields after filtering
+                    schema_specification.fields = None
             return schema_specification
         else:
             return None
@@ -485,7 +896,14 @@ class Dataset(BaseModel):
         return yaml_owners

     @classmethod
-    def from_datahub(cls, graph: DataHubGraph, urn: str) -> "Dataset":
+    def from_datahub(
+        cls,
+        graph: DataHubGraph,
+        urn: str,
+        config: DatasetRetrievalConfig = DatasetRetrievalConfig(),
+    ) -> "Dataset":
+        dataset_urn = DatasetUrn.from_string(urn)
+        platform_urn = DataPlatformUrn.from_string(dataset_urn.platform)
         dataset_properties: Optional[DatasetPropertiesClass] = graph.get_aspect(
             urn, DatasetPropertiesClass
         )
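With the new DatasetRetrievalConfig argument, fetching a dataset together with its direct downstreams might look roughly like this; the server URL and dataset URN are placeholders.

from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
dataset = Dataset.from_datahub(
    graph,
    urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    config=DatasetRetrievalConfig(include_downstreams=True),
)
# dataset.downstreams holds the URNs of entities related via an incoming DownstreamOf edge.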
@@ -500,7 +918,7 @@ class Dataset(BaseModel):
             urn, StructuredPropertiesClass
         )
         if structured_properties:
-            structured_properties_map: Dict[str, List[Union[str, float]]] = {}
+            structured_properties_map: StructuredProperties = {}
             for sp in structured_properties.properties:
                 if sp.propertyUrn in structured_properties_map:
                     assert isinstance(structured_properties_map[sp.propertyUrn], list)
@@ -508,7 +926,19 @@ class Dataset(BaseModel):
                 else:
                     structured_properties_map[sp.propertyUrn] = sp.values

-        return Dataset(  # type: ignore[call-arg]
+        if config.include_downstreams:
+            related_downstreams = graph.get_related_entities(
+                urn,
+                relationship_types=[
+                    "DownstreamOf",
+                ],
+                direction=DataHubGraph.RelationshipDirection.INCOMING,
+            )
+            downstreams = [r.urn for r in related_downstreams]
+
+        return Dataset(  # type: ignore[arg-type]
+            id=dataset_urn.name,
+            platform=platform_urn.platform_name,
             urn=urn,
             description=(
                 dataset_properties.description
@@ -521,9 +951,11 @@ class Dataset(BaseModel):
                 else None
             ),
             schema=Dataset._schema_from_schema_metadata(graph, urn),
-            tags=[tag.tag for tag in tags.tags] if tags else None,
+            tags=[TagUrn(tag.tag).name for tag in tags.tags] if tags else None,
             glossary_terms=(
-                [term.urn for term in glossary_terms.terms] if glossary_terms else None
+                [GlossaryTermUrn(term.urn).name for term in glossary_terms.terms]
+                if glossary_terms
+                else None
             ),
             owners=yaml_owners,
             properties=(
@@ -533,14 +965,271 @@ class Dataset(BaseModel):
             structured_properties=(
                 structured_properties_map if structured_properties else None
             ),
+            downstreams=downstreams if config.include_downstreams else None,
         )

+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom dict method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                # Use dict method for Pydantic v1
+                                processed_field = field.dict(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+    else:
+
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            if hasattr(super(), "model_dump"):
+                result = super().model_dump(exclude=exclude, **kwargs)  # type: ignore
+            else:
+                result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom model_dump method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                processed_field = field.model_dump(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+
     def to_yaml(
         self,
         file: Path,
-    ) -> None:
+    ) -> bool:
+        """
+        Write model to YAML file only if content has changed.
+        Preserves comments and structure of the existing YAML file.
+        Returns True if file was written, False if no changes were detected.
+        """
+        # Create new model data
+        # Create new model data - choose dict() or model_dump() based on Pydantic version
+        if PYDANTIC_VERSION >= 2:
+            new_data = self.model_dump(
+                exclude_none=True, exclude_unset=True, by_alias=True
+            )
+        else:
+            new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True)
+
+        # Set up ruamel.yaml for preserving comments
+        yaml_handler = YAML(typ="rt")  # round-trip mode
+        yaml_handler.default_flow_style = False
+        yaml_handler.preserve_quotes = True  # type: ignore[assignment]
+        yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+        if file.exists():
+            try:
+                # Load existing data with comments preserved
+                with open(file, "r") as fp:
+                    existing_data = yaml_handler.load(fp)
+
+                # Determine if the file contains a list or a single document
+                if isinstance(existing_data, dict):
+                    existing_data = [existing_data]
+                    is_original_list = False
+                else:
+                    is_original_list = True
+                if isinstance(existing_data, list):
+                    # Handle list case
+                    updated = False
+                    identifier = "urn"
+                    model_id = self.urn
+
+                    if model_id is not None:
+                        # Try to find and update existing item
+                        for item in existing_data:
+                            existing_dataset = Dataset(**item)
+                            item_identifier = item.get(identifier, existing_dataset.urn)
+                            if item_identifier == model_id:
+                                # Found the item to update - preserve structure while updating values
+                                updated = True
+                                if (
+                                    existing_dataset.schema_metadata
+                                    and existing_dataset.schema_metadata.file
+                                ):
+                                    # Preserve the existing schema file path
+                                    new_data["schema"]["file"] = (
+                                        existing_dataset.schema_metadata.file
+                                    )
+                                    # Check if the content of the schema file has changed
+                                    with open(
+                                        existing_dataset.schema_metadata.file
+                                    ) as schema_fp:
+                                        schema_fp_content = schema_fp.read()
+
+                                    if (
+                                        schema_fp_content
+                                        != new_data["schema"]["raw_schema"]
+                                    ):
+                                        # If the content has changed, update the schema file
+                                        schema_file_path = Path(
+                                            existing_dataset.schema_metadata.file
+                                        )
+                                        schema_file_path.write_text(
+                                            new_data["schema"]["raw_schema"]
+                                        )
+                                # Remove raw_schema from the schema aspect before updating
+                                if "schema" in new_data:
+                                    new_data["schema"].pop("raw_schema")
+
+                                _update_dict_preserving_comments(
+                                    item, new_data, ["urn", "properties", "raw_schema"]
+                                )
+                                break
+
+                    if not updated:
+                        # Item not found, append to the list
+                        existing_data.append(new_data)
+                        updated = True
+
+                # If no update was needed, return early
+                if not updated:
+                    return False
+
+                # Write the updated data back
+                with open(file, "w") as fp:
+                    if not is_original_list:
+                        existing_data = existing_data[0]
+                    yaml_handler.dump(existing_data, fp)
+
+                return True
+
+            except Exception as e:
+                # If there's any error, we'll create a new file
+                print(
+                    f"Error processing existing file {file}: {e}. Will create a new one."
+                )
+        else:
+            # File doesn't exist or had errors - create a new one with default settings
+            yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+        file.parent.mkdir(parents=True, exist_ok=True)
+
         with open(file, "w") as fp:
-            yaml = YAML(typ="rt")  # default, if not specfied, is 'rt' (round-trip)
-            yaml.indent(mapping=2, sequence=4, offset=2)
-            yaml.default_flow_style = False
-            yaml.dump(self.dict(exclude_none=True, exclude_unset=True), fp)
+            yaml_handler.dump(new_data, fp)
+
+        return True
+
+
+def _update_dict_preserving_comments(
+    target: Dict, source: Dict, optional_fields: Optional[List[str]] = None
+) -> None:
+    """
+    Updates a target dictionary with values from source, preserving comments and structure.
+    This modifies the target dictionary in-place.
+    """
+    if optional_fields is None:
+        optional_fields = ["urn"]
+    # For each key in the source dict
+    for key, value in source.items():
+        if key in target:
+            if isinstance(value, dict) and isinstance(target[key], dict):
+                # Recursively update nested dictionaries
+                _update_dict_preserving_comments(target[key], value)
+            else:
+                # Update scalar or list values
+                # If target value is an int, and source value is a float that is equal to the int, convert to int
+                if isinstance(value, float) and int(value) == value:
+                    target[key] = int(value)
+                else:
+                    target[key] = value
+        elif key not in optional_fields:
+            # Add new keys
+            target[key] = value
+
+    # Remove keys that are in target but not in source
+    keys_to_remove = [k for k in target if k not in source]
+    for key in keys_to_remove:
+        del target[key]
+
+
+def _dict_equal(dict1: Dict, dict2: Dict, optional_keys: List[str]) -> bool:
+    """
+    Compare two dictionaries for equality, ignoring ruamel.yaml's metadata.
+    """
+
+    if len(dict1) != len(dict2):
+        # Check if the difference is only in optional keys
+        if len(dict1) > len(dict2):
+            for key in optional_keys:
+                if key in dict1 and key not in dict2:
+                    del dict1[key]
+        elif len(dict2) > len(dict1):
+            for key in optional_keys:
+                if key in dict2 and key not in dict1:
+                    del dict2[key]
+        if len(dict1) != len(dict2):
+            return False
+
+    for key, value in dict1.items():
+        if key not in dict2:
+            return False
+
+        if isinstance(value, dict) and isinstance(dict2[key], dict):
+            if not _dict_equal(value, dict2[key], optional_keys):
+                return False
+        elif isinstance(value, list) and isinstance(dict2[key], list):
+            if len(value) != len(dict2[key]):
+                return False
+
+            # Check list items (simplified for brevity)
+            for i in range(len(value)):
+                if isinstance(value[i], dict) and isinstance(dict2[key][i], dict):
+                    if not _dict_equal(value[i], dict2[key][i], optional_keys):
+                        return False
+                elif value[i] != dict2[key][i]:
+                    return False
+        elif value != dict2[key]:
+            return False
+
+    return True
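Taken together with the custom dict/model_dump methods above, the reworked to_yaml can be exercised roughly as below; per its docstring it returns True only when the YAML on disk actually changed. The file name is a placeholder.

from pathlib import Path
from datahub.api.entities.dataset.dataset import Dataset

for dataset in Dataset.from_yaml("datasets.yaml"):
    if dataset.to_yaml(Path("datasets.yaml")):
        print(f"updated {dataset.urn}")
    else:
        print(f"no changes for {dataset.urn}")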