acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc10__py3-none-any.whl
This diff shows the changes between the two package versions as they were published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic (see the registry listing for details).
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/METADATA +2623 -2624
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/RECORD +53 -49
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/emitter/mce_builder.py +28 -13
- datahub/ingestion/graph/client.py +15 -11
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/subtypes.py +7 -0
- datahub/ingestion/source/identity/okta.py +22 -0
- datahub/ingestion/source/metabase.py +3 -3
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +10 -4
- datahub/ingestion/source/superset.py +158 -24
- datahub/metadata/_schema_classes.py +157 -14
- datahub/metadata/_urns/urn_defs.py +82 -58
- datahub/metadata/schema.avsc +23 -10
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_shared.py +88 -3
- datahub/sdk/container.py +7 -1
- datahub/sdk/dataset.py +7 -1
- datahub/sdk/{_entity.py → entity.py} +4 -0
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +7 -1
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py +731 -42

@@ -2,11 +2,24 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import
+from typing import (
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    get_args,
+)
 
-
+import avro
+import yaml
+from pydantic import BaseModel, Field, root_validator, validator
 from ruamel.yaml import YAML
+from typing_extensions import TypeAlias
 
+import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import (
@@ -40,6 +53,16 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     UpstreamClass,
 )
+from datahub.metadata.urns import (
+    DataPlatformUrn,
+    GlossaryTermUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+    TagUrn,
+)
+from datahub.pydantic.compat import (
+    PYDANTIC_VERSION,
+)
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
@@ -47,35 +70,103 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class
+class StrictModel(BaseModel):
+    """
+    Base model with strict validation.
+    Compatible with both Pydantic v1 and v2.
+    """
+
+    if PYDANTIC_VERSION >= 2:
+        # Pydantic v2 config
+        model_config = {
+            "validate_assignment": True,
+            "extra": "forbid",
+        }
+    else:
+        # Pydantic v1 config
+        class Config:
+            validate_assignment = True
+            extra = "forbid"
+
+
+# Define type aliases for the complex types
+PropertyValue: TypeAlias = Union[float, str]
+PropertyValueList: TypeAlias = List[PropertyValue]
+StructuredProperties: TypeAlias = Dict[str, Union[PropertyValue, PropertyValueList]]
+
+
+class StructuredPropertiesHelper:
+    @staticmethod
+    def simplify_structured_properties_list(
+        structured_properties: Optional[StructuredProperties],
+    ) -> Optional[StructuredProperties]:
+        def urn_strip(urn: str) -> str:
+            if urn.startswith("urn:li:structuredProperty:"):
+                return urn[len("urn:li:structuredProperty:") :]
+            return urn
+
+        if structured_properties:
+            simplified_structured_properties = (
+                {urn_strip(k): v for k, v in structured_properties.items()}
+                if structured_properties
+                else None
+            )
+            if simplified_structured_properties:
+                # convert lists to single values if possible
+                for k, v in simplified_structured_properties.items():
+                    if isinstance(v, list):
+                        if len(v) == 1:
+                            simplified_structured_properties[k] = v[0]
+                        else:
+                            simplified_structured_properties[k] = v
+                    else:
+                        simplified_structured_properties[k] = v
+
+            return simplified_structured_properties
+        return None
+
+
+class SchemaFieldSpecification(StrictModel):
     id: Optional[str] = None
     urn: Optional[str] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     type: Optional[str] = None
     nativeDataType: Optional[str] = None
     jsonPath: Union[None, str] = None
-    nullable:
+    nullable: bool = False
     description: Union[None, str] = None
+    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
    lastModified: Optional[dict] = None
-    recursive:
+    recursive: bool = False
     globalTags: Optional[List[str]] = None
     glossaryTerms: Optional[List[str]] = None
     isPartOfKey: Optional[bool] = None
     isPartitioningKey: Optional[bool] = None
     jsonProps: Optional[dict] = None
 
+    def remove_type_metadata(self) -> "SchemaFieldSpecification":
+        """
+        Removes type metadata from the schema field specification.
+        This is useful when syncing field metadata back to yaml when
+        the type information is already present in the schema file.
+        """
+        self.type = None
+        self.nativeDataType = None
+        self.jsonPath = None
+        self.isPartitioningKey = None
+        self.isPartOfKey = None
+        self.jsonProps = None
+        return self
+
     def with_structured_properties(
-        self,
-        structured_properties: Optional[Dict[str, List[Union[str, float]]]],
+        self, structured_properties: Optional[StructuredProperties]
     ) -> "SchemaFieldSpecification":
         self.structured_properties = (
-
-
-
+            StructuredPropertiesHelper.simplify_structured_properties_list(
+                structured_properties
+            )
         )
         return self
 
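As an aside, a minimal sketch of what the StructuredPropertiesHelper added above does at runtime (the property keys and values below are invented for illustration): URN prefixes are stripped from keys and single-element value lists collapse to scalars.

```python
# Illustrative only; assumes acryl-datahub >= 1.0.0rc10 is installed.
from datahub.api.entities.dataset.dataset import StructuredPropertiesHelper

raw = {
    # hypothetical structured property keys/values
    "urn:li:structuredProperty:io.acryl.privacy.retentionTime": [90.0],
    "io.acryl.dataManagement.certifier": ["urn:li:corpuser:jdoe", "urn:li:corpuser:asmith"],
}

simplified = StructuredPropertiesHelper.simplify_structured_properties_list(raw)
# Expected, per the helper above:
#   "io.acryl.privacy.retentionTime": 90.0            (prefix stripped, singleton collapsed)
#   "io.acryl.dataManagement.certifier": [... both certifiers ...]  (multi-value list kept)
print(simplified)
```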
@@ -85,10 +176,10 @@ class SchemaFieldSpecification(BaseModel):
     ) -> "SchemaFieldSpecification":
         return SchemaFieldSpecification(
             id=Dataset._simplify_field_path(schema_field.fieldPath),
-            urn=make_schema_field_urn(
-
+            urn=make_schema_field_urn(parent_urn, schema_field.fieldPath),
+            type=SchemaFieldSpecification._from_datahub_type(
+                schema_field.type, schema_field.nativeDataType, allow_complex=True
             ),
-            type=str(schema_field.type),
             nativeDataType=schema_field.nativeDataType,
             nullable=schema_field.nullable,
             description=schema_field.description,
@@ -100,14 +191,15 @@ class SchemaFieldSpecification(BaseModel):
                 else None
             ),
             recursive=schema_field.recursive,
-            globalTags=(
-
-
-            glossaryTerms=
-
-
-
-
+            globalTags=[TagUrn(tag.tag).name for tag in schema_field.globalTags.tags]
+            if schema_field.globalTags
+            else None,
+            glossaryTerms=[
+                GlossaryTermUrn(term.urn).name
+                for term in schema_field.glossaryTerms.terms
+            ]
+            if schema_field.glossaryTerms
+            else None,
             isPartitioningKey=schema_field.isPartitioningKey,
             jsonProps=(
                 json.loads(schema_field.jsonProps) if schema_field.jsonProps else None
@@ -120,10 +212,142 @@ class SchemaFieldSpecification(BaseModel):
             raise ValueError("Either id or urn must be present")
         return v
 
+    @root_validator(pre=True)
+    def sync_description_and_doc(cls, values: Dict) -> Dict:
+        """Synchronize doc and description fields if one is provided but not the other."""
+        description = values.get("description")
+        doc = values.get("doc")
+
+        if description is not None and doc is None:
+            values["doc"] = description
+        elif doc is not None and description is None:
+            values["description"] = doc
+
+        return values
+
+    def get_datahub_type(self) -> models.SchemaFieldDataTypeClass:
+        PrimitiveType = Literal[
+            "string",
+            "number",
+            "int",
+            "long",
+            "float",
+            "double",
+            "boolean",
+            "bytes",
+            "fixed",
+        ]
+        type = self.type.lower() if self.type else self.type
+        if type not in set(get_args(PrimitiveType)):
+            raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+        if type == "string":
+            return models.SchemaFieldDataTypeClass(type=models.StringTypeClass())
+        elif type in ["number", "long", "float", "double", "int"]:
+            return models.SchemaFieldDataTypeClass(type=models.NumberTypeClass())
+        elif type == "fixed":
+            return models.SchemaFieldDataTypeClass(type=models.FixedTypeClass())
+        elif type == "bytes":
+            return models.SchemaFieldDataTypeClass(type=models.BytesTypeClass())
+        elif type == "boolean":
+            return models.SchemaFieldDataTypeClass(type=models.BooleanTypeClass())
+
+        raise ValueError(f"Type {self.type} is not a valid primitive type")
+
+    @staticmethod
+    def _from_datahub_type(
+        input_type: models.SchemaFieldDataTypeClass,
+        native_data_type: str,
+        allow_complex: bool = False,
+    ) -> str:
+        if isinstance(input_type.type, models.StringTypeClass):
+            return "string"
+        elif isinstance(input_type.type, models.NumberTypeClass):
+            if native_data_type in ["long", "float", "double", "int"]:
+                return native_data_type
+            return "number"
+        elif isinstance(input_type.type, models.FixedTypeClass):
+            return "fixed"
+        elif isinstance(input_type.type, models.BytesTypeClass):
+            return "bytes"
+        elif isinstance(input_type.type, models.BooleanTypeClass):
+            return "boolean"
+        elif allow_complex and isinstance(input_type.type, models.ArrayTypeClass):
+            return "array"
+        elif allow_complex and isinstance(input_type.type, models.MapTypeClass):
+            return "map"
+        elif allow_complex and isinstance(input_type.type, models.UnionTypeClass):
+            return "union"
+        elif allow_complex:
+            return "record"
+        raise ValueError(f"Type {input_type} is not a valid primitive type")
+
+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            kwargs.pop("exclude_defaults", None)
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+
+            return super().dict(exclude=exclude, exclude_defaults=True, **kwargs)
+
+    else:
+        # For v2, implement model_dump with similar logic as dict
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", None) or set()
+
+            # If description and doc are identical, exclude doc from the output
+            if self.description == self.doc and self.description is not None:
+                exclude.add("doc")
+
+            # if nativeDataType and type are identical, exclude nativeDataType from the output
+            if self.nativeDataType == self.type and self.nativeDataType is not None:
+                exclude.add("nativeDataType")
+
+            # if the id is the same as the urn's fieldPath, exclude id from the output
+            if self.urn:
+                field_urn = SchemaFieldUrn.from_string(self.urn)
+                if Dataset._simplify_field_path(field_urn.field_path) == self.id:
+                    exclude.add("urn")
+
+            self.structured_properties = (
+                StructuredPropertiesHelper.simplify_structured_properties_list(
+                    self.structured_properties
+                )
+            )
+            if hasattr(super(), "model_dump"):
+                return super().model_dump(  # type: ignore
+                    exclude=exclude, exclude_defaults=True, **kwargs
+                )
+
 
 class SchemaSpecification(BaseModel):
     file: Optional[str] = None
     fields: Optional[List[SchemaFieldSpecification]] = None
+    raw_schema: Optional[str] = None
 
     @validator("file")
     def file_must_be_avsc(cls, v):
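A short, hypothetical illustration of two of the SchemaFieldSpecification additions above: the pre-validator that keeps doc and description in sync, and get_datahub_type(), which maps the YAML-level primitive names onto DataHub's type classes. The field id and doc text are made up.

```python
from datahub.api.entities.dataset.dataset import SchemaFieldSpecification

field = SchemaFieldSpecification(id="event_ts", type="long", doc="Event time in epoch millis")

# The root validator mirrors doc into description (and vice versa) when only one is set.
assert field.description == "Event time in epoch millis"

# "long" (like int/float/double/number) resolves to a NumberTypeClass-backed
# SchemaFieldDataTypeClass; "string", "boolean", "bytes" and "fixed" map analogously.
print(field.get_datahub_type())
```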
@@ -143,12 +367,16 @@ class Ownership(ConfigModel):
 
 
 class StructuredPropertyValue(ConfigModel):
-    value: Union[str, float, List[str], List[float]]
+    value: Union[str, int, float, List[str], List[int], List[float]]
     created: Optional[str] = None
     lastModified: Optional[str] = None
 
 
-class
+class DatasetRetrievalConfig(BaseModel):
+    include_downstreams: Optional[bool] = False
+
+
+class Dataset(StrictModel):
     id: Optional[str] = None
     platform: Optional[str] = None
     env: str = "PROD"
@@ -163,9 +391,7 @@ class Dataset(BaseModel):
     tags: Optional[List[str]] = None
     glossary_terms: Optional[List[str]] = None
     owners: Optional[List[Union[str, Ownership]]] = None
-    structured_properties: Optional[
-        Dict[str, Union[str, float, List[Union[str, float]]]]
-    ] = None
+    structured_properties: Optional[StructuredProperties] = None
     external_url: Optional[str] = None
 
     @property
@@ -199,6 +425,10 @@ class Dataset(BaseModel):
             return v[len("urn:li:dataPlatform:") :]
         return v
 
+    @validator("structured_properties")
+    def simplify_structured_properties(cls, v):
+        return StructuredPropertiesHelper.simplify_structured_properties_list(v)
+
     def _mint_auditstamp(self, message: str) -> AuditStampClass:
         return AuditStampClass(
             time=int(time.time() * 1000.0),
@@ -221,6 +451,14 @@ class Dataset(BaseModel):
             typeUrn=ownership_type_urn,
         )
 
+    @staticmethod
+    def get_patch_builder(urn: str) -> DatasetPatchBuilder:
+        return DatasetPatchBuilder(urn)
+
+    def patch_builder(self) -> DatasetPatchBuilder:
+        assert self.urn
+        return DatasetPatchBuilder(self.urn)
+
     @classmethod
     def from_yaml(cls, file: str) -> Iterable["Dataset"]:
         with open(file) as fp:
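For orientation, a hedged sketch of how the new patch-builder accessors might be used. It assumes DatasetPatchBuilder's existing add_tag()/build() API from datahub.specific.dataset; the dataset urn and tag below are placeholders.

```python
from datahub.api.entities.dataset.dataset import Dataset
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.metadata.schema_classes import TagAssociationClass

urn = make_dataset_urn(platform="hive", name="logging_events", env="PROD")

# Static form: build a patch from a bare urn, no Dataset instance required.
patch_builder = Dataset.get_patch_builder(urn)
patch_builder.add_tag(TagAssociationClass(tag=make_tag_urn("tier-gold")))

for mcp in patch_builder.build():
    print(mcp.entityUrn, mcp.aspectName)

# Instance form: Dataset(...).patch_builder() asserts that the urn has already been minted.
```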
@@ -230,9 +468,45 @@ class Dataset(BaseModel):
                 datasets = [datasets]
             for dataset_raw in datasets:
                 dataset = Dataset.parse_obj(dataset_raw)
+                # dataset = Dataset.model_validate(dataset_raw, strict=True)
                 yield dataset
 
-    def
+    def entity_references(self) -> List[str]:
+        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
+        references = []
+        if self.schema_metadata:
+            if self.schema_metadata.fields:
+                for field in self.schema_metadata.fields:
+                    if field.structured_properties:
+                        references.extend(
+                            [
+                                f"{urn_prefix}:{prop_key}"
+                                if not prop_key.startswith(urn_prefix)
+                                else prop_key
+                                for prop_key in field.structured_properties.keys()
+                            ]
+                        )
+                    if field.glossaryTerms:
+                        references.extend(
+                            [make_term_urn(term) for term in field.glossaryTerms]
+                        )
+                    # We don't check references for tags
+        if self.structured_properties:
+            references.extend(
+                [
+                    f"{urn_prefix}:{prop_key}"
+                    if not prop_key.startswith(urn_prefix)
+                    else prop_key
+                    for prop_key in self.structured_properties.keys()
+                ]
+            )
+        if self.glossary_terms:
+            references.extend([make_term_urn(term) for term in self.glossary_terms])
+
+        # We don't check references for tags
+        return list(set(references))
+
+    def generate_mcp(  # noqa: C901
         self,
     ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]:
         mcp = MetadataChangeProposalWrapper(
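A small, self-contained illustration of the new entity_references() helper added above (the dataset, term and property names are invented). It collects the glossary-term and structured-property URNs a spec refers to, expanding bare property keys to full urn:li:structuredProperty:... form.

```python
from datahub.api.entities.dataset.dataset import Dataset

dataset = Dataset(
    id="pet_profiles",
    platform="snowflake",
    glossary_terms=["Classification.PII"],
    structured_properties={"io.acryl.privacy.retentionTime": 90},
)

for ref in sorted(dataset.entity_references()):
    print(ref)
# urn:li:glossaryTerm:Classification.PII
# urn:li:structuredProperty:io.acryl.privacy.retentionTime
```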
@@ -247,9 +521,12 @@ class Dataset(BaseModel):
             yield mcp
 
         if self.schema_metadata:
+            schema_fields = set()
             if self.schema_metadata.file:
                 with open(self.schema_metadata.file) as schema_fp:
                     schema_string = schema_fp.read()
+                    schema_fields_list = avro_schema_to_mce_fields(schema_string)
+                    schema_fields = {field.fieldPath for field in schema_fields_list}
                     schema_metadata = SchemaMetadataClass(
                         schemaName=self.name or self.id or self.urn or "",
                         platform=self.platform_urn,
@@ -264,7 +541,102 @@ class Dataset(BaseModel):
                 yield mcp
 
             if self.schema_metadata.fields:
+                field_type_info_present = any(
+                    field.type for field in self.schema_metadata.fields
+                )
+                all_fields_type_info_present = all(
+                    field.type for field in self.schema_metadata.fields
+                )
+                if field_type_info_present and not all_fields_type_info_present:
+                    raise ValueError(
+                        "Either all fields must have type information or none of them should"
+                    )
+
+                if all_fields_type_info_present:
+                    update_technical_schema = True
+                else:
+                    update_technical_schema = False
+                if update_technical_schema and not self.schema_metadata.file:
+                    # We produce a schema metadata aspect only if we have type information
+                    # and a schema file is not provided.
+                    schema_metadata = SchemaMetadataClass(
+                        schemaName=self.name or self.id or self.urn or "",
+                        platform=self.platform_urn,
+                        version=0,
+                        hash="",
+                        fields=[
+                            SchemaFieldClass(
+                                fieldPath=field.id,  # type: ignore[arg-type]
+                                type=field.get_datahub_type(),
+                                nativeDataType=field.nativeDataType or field.type,  # type: ignore[arg-type]
+                                nullable=field.nullable,
+                                description=field.description,
+                                label=field.label,
+                                created=None,  # This should be auto-populated.
+                                lastModified=None,  # This should be auto-populated.
+                                recursive=field.recursive,
+                                globalTags=GlobalTagsClass(
+                                    tags=[
+                                        TagAssociationClass(tag=make_tag_urn(tag))
+                                        for tag in field.globalTags
+                                    ]
+                                )
+                                if field.globalTags is not None
+                                else None,
+                                glossaryTerms=GlossaryTermsClass(
+                                    terms=[
+                                        GlossaryTermAssociationClass(
+                                            urn=make_term_urn(term)
+                                        )
+                                        for term in field.glossaryTerms
+                                    ],
+                                    auditStamp=self._mint_auditstamp("yaml"),
+                                )
+                                if field.glossaryTerms is not None
+                                else None,
+                                isPartOfKey=field.isPartOfKey,
+                                isPartitioningKey=field.isPartitioningKey,
+                                jsonProps=json.dumps(field.jsonProps)
+                                if field.jsonProps is not None
+                                else None,
+                            )
+                            for field in self.schema_metadata.fields
+                        ],
+                        platformSchema=OtherSchemaClass(
+                            rawSchema=yaml.dump(
+                                self.schema_metadata.dict(
+                                    exclude_none=True, exclude_unset=True
+                                )
+                            )
+                        ),
+                    )
+                    mcp = MetadataChangeProposalWrapper(
+                        entityUrn=self.urn, aspect=schema_metadata
+                    )
+                    yield mcp
+
                 for field in self.schema_metadata.fields:
+                    if schema_fields:
+                        # search for the field in the schema fields set
+                        matched_fields = [
+                            schema_field
+                            for schema_field in schema_fields
+                            if field.id == schema_field
+                            or field.id == Dataset._simplify_field_path(schema_field)
+                        ]
+                        if not matched_fields:
+                            raise ValueError(
+                                f"Field {field.id} not found in the schema file"
+                            )
+                        if len(matched_fields) > 1:
+                            raise ValueError(
+                                f"Field {field.id} matches multiple entries {matched_fields}in the schema file. Use the fully qualified field path."
+                            )
+                        assert len(matched_fields) == 1
+                        assert (
+                            self.urn is not None
+                        )  # validator should have filled this in
+                        field.urn = make_schema_field_urn(self.urn, matched_fields[0])
                     field_urn = field.urn or make_schema_field_urn(
                         self.urn,  # type: ignore[arg-type]
                         field.id,  # type: ignore[arg-type]
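To make the behaviour added above concrete, here is a hedged sketch with invented identifiers. It also assumes the model exposes schema_metadata under the `schema` alias, which the serialization code elsewhere in this file relies on. When every field carries a type and no schema file is referenced, generate_mcp() now emits a schemaMetadata aspect built from the YAML-level field specs.

```python
from datahub.api.entities.dataset.dataset import (
    Dataset,
    SchemaFieldSpecification,
    SchemaSpecification,
)

dataset = Dataset(
    id="user_clicks",
    platform="hive",
    schema=SchemaSpecification(
        fields=[
            SchemaFieldSpecification(id="user_id", type="string", doc="Clicking user"),
            SchemaFieldSpecification(id="ts", type="long", doc="Click time, epoch millis"),
        ]
    ),
)

aspect_names = [mcp.aspectName for mcp in dataset.generate_mcp()]
print(aspect_names)  # expected to include "schemaMetadata" alongside the other aspects
```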
@@ -299,12 +671,15 @@ class Dataset(BaseModel):
                         yield mcp
 
                     if field.structured_properties:
+                        urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
                         mcp = MetadataChangeProposalWrapper(
                             entityUrn=field_urn,
                             aspect=StructuredPropertiesClass(
                                 properties=[
                                     StructuredPropertyValueAssignmentClass(
-                                        propertyUrn=f"
+                                        propertyUrn=f"{urn_prefix}:{prop_key}"
+                                        if not prop_key.startswith(urn_prefix)
+                                        else prop_key,
                                         values=(
                                             prop_value
                                             if isinstance(prop_value, list)
@@ -403,6 +778,10 @@ class Dataset(BaseModel):
 
     @staticmethod
     def _simplify_field_path(field_path: str) -> str:
+        # field paths with [type=array] or [type=map] or [type=union] should never be simplified
+        for type in ["array", "map", "union"]:
+            if f"[type={type}]" in field_path:
+                return field_path
         if field_path.startswith("[version=2.0]"):
             # v2 field path
             field_components = []
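A tiny illustration of the guard added above (expected values are shown as comments; the exact simplification of plain v2 paths is handled by the rest of this method): field paths that contain array, map or union markers are now returned unchanged.

```python
from datahub.api.entities.dataset.dataset import Dataset

# A complex v2 path is left untouched because it contains [type=array].
print(
    Dataset._simplify_field_path(
        "[version=2.0].[type=struct].[type=array].[type=string].emails"
    )
)

# A simple v2 path is still reduced by the existing logic (expected: "address").
print(Dataset._simplify_field_path("[version=2.0].[type=struct].[type=string].address"))
```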
@@ -434,7 +813,26 @@ class Dataset(BaseModel):
         )
 
         if schema_metadata:
+            # If the schema is built off of an avro schema, we only extract the fields if they have structured properties
+            # Otherwise, we extract all fields
+            if (
+                schema_metadata.platformSchema
+                and isinstance(schema_metadata.platformSchema, models.OtherSchemaClass)
+                and schema_metadata.platformSchema.rawSchema
+            ):
+                try:
+                    maybe_avro_schema = avro.schema.parse(
+                        schema_metadata.platformSchema.rawSchema
+                    )
+                    schema_fields = avro_schema_to_mce_fields(maybe_avro_schema)
+                except Exception as e:
+                    logger.debug("Failed to parse avro schema: %s", e)
+                    schema_fields = []
+
             schema_specification = SchemaSpecification(
+                raw_schema=schema_metadata.platformSchema.rawSchema
+                if hasattr(schema_metadata.platformSchema, "rawSchema")
+                else None,
                 fields=[
                     SchemaFieldSpecification.from_schema_field(
                         field, urn
@@ -462,8 +860,21 @@ class Dataset(BaseModel):
                     )
                     for field in schema_metadata.fields
                 ]
-            ]
+                ],
             )
+            if schema_fields and schema_specification.fields:
+                # Source was an avro schema, so we only include fields with structured properties, tags or glossary terms
+                schema_specification.fields = [
+                    field.remove_type_metadata()
+                    for field in schema_specification.fields
+                    if field.structured_properties
+                    or field.globalTags
+                    or field.glossaryTerms
+                ]
+                if (
+                    not schema_specification.fields
+                ):  # set fields to None if there are no fields after filtering
+                    schema_specification.fields = None
             return schema_specification
         else:
             return None
@@ -485,7 +896,14 @@ class Dataset(BaseModel):
         return yaml_owners
 
     @classmethod
-    def from_datahub(
+    def from_datahub(
+        cls,
+        graph: DataHubGraph,
+        urn: str,
+        config: DatasetRetrievalConfig = DatasetRetrievalConfig(),
+    ) -> "Dataset":
+        dataset_urn = DatasetUrn.from_string(urn)
+        platform_urn = DataPlatformUrn.from_string(dataset_urn.platform)
         dataset_properties: Optional[DatasetPropertiesClass] = graph.get_aspect(
             urn, DatasetPropertiesClass
         )
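A hedged usage sketch for the new from_datahub() signature and DatasetRetrievalConfig. The server address, dataset urn and the assumption of a reachable DataHub instance are placeholders for illustration.

```python
from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"

dataset = Dataset.from_datahub(
    graph,
    urn,
    config=DatasetRetrievalConfig(include_downstreams=True),
)
print(dataset.downstreams)  # urns of datasets with a DownstreamOf edge pointing here
```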
@@ -500,7 +918,7 @@ class Dataset(BaseModel):
             urn, StructuredPropertiesClass
         )
         if structured_properties:
-            structured_properties_map:
+            structured_properties_map: StructuredProperties = {}
             for sp in structured_properties.properties:
                 if sp.propertyUrn in structured_properties_map:
                     assert isinstance(structured_properties_map[sp.propertyUrn], list)
@@ -508,7 +926,19 @@ class Dataset(BaseModel):
                 else:
                     structured_properties_map[sp.propertyUrn] = sp.values
 
-
+        if config.include_downstreams:
+            related_downstreams = graph.get_related_entities(
+                urn,
+                relationship_types=[
+                    "DownstreamOf",
+                ],
+                direction=DataHubGraph.RelationshipDirection.INCOMING,
+            )
+            downstreams = [r.urn for r in related_downstreams]
+
+        return Dataset(  # type: ignore[arg-type]
+            id=dataset_urn.name,
+            platform=platform_urn.platform_name,
             urn=urn,
             description=(
                 dataset_properties.description
@@ -521,9 +951,11 @@ class Dataset(BaseModel):
                 else None
             ),
             schema=Dataset._schema_from_schema_metadata(graph, urn),
-            tags=[tag.tag for tag in tags.tags] if tags else None,
+            tags=[TagUrn(tag.tag).name for tag in tags.tags] if tags else None,
             glossary_terms=(
-                [term.urn for term in glossary_terms.terms]
+                [GlossaryTermUrn(term.urn).name for term in glossary_terms.terms]
+                if glossary_terms
+                else None
             ),
             owners=yaml_owners,
             properties=(
@@ -533,14 +965,271 @@ class Dataset(BaseModel):
             structured_properties=(
                 structured_properties_map if structured_properties else None
             ),
+            downstreams=downstreams if config.include_downstreams else None,
         )
 
+    if PYDANTIC_VERSION < 2:
+
+        def dict(self, **kwargs):
+            """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom dict method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                # Use dict method for Pydantic v1
+                                processed_field = field.dict(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+    else:
+
+        def model_dump(self, **kwargs):
+            """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
+            exclude = kwargs.pop("exclude", set())
+
+            # If id and name are identical, exclude name from the output
+            if self.id == self.name and self.id is not None:
+                exclude.add("name")
+
+            # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
+            if self.subtypes and len(self.subtypes) == 1:
+                self.subtype = self.subtypes[0]
+                exclude.add("subtypes")
+
+            if hasattr(super(), "model_dump"):
+                result = super().model_dump(exclude=exclude, **kwargs)  # type: ignore
+            else:
+                result = super().dict(exclude=exclude, **kwargs)
+
+            # Custom handling for schema_metadata/schema
+            if self.schema_metadata and "schema" in result:
+                schema_data = result["schema"]
+
+                # Handle fields if they exist
+                if "fields" in schema_data and isinstance(schema_data["fields"], list):
+                    # Process each field using its custom model_dump method
+                    processed_fields = []
+                    if self.schema_metadata and self.schema_metadata.fields:
+                        for field in self.schema_metadata.fields:
+                            if field:
+                                processed_field = field.model_dump(**kwargs)
+                                processed_fields.append(processed_field)
+
+                    # Replace the fields in the result with the processed ones
+                    schema_data["fields"] = processed_fields
+
+            return result
+
     def to_yaml(
         self,
         file: Path,
-    ) ->
+    ) -> bool:
+        """
+        Write model to YAML file only if content has changed.
+        Preserves comments and structure of the existing YAML file.
+        Returns True if file was written, False if no changes were detected.
+        """
+        # Create new model data
+        # Create new model data - choose dict() or model_dump() based on Pydantic version
+        if PYDANTIC_VERSION >= 2:
+            new_data = self.model_dump(
+                exclude_none=True, exclude_unset=True, by_alias=True
+            )
+        else:
+            new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True)
+
+        # Set up ruamel.yaml for preserving comments
+        yaml_handler = YAML(typ="rt")  # round-trip mode
+        yaml_handler.default_flow_style = False
+        yaml_handler.preserve_quotes = True  # type: ignore[assignment]
+        yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+        if file.exists():
+            try:
+                # Load existing data with comments preserved
+                with open(file, "r") as fp:
+                    existing_data = yaml_handler.load(fp)
+
+                # Determine if the file contains a list or a single document
+                if isinstance(existing_data, dict):
+                    existing_data = [existing_data]
+                    is_original_list = False
+                else:
+                    is_original_list = True
+                if isinstance(existing_data, list):
+                    # Handle list case
+                    updated = False
+                    identifier = "urn"
+                    model_id = self.urn
+
+                    if model_id is not None:
+                        # Try to find and update existing item
+                        for item in existing_data:
+                            existing_dataset = Dataset(**item)
+                            item_identifier = item.get(identifier, existing_dataset.urn)
+                            if item_identifier == model_id:
+                                # Found the item to update - preserve structure while updating values
+                                updated = True
+                                if (
+                                    existing_dataset.schema_metadata
+                                    and existing_dataset.schema_metadata.file
+                                ):
+                                    # Preserve the existing schema file path
+                                    new_data["schema"]["file"] = (
+                                        existing_dataset.schema_metadata.file
+                                    )
+                                    # Check if the content of the schema file has changed
+                                    with open(
+                                        existing_dataset.schema_metadata.file
+                                    ) as schema_fp:
+                                        schema_fp_content = schema_fp.read()
+
+                                    if (
+                                        schema_fp_content
+                                        != new_data["schema"]["raw_schema"]
+                                    ):
+                                        # If the content has changed, update the schema file
+                                        schema_file_path = Path(
+                                            existing_dataset.schema_metadata.file
+                                        )
+                                        schema_file_path.write_text(
+                                            new_data["schema"]["raw_schema"]
+                                        )
+                                # Remove raw_schema from the schema aspect before updating
+                                if "schema" in new_data:
+                                    new_data["schema"].pop("raw_schema")
+
+                                _update_dict_preserving_comments(
+                                    item, new_data, ["urn", "properties", "raw_schema"]
+                                )
+                                break
+
+                    if not updated:
+                        # Item not found, append to the list
+                        existing_data.append(new_data)
+                        updated = True
+
+                    # If no update was needed, return early
+                    if not updated:
+                        return False
+
+                    # Write the updated data back
+                    with open(file, "w") as fp:
+                        if not is_original_list:
+                            existing_data = existing_data[0]
+                        yaml_handler.dump(existing_data, fp)
+
+                    return True
+
+            except Exception as e:
+                # If there's any error, we'll create a new file
+                print(
+                    f"Error processing existing file {file}: {e}. Will create a new one."
+                )
+        else:
+            # File doesn't exist or had errors - create a new one with default settings
+            yaml_handler.indent(mapping=2, sequence=2, offset=0)
+
+            file.parent.mkdir(parents=True, exist_ok=True)
+
         with open(file, "w") as fp:
-
-
-
-
+            yaml_handler.dump(new_data, fp)
+
+        return True
+
+
+def _update_dict_preserving_comments(
+    target: Dict, source: Dict, optional_fields: Optional[List[str]] = None
+) -> None:
+    """
+    Updates a target dictionary with values from source, preserving comments and structure.
+    This modifies the target dictionary in-place.
+    """
+    if optional_fields is None:
+        optional_fields = ["urn"]
+    # For each key in the source dict
+    for key, value in source.items():
+        if key in target:
+            if isinstance(value, dict) and isinstance(target[key], dict):
+                # Recursively update nested dictionaries
+                _update_dict_preserving_comments(target[key], value)
+            else:
+                # Update scalar or list values
+                # If target value is an int, and source value is a float that is equal to the int, convert to int
+                if isinstance(value, float) and int(value) == value:
+                    target[key] = int(value)
+                else:
+                    target[key] = value
+        elif key not in optional_fields:
+            # Add new keys
+            target[key] = value
+
+    # Remove keys that are in target but not in source
+    keys_to_remove = [k for k in target if k not in source]
+    for key in keys_to_remove:
+        del target[key]
+
+
+def _dict_equal(dict1: Dict, dict2: Dict, optional_keys: List[str]) -> bool:
+    """
+    Compare two dictionaries for equality, ignoring ruamel.yaml's metadata.
+    """
+
+    if len(dict1) != len(dict2):
+        # Check if the difference is only in optional keys
+        if len(dict1) > len(dict2):
+            for key in optional_keys:
+                if key in dict1 and key not in dict2:
+                    del dict1[key]
+        elif len(dict2) > len(dict1):
+            for key in optional_keys:
+                if key in dict2 and key not in dict1:
+                    del dict2[key]
+        if len(dict1) != len(dict2):
+            return False
+
+    for key, value in dict1.items():
+        if key not in dict2:
+            return False
+
+        if isinstance(value, dict) and isinstance(dict2[key], dict):
+            if not _dict_equal(value, dict2[key], optional_keys):
+                return False
+        elif isinstance(value, list) and isinstance(dict2[key], list):
+            if len(value) != len(dict2[key]):
+                return False
+
+            # Check list items (simplified for brevity)
+            for i in range(len(value)):
+                if isinstance(value[i], dict) and isinstance(dict2[key][i], dict):
+                    if not _dict_equal(value[i], dict2[key][i], optional_keys):
+                        return False
+                elif value[i] != dict2[key][i]:
+                    return False
+        elif value != dict2[key]:
+            return False
+
+    return True