acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py
@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:

         return self.execute_query(query=jobs_query)

-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None

-    def get_containers_for_location(
-        self, resource_id: str, path: List[str]
-    ) -> List[Dict[str, str]]:
-        containers = []
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True

-        def traverse_path(location_id: str, entity_path: List[str]) -> List:
-            nonlocal containers
-            try:
-                response = self.get(url=f"/catalog/{location_id}")
-                if (
-                    response.get("entityType")
-                    == DremioEntityContainerType.FOLDER.value.lower()
-                ):
-                    containers.append(
-                        {
-                            "id": location_id,
-                            "name": entity_path[-1],
-                            "path": entity_path[:-1],
-                            "container_type": DremioEntityContainerType.FOLDER,
-                        }
-                    )
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True

-                for container in response.get("children", []):
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
+        return False

-            except Exception as exc:
-                logging.info(
-                    "Location {} contains no tables or views. Skipping...".format(id)
-                )
-                self.report.warning(
-                    message="Failed to get tables or views",
-                    context=f"{id}",
-                    exc=exc,
-                )
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)

-        return containers
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True

-        return traverse_path(location_id=resource_id, entity_path=path)
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False

     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")

         def process_source(source):
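
Aside: the schema filtering added above rests on `_check_pattern_match` turning DataHub's dotted schema patterns into anchored regexes. A standalone sketch of that conversion, mirroring the hunk above (illustrative only, not part of the package):

import re
from typing import List


def check_pattern_match(pattern: str, paths: List[str], allow_prefix: bool = True) -> bool:
    # ".*" matches everything, mirroring the helper's short-circuit.
    if pattern == ".*":
        return True
    # Escape literal dots, then restore the escaped ".*" as a regex wildcard.
    regex = pattern.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.startswith("^"):
        regex = "^" + regex
    # Deny-style (exact) matching anchors the end unless the pattern already does.
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return any(re.match(regex, p, re.IGNORECASE) for p in paths)


# Allow patterns tolerate prefixes; deny patterns must match the full path.
assert check_pattern_match("analytics.*", ["analytics.sales.orders"])
assert not check_pattern_match("analytics", ["analytics.sales"], allow_prefix=False)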
@@ -731,34 +770,41 @@ class DremioAPIOperations:
                 )

                 source_config = source_resp.get("config", {})
-                if source_config.get("database"):
-                    db = source_config.get("database")
-                else:
-                    db = source_config.get("databaseName", "")
-
-                return {
-                    "id": source.get("id"),
-                    "name": source.get("path")[0],
-                    "path": [],
-                    "container_type": DremioEntityContainerType.SOURCE,
-                    "source_type": source_resp.get("type"),
-                    "root_path": source_config.get("rootPath"),
-                    "database_name": db,
-                }
+                db = source_config.get(
+                    "database", source_config.get("databaseName", "")
+                )
+
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SOURCE,
+                        "source_type": source_resp.get("type"),
+                        "root_path": source_config.get("rootPath"),
+                        "database_name": db,
+                    }
             else:
-                return {
-                    "id": source.get("id"),
-                    "name": source.get("path")[0],
-                    "path": [],
-                    "container_type": DremioEntityContainerType.SPACE,
-                }
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SPACE,
+                    }
+            return None

         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )
+
             return [container] + sub_containers

         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,6 +817,78 @@ class DremioAPIOperations:
         }

         for future in concurrent.futures.as_completed(future_to_source):
-            containers.extend(future.result())
+            source = future_to_source[future]
+            try:
+                containers.extend(future.result())
+            except Exception as exc:
+                logger.error(f"Error processing source: {exc}")
+                self.report.warning(
+                    message="Failed to process source",
+                    context=f"{source}",
+                    exc=exc,
+                )

         return containers
+
+    def get_context_for_vds(self, resource_id: str) -> str:
+        context_array = self.get(
+            url=f"/catalog/{resource_id}",
+        ).get("sqlContext")
+        if context_array:
+            return ".".join(
+                f'"{part}"' if "." in part else f"{part}" for part in context_array
+            )
+        else:
+            return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
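
`get_context_for_vds` rebuilds a view's default schema from its `sqlContext`, quoting only the path parts that themselves contain a dot. A quick illustration with hypothetical values:

context_array = ["my-space", "folder.v2", "reports"]
default_schema = ".".join(
    f'"{part}"' if "." in part else f"{part}" for part in context_array
)
print(default_schema)  # my-space."folder.v2".reports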
datahub/ingestion/source/dremio/dremio_aspects.py
@@ -142,6 +142,7 @@ class DremioAspects:
         platform: str,
         ui_url: str,
         env: str,
+        ingest_owner: bool,
         domain: Optional[str] = None,
         platform_instance: Optional[str] = None,
     ):
@@ -150,6 +151,7 @@
         self.env = env
         self.domain = domain
         self.ui_url = ui_url
+        self.ingest_owner = ingest_owner

     def get_container_key(
         self, name: Optional[str], path: Optional[List[str]]
@@ -426,21 +428,23 @@ class DremioAspects:
         return f'{self.ui_url}/{container_type}/{dataset_url_path}"{dataset.resource_name}"'

     def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]:
-        if not dataset.owner:
-            return None
-        owner = (
-            make_user_urn(dataset.owner)
-            if dataset.owner_type == "USER"
-            else make_group_urn(dataset.owner)
-        )
-        return OwnershipClass(
-            owners=[
-                OwnerClass(
-                    owner=owner,
-                    type=OwnershipTypeClass.TECHNICAL_OWNER,
-                )
-            ]
-        )
+        if self.ingest_owner and dataset.owner:
+            owner_urn = (
+                make_user_urn(dataset.owner)
+                if dataset.owner_type == "USER"
+                else make_group_urn(dataset.owner)
+            )
+            ownership: OwnershipClass = OwnershipClass(
+                owners=[
+                    OwnerClass(
+                        owner=owner_urn,
+                        type=OwnershipTypeClass.TECHNICAL_OWNER,
+                    )
+                ]
+            )
+            return ownership
+
+        return None

     def _create_glossary_terms(self, entity: DremioDataset) -> GlossaryTermsClass:
         return GlossaryTermsClass(
datahub/ingestion/source/dremio/dremio_config.py
@@ -174,3 +174,8 @@ class DremioSourceConfig(
         default=False,
         description="Whether to include query-based lineage information.",
     )
+
+    ingest_owner: bool = Field(
+        default=True,
+        description="Ingest Owner from source. This will override Owner info entered from UI",
+    )
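
Since `ingest_owner` defaults to True, preserving owners entered through the DataHub UI requires setting it to False explicitly. A sketch of a programmatic recipe; the connection values are placeholders, not taken from this diff:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dremio",
            "config": {
                "hostname": "dremio.example.com",  # placeholder connection details
                "ingest_owner": False,  # keep Owner info entered from the UI
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder
        },
    }
)
pipeline.run()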
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }

     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }

     FILE_OBJECT_STORAGE_TYPES = {
datahub/ingestion/source/dremio/dremio_entities.py
@@ -200,6 +200,7 @@ class DremioDataset:
     columns: List[DremioDatasetColumn]
     sql_definition: Optional[str]
     dataset_type: DremioDatasetType
+    default_schema: Optional[str]
     owner: Optional[str]
     owner_type: Optional[str]
     created: str
@@ -235,6 +236,9 @@ class DremioDataset:

         if self.sql_definition:
             self.dataset_type = DremioDatasetType.VIEW
+            self.default_schema = api_operations.get_context_for_vds(
+                resource_id=self.resource_id
+            )
         else:
             self.dataset_type = DremioDatasetType.TABLE

datahub/ingestion/source/dremio/dremio_reporting.py
@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0

     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass

+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
datahub/ingestion/source/dremio/dremio_source.py
@@ -97,6 +97,7 @@ class DremioSource(StatefulIngestionSourceBase):
     - Ownership and Glossary Terms:
         - Metadata related to ownership of datasets, extracted from Dremio’s ownership model.
         - Glossary terms and business metadata associated with datasets, providing additional context to the data.
+        - Note: Ownership information will only be available for the Cloud and Enterprise editions, it will not be available for the Community edition.

     - Optional SQL Profiling (if enabled):
         - Table, row, and column statistics can be profiled and ingested via optional SQL queries.
@@ -123,6 +124,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.dremio_aspects = DremioAspects(
             platform=self.get_platform(),
             domain=self.config.domain,
+            ingest_owner=self.config.ingest_owner,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
             ui_url=dremio_api.ui_url,
@@ -394,10 +396,12 @@ class DremioSource(StatefulIngestionSourceBase):
             ):
                 yield dremio_mcp
                 # Check if the emitted aspect is SchemaMetadataClass
-                if isinstance(dremio_mcp.metadata, SchemaMetadataClass):
+                if isinstance(
+                    dremio_mcp.metadata, MetadataChangeProposalWrapper
+                ) and isinstance(dremio_mcp.metadata.aspect, SchemaMetadataClass):
                     self.sql_parsing_aggregator.register_schema(
                         urn=dataset_urn,
-                        schema=dremio_mcp.metadata,
+                        schema=dremio_mcp.metadata.aspect,
                     )

             if dataset_info.dataset_type == DremioDatasetType.VIEW:
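
The old check could never fire for MCP-style work units: their `.metadata` is a `MetadataChangeProposalWrapper`, and the `SchemaMetadataClass` aspect sits one level down. A minimal demonstration of the corrected check (urn and field values are placeholders):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import OtherSchemaClass, SchemaMetadataClass

schema = SchemaMetadataClass(
    schemaName="example",  # placeholder values throughout
    platform="urn:li:dataPlatform:dremio",
    version=0,
    hash="",
    platformSchema=OtherSchemaClass(rawSchema=""),
    fields=[],
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dremio,space.view,PROD)",
    aspect=schema,
)

assert not isinstance(mcp, SchemaMetadataClass)  # the old comparison: always False
assert isinstance(mcp.aspect, SchemaMetadataClass)  # the aspect is nested inside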
@@ -415,6 +419,7 @@ class DremioSource(StatefulIngestionSourceBase):
                     view_urn=dataset_urn,
                     view_definition=dataset_info.sql_definition,
                     default_db=self.default_db,
+                    default_schema=dataset_info.default_schema,
                 )

             elif dataset_info.dataset_type == DremioDatasetType.TABLE:
datahub/ingestion/source/elastic_search.py
@@ -227,7 +227,7 @@ def collapse_name(name: str, collapse_urns: CollapseUrns) -> str:
 def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     if len(collapse_urns.urns_suffix_regex) == 0:
         return urn
-    urn_obj = DatasetUrn.create_from_string(urn)
+    urn_obj = DatasetUrn.from_string(urn)
     name = collapse_name(name=urn_obj.get_dataset_name(), collapse_urns=collapse_urns)
     data_platform_urn = urn_obj.get_data_platform_urn()
     return str(
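
`create_from_string` is the older spelling; `from_string` is its replacement on the URN classes and behaves the same. For example:

from datahub.utilities.urns.dataset_urn import DatasetUrn

urn_obj = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:elasticsearch,my_index-000001,PROD)"
)
print(urn_obj.get_dataset_name())       # my_index-000001
print(urn_obj.get_data_platform_urn())  # urn:li:dataPlatform:elasticsearch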
datahub/ingestion/source/feast.py
@@ -42,10 +42,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
+    GlobalTagsClass,
     MLFeaturePropertiesClass,
     MLFeatureTablePropertiesClass,
     MLPrimaryKeyPropertiesClass,
+    OwnerClass,
+    OwnershipClass,
     StatusClass,
+    TagAssociationClass,
 )

 # FIXME: ValueType module cannot be used as a type
@@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel):
     environment: str = Field(
         default=DEFAULT_ENV, description="Environment to use when constructing URNs"
     )
+    # owner_mappings example:
+    # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted
+    # owner_mappings:
+    #   - feast_owner_name: "<owner>"
+    #     datahub_owner_urn: "urn:li:corpGroup:<owner>"
+    #     datahub_ownership_type: "BUSINESS_OWNER"
+    owner_mappings: Optional[List[Dict[str, str]]] = Field(
+        default=None, description="Mapping of owner names to owner types"
+    )
+    enable_owner_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to map owners. "
+        "If this is enabled, then owner_mappings is REQUIRED to extract ownership.",
+    )
+    enable_tag_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to extract tags.",
+    )


 @platform_name("Feast")
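
Putting the three new options together, a Feast source config that extracts both tags and owners might look like this; the repo path and owner names are placeholders:

from datahub.ingestion.source.feast import FeastRepositorySourceConfig

config = FeastRepositorySourceConfig.parse_obj(
    {
        "path": "/path/to/feature_repo",  # placeholder
        "enable_tag_extraction": True,
        "enable_owner_extraction": True,
        "owner_mappings": [
            {
                "feast_owner_name": "data-platform-team",  # placeholder owner
                "datahub_owner_urn": "urn:li:corpGroup:data-platform-team",
                "datahub_ownership_type": "BUSINESS_OWNER",
            }
        ],
    }
)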
@@ -215,10 +237,15 @@ class FeastRepositorySource(Source):
         """

         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [StatusClass(removed=False)]
+            + self._get_tags(entity)
+            + self._get_owners(entity)
+        )

         entity_snapshot = MLPrimaryKeySnapshot(
             urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
-            aspects=[StatusClass(removed=False)],
+            aspects=aspects,
         )

         entity_snapshot.aspects.append(
@@ -243,10 +270,11 @@ class FeastRepositorySource(Source):
         Generate an MLFeature work unit for a Feast feature.
         """
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = [StatusClass(removed=False)] + self._get_tags(field)

         feature_snapshot = MLFeatureSnapshot(
             urn=builder.make_ml_feature_urn(feature_view_name, field.name),
-            aspects=[StatusClass(removed=False)],
+            aspects=aspects,
         )

         feature_sources = []
@@ -295,13 +323,18 @@ class FeastRepositorySource(Source):
         """

         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [
+                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
+                StatusClass(removed=False),
+            ]
+            + self._get_tags(feature_view)
+            + self._get_owners(feature_view)
+        )

         feature_view_snapshot = MLFeatureTableSnapshot(
             urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
-            aspects=[
-                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
-                StatusClass(removed=False),
-            ],
+            aspects=aspects,
         )

         feature_view_snapshot.aspects.append(
@@ -360,6 +393,64 @@ class FeastRepositorySource(Source):

         return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

+    # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is
+    # True, otherwise NO tags will be ingested
+    def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts tags from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[GlobalTagsClass]] = []
+
+        # Extract tags
+        if self.source_config.enable_tag_extraction:
+            if obj.tags.get("name"):
+                tag_name: str = obj.tags["name"]
+                tag_association = TagAssociationClass(
+                    tag=builder.make_tag_urn(tag_name)
+                )
+                global_tags_aspect = GlobalTagsClass(tags=[tag_association])
+                aspects.append(global_tags_aspect)
+
+        return aspects
+
+    # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified
+    # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested
+    def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts owners from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[OwnershipClass]] = []
+
+        # Extract owner
+        if self.source_config.enable_owner_extraction:
+            owner = getattr(obj, "owner", None)
+            if owner:
+                # Create owner association, skipping if None
+                owner_association = self._create_owner_association(owner)
+                if owner_association:  # Only add valid owner associations
+                    owners_aspect = OwnershipClass(owners=[owner_association])
+                    aspects.append(owners_aspect)
+
+        return aspects
+
+    def _create_owner_association(self, owner: str) -> Optional[OwnerClass]:
+        """
+        Create an OwnerClass instance for the given owner using the owner mappings.
+        """
+        if self.source_config.owner_mappings is not None:
+            for mapping in self.source_config.owner_mappings:
+                if mapping["feast_owner_name"] == owner:
+                    ownership_type_class: str = mapping.get(
+                        "datahub_ownership_type", "TECHNICAL_OWNER"
+                    )
+                    datahub_owner_urn = mapping.get("datahub_owner_urn")
+                    if datahub_owner_urn:
+                        return OwnerClass(
+                            owner=datahub_owner_urn,
+                            type=ownership_type_class,
+                        )
+        return None
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
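
Net effect of `_get_owners` plus `_create_owner_association`: an owner is emitted only when extraction is enabled and a mapping entry matches, and a matching entry without `datahub_owner_urn` is dropped. A standalone trace of the lookup with hypothetical values:

owner_mappings = [
    {
        "feast_owner_name": "data-platform-team",
        "datahub_owner_urn": "urn:li:corpGroup:data-platform-team",
        # datahub_ownership_type omitted -> falls back to TECHNICAL_OWNER
    }
]


def lookup(owner):
    for mapping in owner_mappings:
        if mapping["feast_owner_name"] == owner:
            ownership_type = mapping.get("datahub_ownership_type", "TECHNICAL_OWNER")
            urn = mapping.get("datahub_owner_urn")
            if urn:
                return (urn, ownership_type)
    return None  # unmapped owners are skipped entirely


print(lookup("data-platform-team"))  # ('urn:li:corpGroup:data-platform-team', 'TECHNICAL_OWNER')
print(lookup("ghost"))  # None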