acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (58)
  1. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/METADATA +2460 -2460
  2. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/RECORD +58 -54
  3. datahub/__init__.py +1 -1
  4. datahub/cli/delete_cli.py +3 -3
  5. datahub/cli/migrate.py +2 -2
  6. datahub/emitter/mcp_builder.py +27 -0
  7. datahub/emitter/rest_emitter.py +1 -1
  8. datahub/ingestion/api/source.py +2 -2
  9. datahub/ingestion/graph/client.py +4 -2
  10. datahub/ingestion/source/aws/glue.py +14 -1
  11. datahub/ingestion/source/aws/s3_util.py +24 -1
  12. datahub/ingestion/source/delta_lake/source.py +0 -5
  13. datahub/ingestion/source/demo_data.py +1 -1
  14. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  15. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  16. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  17. datahub/ingestion/source/iceberg/iceberg.py +10 -3
  18. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  19. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  20. datahub/ingestion/source/kafka_connect/kafka_connect.py +1 -6
  21. datahub/ingestion/source/metabase.py +1 -6
  22. datahub/ingestion/source/mlflow.py +0 -5
  23. datahub/ingestion/source/nifi.py +0 -5
  24. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  25. datahub/ingestion/source/redash.py +0 -5
  26. datahub/ingestion/source/redshift/redshift.py +1 -0
  27. datahub/ingestion/source/s3/source.py +10 -14
  28. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +5 -2
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -20
  31. datahub/ingestion/source/snowflake/snowflake_tag.py +14 -4
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +0 -6
  33. datahub/ingestion/source/sql/sql_types.py +1 -1
  34. datahub/ingestion/source/sql/sql_utils.py +5 -0
  35. datahub/ingestion/source/superset.py +1 -6
  36. datahub/ingestion/source/tableau/tableau.py +0 -6
  37. datahub/metadata/_schema_classes.py +316 -43
  38. datahub/metadata/_urns/urn_defs.py +69 -15
  39. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  40. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  42. datahub/metadata/schema.avsc +296 -87
  43. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  44. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  45. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  46. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  47. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  48. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  49. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  50. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  51. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  52. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  53. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  54. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  55. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  56. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/WHEEL +0 -0
  57. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/entry_points.txt +0 -0
  58. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/fivetran/fivetran.py
@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
         dpi = self._generate_dpi_from_job(job, datajob)
         yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),

datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-        30,
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
            logger.info(f"ergc({self.instance_id}): max runtime reached.")
            return True
        return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
             )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
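
The cleanup loop now stops when either the runtime limit or the new delete cap is hit. A minimal sketch (not part of this diff) of how the new cap might be configured, using only field names visible in the hunks above; all other fields are assumed to keep their defaults:

    # Hypothetical configuration sketch for the GC execution-request cleanup.
    # Field names come from the diff above; values are illustrative.
    from datahub.ingestion.source.gc.execution_request_cleanup import (
        DatahubExecutionRequestCleanupConfig,
    )

    config = DatahubExecutionRequestCleanupConfig(
        keep_history_max_days=90,    # new default in this release
        limit_entities_delete=5000,  # stop after hard-deleting 5000 execution requests
    )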

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                     "types": [entity_type],
                     "query": "*",
                     "scrollId": scroll_id if scroll_id else None,
-                    "count": self.config.batch_size,
+                    "count": batch_size,
                     "orFilters": [
                         {
                             "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):

datahub/ingestion/source/iceberg/iceberg.py
@@ -203,7 +203,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             with PerfTimer() as timer:
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
-                self.report.report_table_load_time(time_taken)
+                self.report.report_table_load_time(
+                    time_taken, dataset_name, table.metadata_location
+                )
             LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
             yield from self._create_iceberg_workunit(dataset_name, table)
         except NoSuchPropertyException as e:
@@ -247,7 +249,10 @@ class IcebergSource(StatefulIngestionSourceBase):
                 f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
             )
         except Exception as e:
-            self.report.report_failure("general", f"Failed to create workunit: {e}")
+            self.report.report_failure(
+                "general",
+                f"Failed to create workunit for dataset {dataset_name}: {e}",
+            )
             LOGGER.exception(
                 f"Exception while processing table {dataset_path}, skipping it.",
             )
@@ -312,7 +317,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         dataset_snapshot.aspects.append(schema_metadata)
 
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        self.report.report_table_processing_time(timer.elapsed_seconds())
+        self.report.report_table_processing_time(
+            timer.elapsed_seconds(), dataset_name, table.metadata_location
+        )
         yield MetadataWorkUnit(id=dataset_name, mce=mce)
 
         dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -146,19 +147,40 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         return load_catalog(name=catalog_name, **catalog_config)
 
 
+class TopTableTimings:
+    _VALUE_FIELD: str = "timing"
+    top_entites: SortedList
+    _size: int
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+    def add(self, entity: Dict[str, Any]) -> None:
+        if self._VALUE_FIELD not in entity:
+            return
+        self.top_entites.add(entity)
+        if len(self.top_entites) > self._size:
+            self.top_entites.pop()
+
+    def __str__(self) -> str:
+        if len(self.top_entites) == 0:
+            return "no timings reported"
+        return str(list(self.top_entites))
+
+
 class TimingClass:
-    times: List[int]
+    times: SortedList
 
     def __init__(self):
-        self.times = []
+        self.times = SortedList()
 
-    def add_timing(self, t):
-        self.times.append(t)
+    def add_timing(self, t: float) -> None:
+        self.times.add(t)
 
-    def __str__(self):
+    def __str__(self) -> str:
         if len(self.times) == 0:
             return "no timings reported"
-        self.times.sort()
         total = sum(self.times)
         avg = total / len(self.times)
         return str(
@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+    tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
     listed_namespaces: int = 0
     total_listed_tables: int = 0
     tables_listed_per_namespace: TopKDict[str, int] = field(
@@ -201,11 +226,26 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
 
-    def report_table_load_time(self, t: float) -> None:
+    def report_table_load_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.load_table_timings.add_timing(t)
+        self.tables_load_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_processing_time(self, t: float) -> None:
+    def report_table_processing_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.processing_table_timings.add_timing(t)
+        self.tables_process_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_profiling_time(self, t: float) -> None:
+    def report_table_profiling_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
        self.profiling_table_timings.add_timing(t)
+        self.tables_profile_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
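
The new TopTableTimings keeps only the slowest entries: the SortedList is ordered by descending "timing", and once more than `size` records have been added, the entry with the smallest timing is popped. A small illustration (table names and paths are made up):

    # Illustrative use of the TopTableTimings class added above.
    timings = TopTableTimings(size=2)
    timings.add({"table": "db.sch.slow", "timing": 12.4, "metadata_file": "s3://bkt/slow/metadata.json"})
    timings.add({"table": "db.sch.fast", "timing": 0.3, "metadata_file": "s3://bkt/fast/metadata.json"})
    timings.add({"table": "db.sch.mid", "timing": 4.1, "metadata_file": "s3://bkt/mid/metadata.json"})
    print(timings)  # only the two slowest tables remain in the report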

datahub/ingestion/source/iceberg/iceberg_profiler.py
@@ -204,7 +204,9 @@ class IcebergProfiler:
                 )
             dataset_profile.fieldProfiles.append(column_profile)
         time_taken = timer.elapsed_seconds()
-        self.report.report_table_profiling_time(time_taken)
+        self.report.report_table_profiling_time(
+            time_taken, dataset_name, table.metadata_location
+        )
         LOGGER.debug(
             f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )

datahub/ingestion/source/kafka_connect/kafka_connect.py
@@ -17,7 +17,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.kafka_connect.common import (
     CONNECTOR_CLASS,
@@ -94,11 +94,6 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         if not jpype.isJVMStarted():
             jpype.startJVM()
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = KafkaConnectSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
         """Get Kafka Connect connectors manifest using REST API.
         Enrich with lineages metadata.

datahub/ingestion/source/metabase.py
@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):
 
         return platform, dbname, schema, platform_instance
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),

datahub/ingestion/source/mlflow.py
@@ -333,8 +333,3 @@ class MLflowSource(Source):
             aspect=global_tags,
         )
         return wu
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MLflowConfig.parse_obj(config_dict)
-        return cls(ctx, config)

datahub/ingestion/source/nifi.py
@@ -484,11 +484,6 @@ class NifiSource(Source):
     def rest_api_base_url(self):
         return self.config.site_url[: -len("nifi/")] + "nifi-api/"
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = NifiSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_report(self) -> SourceReport:
         return self.report
 

datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")

datahub/ingestion/source/redash.py
@@ -369,11 +369,6 @@ class RedashSource(Source):
         else:
             raise ValueError(f"Failed to connect to {self.config.connect_uri}/api")
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = RedashConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()

datahub/ingestion/source/redshift/redshift.py
@@ -276,6 +276,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         "HLLSKETCH": NullType,
         "TIMETZ": TimeType,
         "VARBYTE": StringType,
+        "SUPER": NullType,
     }
 
     def get_platform_instance_id(self) -> str:

datahub/ingestion/source/s3/source.py
@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: Any,  # Todo: proper type
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (Any): The S3 bucket object.
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-        for key, group in grouped_files:
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
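
get_folder_info now delegates grouping to group_s3_objects_by_dirname from datahub/ingestion/source/aws/s3_util.py (+24 -1, not expanded here). Based on the inline groupby logic it replaces, a hypothetical sketch of what such a helper could look like:

    # Hypothetical sketch only; the real helper lives in s3_util.py and may differ.
    from collections import defaultdict
    from typing import Any, Dict, Iterable, List

    def group_s3_objects_by_dirname(s3_objects: Iterable[Any]) -> Dict[str, List[Any]]:
        """Group listed S3 objects by the directory portion of their key."""
        grouped: Dict[str, List[Any]] = defaultdict(list)
        for obj in s3_objects:
            dirname = obj.key.rsplit("/", 1)[0]  # same grouping key as the removed code
            grouped[dirname].append(obj)
        return grouped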

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -244,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )
 
+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -263,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )
 
+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,
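
The two new options only take effect together with `extract_tags`. A recipe-style sketch of the relevant source-config fragment, written as a Python dict (field names come from the diff above; connection settings and other required Snowflake options are omitted):

    # Illustrative config fragment; the allow pattern is a made-up example.
    snowflake_config_fragment = {
        "extract_tags": "without_lineage",
        "extract_tags_as_structured_properties": True,
        "structured_property_pattern": {
            "allow": ["governance\\..*"],
        },
    }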

datahub/ingestion/source/snowflake/snowflake_tag.py
@@ -45,15 +45,18 @@ class SnowflakeTag:
     name: str
     value: str
 
-    def display_name(self) -> str:
+    def tag_display_name(self) -> str:
         return f"{self.name}: {self.value}"
 
-    def identifier(self) -> str:
+    def tag_identifier(self) -> str:
         return f"{self._id_prefix_as_str()}:{self.value}"
 
     def _id_prefix_as_str(self) -> str:
         return f"{self.database}.{self.schema}.{self.name}"
 
+    def structured_property_identifier(self) -> str:
+        return f"snowflake.{self.database}.{self.schema}.{self.name}"
+
 
 @dataclass
 class SnowflakeColumn(BaseColumn):
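
With the renames and the new method, the same Snowflake tag can now yield either a DataHub tag identifier or a structured-property identifier. A small illustration, assuming SnowflakeTag is constructed with these keyword fields (values are made up):

    # Identifiers produced by the methods shown above.
    tag = SnowflakeTag(database="ANALYTICS", schema="PUBLIC", name="PII", value="email")
    tag.tag_display_name()                # "PII: email"
    tag.tag_identifier()                  # "ANALYTICS.PUBLIC.PII:email"
    tag.structured_property_identifier()  # "snowflake.ANALYTICS.PUBLIC.PII"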