acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (72)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/METADATA +2449 -2449
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/RECORD +72 -71
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/ingest_cli.py +3 -1
  8. datahub/cli/lite_cli.py +4 -2
  9. datahub/cli/specific/dataproduct_cli.py +1 -1
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/api/source_helpers.py +4 -0
  12. datahub/ingestion/fs/s3_fs.py +2 -2
  13. datahub/ingestion/graph/client.py +15 -6
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/run/pipeline.py +109 -143
  16. datahub/ingestion/run/sink_callback.py +77 -0
  17. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  18. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  19. datahub/ingestion/source/csv_enricher.py +2 -2
  20. datahub/ingestion/source/delta_lake/config.py +8 -1
  21. datahub/ingestion/source/delta_lake/report.py +4 -2
  22. datahub/ingestion/source/delta_lake/source.py +20 -5
  23. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  24. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  25. datahub/ingestion/source/elastic_search.py +26 -6
  26. datahub/ingestion/source/feast.py +27 -8
  27. datahub/ingestion/source/file.py +1 -1
  28. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  29. datahub/ingestion/source/identity/okta.py +1 -2
  30. datahub/ingestion/source/kafka/kafka.py +1 -1
  31. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  32. datahub/ingestion/source/looker/lookml_source.py +1 -1
  33. datahub/ingestion/source/metabase.py +54 -32
  34. datahub/ingestion/source/mlflow.py +30 -7
  35. datahub/ingestion/source/mode.py +8 -3
  36. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  37. datahub/ingestion/source/nifi.py +29 -6
  38. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  39. datahub/ingestion/source/pulsar.py +3 -2
  40. datahub/ingestion/source/redash.py +29 -6
  41. datahub/ingestion/source/s3/config.py +3 -1
  42. datahub/ingestion/source/salesforce.py +28 -6
  43. datahub/ingestion/source/sigma/sigma.py +1 -1
  44. datahub/ingestion/source/slack/slack.py +31 -10
  45. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  46. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  47. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  48. datahub/ingestion/source/sql/druid.py +1 -5
  49. datahub/ingestion/source/sql/oracle.py +34 -0
  50. datahub/ingestion/source/tableau/tableau.py +2 -1
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  52. datahub/ingestion/source_config/pulsar.py +3 -1
  53. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  54. datahub/lite/duckdb_lite.py +2 -1
  55. datahub/lite/lite_local.py +1 -1
  56. datahub/lite/lite_util.py +4 -3
  57. datahub/metadata/_schema_classes.py +517 -410
  58. datahub/metadata/_urns/urn_defs.py +1670 -1670
  59. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  60. datahub/metadata/schema.avsc +17362 -17638
  61. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  62. datahub/metadata/schemas/__init__.py +3 -3
  63. datahub/sdk/__init__.py +29 -12
  64. datahub/sdk/_entity.py +18 -1
  65. datahub/sdk/container.py +3 -1
  66. datahub/sdk/dataset.py +5 -3
  67. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  68. datahub/utilities/memory_footprint.py +3 -2
  69. datahub/utilities/unified_diff.py +5 -1
  70. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/LICENSE +0 -0
  71. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/entry_points.txt +0 -0
  72. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
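The report_server.py changes above (and the matching redash.py, salesforce.py, and slack.py changes further down) follow one pattern: the source now extends StatefulIngestionSourceBase and registers StaleEntityRemovalHandler as a workunit processor, so entities that disappear between runs can be marked as removed. A rough sketch of how a recipe would typically opt into that behaviour when run programmatically; the source type name, connection settings, and sink address here are illustrative placeholders, not taken from this diff:

    # Hypothetical recipe enabling stale-entity removal for a stateful source.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            # pipeline_name keys the ingestion state shared between runs
            "pipeline_name": "powerbi_report_server_prod",
            "source": {
                "type": "powerbi-report-server",  # assumed registered source name
                "config": {
                    # ... connection settings (username, password, etc.) ...
                    "stateful_ingestion": {
                        "enabled": True,
                        "remove_stale_metadata": True,
                    },
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()
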
datahub/ingestion/source/pulsar.py
@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 
@@ -229,8 +230,8 @@ class PulsarSource(StatefulIngestionSourceBase):
             self.report.report_warning("HTTPError", message)
         except requests.exceptions.RequestException as e:
             raise Exception(
-                f"An ambiguous exception occurred while handling the request: {e}"
-            )
+                "An ambiguous exception occurred while handling the request"
+            ) from e
 
     @classmethod
     def create(cls, config_dict, ctx):
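The pulsar.py change above swaps string-interpolating the caught error into a new exception message for explicit exception chaining; the sigma.py hunk further down makes the same change. A small, generic Python illustration (not DataHub code) of what `raise ... from e` preserves:

    # The chained exception stays attached as __cause__, so tracebacks still
    # show the root cause even though its message is no longer interpolated.
    def fetch() -> None:
        raise ConnectionError("socket closed")

    def load() -> None:
        try:
            fetch()
        except ConnectionError as e:
            raise RuntimeError("An ambiguous exception occurred while handling the request") from e

    try:
        load()
    except RuntimeError as e:
        assert isinstance(e.__cause__, ConnectionError)
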
datahub/ingestion/source/redash.py
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
         return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -724,6 +739,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
datahub/ingestion/source/s3/config.py
@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
datahub/ingestion/source/salesforce.py
@@ -17,7 +17,9 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
 )
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -85,7 +95,10 @@ class SalesforceProfilingConfig(ConfigModel):
     # TODO - support field level profiling
 
 
-class SalesforceConfig(DatasetSourceConfigMixin):
+class SalesforceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+):
     platform: str = "salesforce"
 
     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -149,7 +162,7 @@ class SalesforceConfig(DatasetSourceConfigMixin):
 
 
 @dataclass
-class SalesforceSourceReport(SourceReport):
+class SalesforceSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -214,7 +227,7 @@ FIELD_TYPE_MAPPING = {
     capability_name=SourceCapability.TAGS,
     description="Enabled by default",
 )
-class SalesforceSource(Source):
+class SalesforceSource(StatefulIngestionSourceBase):
     base_url: str
     config: SalesforceConfig
     report: SalesforceSourceReport
@@ -223,7 +236,8 @@ class SalesforceSource(Source):
     fieldCounts: Dict[str, int]
 
     def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
         self.report = SalesforceSourceReport()
         self.session = requests.Session()
@@ -328,6 +342,14 @@ class SalesforceSource(Source):
             )
         )
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             sObjects = self.get_salesforce_objects()
datahub/ingestion/source/sigma/sigma.py
@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(f"Unable to connect sigma API. Exception: {e}")
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/slack/slack.py
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None
 
 
-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
    )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
datahub/ingestion/source/slack/slack.py (continued)
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):
 
 
 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0
 
@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
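The snowflake_connection.py change above adds `from None`, the counterpart of the `from e` idiom: it suppresses the implicit exception context so only the explicit ValueError is reported. A generic illustration (not DataHub code):

    # "from None" suppresses the implicit exception context, so the handler
    # (and the traceback) see only the explicit ValueError.
    def read_token(response: dict) -> str:
        try:
            return response["access_token"]
        except KeyError:
            raise ValueError(f"access_token not found in response {response}.") from None

    try:
        read_token({"error": "invalid_client"})
    except ValueError as e:
        assert e.__cause__ is None and e.__suppress_context__
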
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -134,10 +134,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False
 
     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
datahub/ingestion/source/sql/druid.py
@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
datahub/ingestion/source/sql/oracle.py
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re
 
 # This import verifies that the dependencies are available.
@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )
 
     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):
@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values
 
+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:
@@ -586,6 +609,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")
 
+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
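For the new oracle.py options, a hypothetical source config fragment showing how they would be set; only enable_thick_mode and thick_mode_lib_dir come from this diff, and the connection values are placeholders:

    # Hypothetical Oracle source config fragment (values are illustrative).
    oracle_source_config = {
        "host_port": "oracle.example.com:1521",
        "username": "datahub",
        "password": "...",
        "service_name": "ORCLPDB1",
        # thin mode is the default; thick mode needs the Oracle client libraries
        "enable_thick_mode": True,
        # required on macOS/Windows when thick mode is on; ignored on Linux,
        # where ldconfig / LD_LIBRARY_PATH locate the client libraries
        "thick_mode_lib_dir": "/opt/oracle/instantclient",
    }
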
datahub/ingestion/source/tableau/tableau.py
@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
datahub/ingestion/source/tableau/tableau_common.py
@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
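The tableau.py and tableau_common.py hunks above (and the duckdb_lite.py and lite_local.py hunks at the end) all replace mutable default arguments with None plus an `or` fallback. A generic illustration (not DataHub code) of the pitfall being avoided:

    # A default list/dict is created once at definition time and shared
    # across calls, so mutations leak between invocations.
    from typing import List, Optional

    def buggy(params: List[str] = []) -> List[str]:
        params.append("tag")
        return params

    assert buggy() == ["tag"]
    assert buggy() == ["tag", "tag"]  # the same list object is reused across calls

    def fixed(params: Optional[List[str]] = None) -> List[str]:
        params = params or []  # the pattern adopted throughout this release
        params.append("tag")
        return params

    assert fixed() == ["tag"]
    assert fixed() == ["tag"]  # each call gets a fresh list
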
datahub/ingestion/source_config/pulsar.py
@@ -33,7 +33,9 @@ def _is_valid_hostname(hostname: str) -> bool:
 
 
 class PulsarSourceConfig(
-    StatefulIngestionConfigBase, PlatformInstanceConfigMixin, EnvConfigMixin
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
 ):
     web_service_url: str = Field(
         default="http://localhost:8080", description="The web URL for the cluster."
datahub/ingestion/transformer/pattern_cleanup_ownership.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from typing import List, Optional, Set, cast
 
@@ -10,8 +11,11 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
 )
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError
 
-_USER_URN_PREFIX: str = "urn:li:corpuser:"
+logger = logging.getLogger(__name__)
 
 
 class PatternCleanUpOwnershipConfig(ConfigModel):
@@ -49,6 +53,11 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         else:
             return set()
 
+    def _process_owner(self, name: str) -> str:
+        for value in self.config.pattern_for_cleanup:
+            name = re.sub(value, "", name)
+        return name
+
     def transform_aspect(
         self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
     ) -> Optional[builder.Aspect]:
@@ -58,14 +67,23 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         # clean all the owners based on the parameters received from config
         cleaned_owner_urns: List[str] = []
         for owner_urn in current_owner_urns:
-            user_id: str = owner_urn.split(_USER_URN_PREFIX)[1]
-            for value in self.config.pattern_for_cleanup:
-                user_id = re.sub(value, "", user_id)
-
-            cleaned_owner_urns.append(_USER_URN_PREFIX + user_id)
+            username = ""
+            try:
+                owner: Urn = Urn.from_string(owner_urn)
+                if isinstance(owner, CorpUserUrn):
+                    username = str(CorpUserUrn(self._process_owner(owner.username)))
+                elif isinstance(owner, CorpGroupUrn):
+                    username = str(CorpGroupUrn(self._process_owner(owner.name)))
+                else:
+                    logger.warning(f"{owner_urn} is not a supported owner type.")
+                    username = owner_urn
+            except InvalidUrnError:
+                logger.warning(f"Could not parse {owner_urn} from {entity_urn}")
+                username = owner_urn
+            cleaned_owner_urns.append(username)
 
         ownership_type, ownership_type_urn = builder.validate_ownership_type(
-            OwnershipTypeClass.DATAOWNER
+            OwnershipTypeClass.TECHNICAL_OWNER
         )
         owners = [
             OwnerClass(
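The pattern_cleanup_ownership.py transformer now parses each owner URN instead of assuming the urn:li:corpuser: prefix, cleans both user and group owners, and passes unsupported or unparseable owners through unchanged (it also switches the assigned ownership type from DATAOWNER to TECHNICAL_OWNER). A rough standalone sketch of that cleanup, using the classes the diff imports and assuming Urn.from_string returns the typed URN subclasses, as the isinstance checks imply; the regex pattern here is an example, not from the source:

    # Rough sketch of the new owner-cleanup logic, outside the transformer.
    import re

    from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
    from datahub.utilities.urns._urn_base import Urn

    pattern_for_cleanup = [r"@example\.com"]  # example pattern, not from the source

    def clean_owner(owner_urn: str) -> str:
        def strip(name: str) -> str:
            for value in pattern_for_cleanup:
                name = re.sub(value, "", name)
            return name

        owner = Urn.from_string(owner_urn)
        if isinstance(owner, CorpUserUrn):
            return str(CorpUserUrn(strip(owner.username)))
        if isinstance(owner, CorpGroupUrn):
            return str(CorpGroupUrn(strip(owner.name)))
        return owner_urn  # unsupported owner types pass through unchanged

    assert clean_owner("urn:li:corpuser:jdoe@example.com") == "urn:li:corpuser:jdoe"
    assert clean_owner("urn:li:corpGroup:data-eng@example.com") == "urn:li:corpGroup:data-eng"
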
datahub/lite/duckdb_lite.py
@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
datahub/lite/lite_local.py
@@ -90,7 +90,7 @@ class DataHubLiteLocal(Generic[LiteConfig], Closeable, metaclass=ABCMeta):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
         pass