acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of acryl-datahub might be problematic.

Files changed (47)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/ingestion/api/source_helpers.py +4 -0
  7. datahub/ingestion/run/pipeline.py +109 -143
  8. datahub/ingestion/run/sink_callback.py +77 -0
  9. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  10. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  11. datahub/ingestion/source/delta_lake/config.py +8 -1
  12. datahub/ingestion/source/delta_lake/report.py +4 -2
  13. datahub/ingestion/source/delta_lake/source.py +20 -5
  14. datahub/ingestion/source/elastic_search.py +26 -6
  15. datahub/ingestion/source/feast.py +27 -8
  16. datahub/ingestion/source/file.py +1 -1
  17. datahub/ingestion/source/identity/okta.py +1 -2
  18. datahub/ingestion/source/mlflow.py +30 -7
  19. datahub/ingestion/source/mode.py +7 -2
  20. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  21. datahub/ingestion/source/nifi.py +29 -6
  22. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  23. datahub/ingestion/source/pulsar.py +1 -0
  24. datahub/ingestion/source/redash.py +29 -6
  25. datahub/ingestion/source/s3/config.py +3 -1
  26. datahub/ingestion/source/salesforce.py +28 -6
  27. datahub/ingestion/source/slack/slack.py +31 -10
  28. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  30. datahub/ingestion/source/sql/oracle.py +34 -0
  31. datahub/ingestion/source_config/pulsar.py +3 -1
  32. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  33. datahub/metadata/_schema_classes.py +517 -410
  34. datahub/metadata/_urns/urn_defs.py +1670 -1670
  35. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  36. datahub/metadata/schema.avsc +17362 -17638
  37. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  38. datahub/metadata/schemas/__init__.py +3 -3
  39. datahub/sdk/__init__.py +29 -12
  40. datahub/sdk/_entity.py +18 -1
  41. datahub/sdk/container.py +3 -1
  42. datahub/sdk/dataset.py +5 -3
  43. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  44. datahub/utilities/unified_diff.py +5 -1
  45. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  46. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  47. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/salesforce.py
@@ -17,7 +17,9 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
 )
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -85,7 +95,10 @@ class SalesforceProfilingConfig(ConfigModel):
     # TODO - support field level profiling


-class SalesforceConfig(DatasetSourceConfigMixin):
+class SalesforceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+):
     platform: str = "salesforce"

     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -149,7 +162,7 @@ class SalesforceConfig(DatasetSourceConfigMixin):


 @dataclass
-class SalesforceSourceReport(SourceReport):
+class SalesforceSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
@@ -214,7 +227,7 @@ FIELD_TYPE_MAPPING = {
     capability_name=SourceCapability.TAGS,
     description="Enabled by default",
 )
-class SalesforceSource(Source):
+class SalesforceSource(StatefulIngestionSourceBase):
     base_url: str
     config: SalesforceConfig
     report: SalesforceSourceReport
@@ -223,7 +236,8 @@ class SalesforceSource(Source):
     fieldCounts: Dict[str, int]

     def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
         self.report = SalesforceSourceReport()
         self.session = requests.Session()
@@ -328,6 +342,14 @@ class SalesforceSource(Source):
             )
         )

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             sObjects = self.get_salesforce_objects()
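With SalesforceSource now extending StatefulIngestionSourceBase, stale-entity removal can be turned on from a recipe. A minimal sketch, assuming the standard stateful_ingestion config block; the credentials are placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "salesforce",
            "config": {
                "username": "user@example.com",  # placeholder
                "password": "...",               # placeholder
                "security_token": "...",         # placeholder
                "stateful_ingestion": {
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()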
datahub/ingestion/source/slack/slack.py
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None


-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):


 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0

@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
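Both Salesforce and Slack now override get_workunit_processors() identically. Conceptually, a MetadataWorkUnitProcessor is a callable from a workunit stream to a workunit stream; the pipeline threads the stream through each processor in order, which is how the stale-entity handler can observe every entity a run emits and append soft-delete workunits at the end. A rough sketch with hypothetical names:

from typing import Callable, Iterable, List

WorkUnit = str  # stand-in for MetadataWorkUnit in this sketch
Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]

def stale_entity_marker(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
    seen: List[WorkUnit] = []
    for wu in stream:
        seen.append(wu)  # record everything this run produced
        yield wu
    # once the stream is exhausted, removals would be emitted here for
    # entities seen in the previous run but not in this one (elided)

def apply_processors(
    stream: Iterable[WorkUnit], processors: List[Processor]
) -> Iterable[WorkUnit]:
    for process in processors:
        stream = process(stream)  # each processor wraps the previous one
    return stream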
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -134,10 +134,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False

     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
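Taken together, the Snowflake hunks move hybrid-table detection from a 'HYBRID TABLE' entry in the table_type filter (plus a type-string property) to the information_schema IS_HYBRID column. That column comes back as the strings 'YES'/'NO', hence the normalization above; a trivial sketch of that parsing:

# 'YES'/'NO' string flag normalized to a bool, defaulting to False when
# the column is absent (column name per the queries in this diff).
row = {"IS_DYNAMIC": "NO", "IS_ICEBERG": "NO", "IS_HYBRID": "YES"}
is_hybrid = row.get("IS_HYBRID", "NO").upper() == "YES"
assert is_hybrid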
datahub/ingestion/source/sql/oracle.py
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re

 # This import verifies that the dependencies are available.
@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )

     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):
@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values

+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:
@@ -586,6 +609,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")

+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
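Together, the Oracle changes let a recipe opt into python-oracledb's thick mode, with the client library path required only on macOS and Windows. A minimal sketch using the new fields from this diff; connection details are placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "oracle",
            "config": {
                "host_port": "localhost:1521",  # placeholder
                "service_name": "ORCLPDB1",     # placeholder
                "username": "datahub",          # placeholder
                "password": "...",              # placeholder
                "enable_thick_mode": True,
                # required on macOS/Windows; ignored on Linux, where
                # ldconfig or LD_LIBRARY_PATH locates the client libraries
                "thick_mode_lib_dir": "/opt/oracle/instantclient",
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()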
datahub/ingestion/source_config/pulsar.py
@@ -33,7 +33,9 @@ def _is_valid_hostname(hostname: str) -> bool:


 class PulsarSourceConfig(
-    StatefulIngestionConfigBase, PlatformInstanceConfigMixin, EnvConfigMixin
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
 ):
     web_service_url: str = Field(
         default="http://localhost:8080", description="The web URL for the cluster."
datahub/ingestion/transformer/pattern_cleanup_ownership.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from typing import List, Optional, Set, cast

@@ -10,8 +11,11 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
 )
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

-_USER_URN_PREFIX: str = "urn:li:corpuser:"
+logger = logging.getLogger(__name__)


 class PatternCleanUpOwnershipConfig(ConfigModel):
@@ -49,6 +53,11 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         else:
             return set()

+    def _process_owner(self, name: str) -> str:
+        for value in self.config.pattern_for_cleanup:
+            name = re.sub(value, "", name)
+        return name
+
     def transform_aspect(
         self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
     ) -> Optional[builder.Aspect]:
@@ -58,14 +67,23 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         # clean all the owners based on the parameters received from config
         cleaned_owner_urns: List[str] = []
         for owner_urn in current_owner_urns:
-            user_id: str = owner_urn.split(_USER_URN_PREFIX)[1]
-            for value in self.config.pattern_for_cleanup:
-                user_id = re.sub(value, "", user_id)
-
-            cleaned_owner_urns.append(_USER_URN_PREFIX + user_id)
+            username = ""
+            try:
+                owner: Urn = Urn.from_string(owner_urn)
+                if isinstance(owner, CorpUserUrn):
+                    username = str(CorpUserUrn(self._process_owner(owner.username)))
+                elif isinstance(owner, CorpGroupUrn):
+                    username = str(CorpGroupUrn(self._process_owner(owner.name)))
+                else:
+                    logger.warning(f"{owner_urn} is not a supported owner type.")
+                    username = owner_urn
+            except InvalidUrnError:
+                logger.warning(f"Could not parse {owner_urn} from {entity_urn}")
+                username = owner_urn
+            cleaned_owner_urns.append(username)

         ownership_type, ownership_type_urn = builder.validate_ownership_type(
-            OwnershipTypeClass.DATAOWNER
+            OwnershipTypeClass.TECHNICAL_OWNER
         )
         owners = [
             OwnerClass(
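The rewritten loop parses each owner URN instead of string-splitting on a corpuser prefix, so group owners are cleaned too, and malformed or unsupported URNs pass through with a warning instead of raising an IndexError. A small sketch of the cleanup step, with a hypothetical pattern standing in for pattern_for_cleanup:

import re
from datahub.metadata.urns import CorpUserUrn

patterns = [r"@example\.com$"]  # hypothetical pattern_for_cleanup values

def process_owner(name: str) -> str:
    # strip every configured pattern from the owner id
    for pattern in patterns:
        name = re.sub(pattern, "", name)
    return name

owner = CorpUserUrn("alice@example.com")
print(CorpUserUrn(process_owner(owner.username)))  # urn:li:corpuser:alice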