acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

This release of acryl-datahub is flagged as potentially problematic.
Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -149,7 +149,7 @@ def construct_schema(
 
     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}
 
-    for field_path in schema.keys():
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"
 
@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(f"Unable to connect sigma API. Exception: {e}")
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None
 
 
-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):
 
 
 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0
 
@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
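The Slack source now builds on the stateful ingestion framework, so entities that disappear between runs can be soft-deleted via stale-entity removal. Below is a minimal, hypothetical recipe sketch (as a Python dict) showing how that behavior is typically switched on; the token, server, and pipeline name are placeholders, and the `stateful_ingestion.enabled` key is assumed to follow the same convention as other stateful sources.

    # Hypothetical recipe sketch; values are placeholders.
    recipe = {
        "pipeline_name": "slack_to_datahub",  # stateful ingestion keys its run state by pipeline name
        "source": {
            "type": "slack",
            "config": {
                "bot_token": "xoxb-placeholder",
                "ingest_public_channels": True,
                "stateful_ingestion": {"enabled": True},  # assumption: standard stateful-ingestion switch
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder GMS endpoint
        },
    }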
@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES.keys():
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
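The `from None` added to the OAuth error above suppresses implicit exception chaining, so users see the configuration message rather than the internal lookup failure that triggered it. A small, self-contained illustration of the idiom (not DataHub code):

    def require_access_token(response: dict) -> str:
        # "raise ... from None" hides the triggering KeyError from the traceback;
        # omitting "from None" would chain it as the original cause.
        try:
            return response["access_token"]
        except KeyError:
            raise ValueError(
                "access_token not found in response. Please check your OAuth configuration."
            ) from None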
@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 res["session_id"],
                 res["query_start_time"],
                 object_modified_by_ddl,
+                res["query_type"],
             )
             if known_ddl_entry:
                 return known_ddl_entry
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         session_id: str,
         timestamp: datetime,
         object_modified_by_ddl: dict,
+        query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
         timestamp = timestamp.astimezone(timezone.utc)
-        if object_modified_by_ddl[
-            "operationType"
-        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
-            urn1 = self.identifiers.gen_dataset_urn(
+        if (
+            object_modified_by_ddl["operationType"] == "ALTER"
+            and query_type == "RENAME_TABLE"
+            and object_modified_by_ddl["properties"].get("objectName")
+        ):
+            original_un = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )
 
-            urn2 = self.identifiers.gen_dataset_urn(
+            new_urn = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
+                    object_modified_by_ddl["properties"]["objectName"]["value"]
                 )
             )
-
-            return TableSwap(urn1, urn2, query, session_id, timestamp)
+            return TableRename(original_un, new_urn, query, session_id, timestamp)
         elif object_modified_by_ddl[
             "operationType"
-        ] == "RENAME_TABLE" and object_modified_by_ddl["properties"].get("objectName"):
-            original_un = self.identifiers.gen_dataset_urn(
+        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
+            urn1 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )
 
-            new_urn = self.identifiers.gen_dataset_urn(
+            urn2 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["objectName"]["value"]
+                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
                 )
             )
 
-            return TableRename(original_un, new_urn, query, session_id, timestamp)
+            return TableSwap(urn1, urn2, query, session_id, timestamp)
         else:
             self.report.num_ddl_queries_dropped += 1
             return None
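The reordered branches above classify a rename by `query_type == "RENAME_TABLE"` together with `properties.objectName`, and a swap by `properties.swapTargetName`, since both arrive as `ALTER` operations in Snowflake's audit log. A hedged sketch of just that classification rule, using only the fields referenced in the diff (the overall dict shape is otherwise an assumption):

    from typing import Optional

    def classify_ddl(object_modified_by_ddl: dict, query_type: str) -> Optional[str]:
        # Mirrors the branch conditions above; returns a label instead of building URNs.
        props = object_modified_by_ddl["properties"]
        if (
            object_modified_by_ddl["operationType"] == "ALTER"
            and query_type == "RENAME_TABLE"
            and props.get("objectName")
        ):
            return "rename"
        if object_modified_by_ddl["operationType"] == "ALTER" and props.get("swapTargetName"):
            return "swap"
        return None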
@@ -731,6 +734,9 @@ fingerprinted_queries as (
     JOIN filtered_access_history a USING (query_id)
 )
 SELECT * FROM query_access_history
+-- Our query aggregator expects the queries to be added in chronological order.
+-- It's easier for us to push down the sorting to Snowflake/SQL instead of doing it in Python.
+ORDER BY QUERY_START_TIME ASC
 """
 
 
@@ -134,10 +134,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False
 
     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -439,7 +439,7 @@ class SnowflakeV2Source(
                     failure_reason=failure_message,
                 )
 
-            if c in _report.keys():
+            if c in _report:
                 continue
 
             # If some capabilities are missing, then mark them as not capable
@@ -55,7 +55,7 @@ try:
 except ImportError:
     _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
 
-    def override(f: _F, /) -> _F:  # noqa: F811
+    def override(f: _F, /) -> _F:
         return f
 
 
@@ -104,7 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])
 
     @typing.no_type_check
-    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:
         """Derives the data type of the Athena column.
 
         This method is overwritten to extend the behavior of PyAthena.
@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )
 
-        location: Optional[str] = custom_properties.get("location", None)
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,21 +538,15 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
-                is_part_of_key=(
-                    True
-                    if (
-                        pk_constraints is not None
-                        and isinstance(pk_constraints, dict)
-                        and column["name"] in pk_constraints.get("constrained_columns", [])
-                    )
-                    else False
+                is_part_of_key=bool(
+                    pk_constraints is not None
+                    and isinstance(pk_constraints, dict)
+                    and column["name"] in pk_constraints.get("constrained_columns", [])
                 ),
-                is_partitioning_key=(
-                    True
-                    if (partition_keys is not None and column["name"] in partition_keys)
-                    else False
+                is_partitioning_key=bool(
+                    partition_keys is not None and column["name"] in partition_keys
                 ),
             )
 
@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
@@ -777,6 +777,7 @@ class HiveSource(TwoTierSQLAlchemySource):
                 column,
                 inspector,
                 pk_constraints,
+                partition_keys=partition_keys,
             )
 
             if self._COMPLEX_TYPE.match(fields[0].nativeDataType) and isinstance(
@@ -821,12 +822,8 @@ class HiveSource(TwoTierSQLAlchemySource):
 
         try:
             view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            view_definition = str(view_definition) if view_definition else ""
         except NotImplementedError:
             view_definition = ""
 
@@ -853,3 +850,15 @@ class HiveSource(TwoTierSQLAlchemySource):
             default_db=default_db,
             default_schema=default_schema,
         )
+
+    def get_partitions(
+        self, inspector: Inspector, schema: str, table: str
+    ) -> Optional[List[str]]:
+        partition_columns: List[dict] = inspector.get_indexes(
+            table_name=table, schema=schema
+        )
+        for partition_column in partition_columns:
+            if partition_column.get("column_names"):
+                return partition_column.get("column_names")
+
+        return []
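The new `get_partitions` helper reads partition columns out of SQLAlchemy index metadata. A standalone sketch of the same lookup against an arbitrary engine; the connection URL and table name are placeholders, and the assumption that the Hive dialect surfaces partition keys through `Inspector.get_indexes` comes from the helper above rather than anything else in this diff:

    from sqlalchemy import create_engine, inspect

    engine = create_engine("hive://localhost:10000/default")  # placeholder connection
    inspector = inspect(engine)
    # Assumption: the dialect exposes partition keys as an index entry whose
    # "column_names" lists the partition columns.
    partition_cols = next(
        (
            idx["column_names"]
            for idx in inspector.get_indexes("my_table", schema="default")
            if idx.get("column_names")
        ),
        [],
    )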
@@ -67,7 +67,7 @@ TableKey = namedtuple("TableKey", ["schema", "table"])
 
 
 class HiveMetastoreConfigMode(StrEnum):
-    hive: str = "hive"  # noqa: F811
+    hive: str = "hive"
     presto: str = "presto"
     presto_on_hive: str = "presto-on-hive"
     trino: str = "trino"
@@ -893,8 +893,9 @@ class HiveMetastoreSource(SQLAlchemySource):
         return get_schema_fields_for_hive_column(
             column["col_name"],
             column["col_type"],
+            # column is actually an sqlalchemy.engine.row.LegacyRow, not a Dict and we cannot make column.get("col_description", "")
             description=(
-                column["col_description"] if "col_description" in column else ""
+                column["col_description"] if "col_description" in column else ""  # noqa: SIM401
             ),
             default_nullable=True,
         )
@@ -11,12 +11,17 @@ from datahub.emitter.mcp_builder import (
     DatabaseKey,
     SchemaKey,
 )
+from datahub.ingestion.source.common.subtypes import (
+    FlowContainerSubTypes,
+    JobContainerSubTypes,
+)
 from datahub.metadata.schema_classes import (
     ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
     DataPlatformInstanceClass,
+    SubTypesClass,
 )
 
 
@@ -211,6 +216,18 @@ class MSSQLDataJob:
             status=self.status,
         )
 
+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (JobStep, StoredProcedure))
+        type = (
+            JobContainerSubTypes.MSSQL_JOBSTEP
+            if isinstance(self.entity, JobStep)
+            else JobContainerSubTypes.MSSQL_STORED_PROCEDURE
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.flow.platform_instance:
@@ -276,6 +293,18 @@ class MSSQLDataFlow:
             externalUrl=self.external_url,
         )
 
+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (MSSQLJob, MSSQLProceduresContainer))
+        type = (
+            FlowContainerSubTypes.MSSQL_JOB
+            if isinstance(self.entity, MSSQLJob)
+            else FlowContainerSubTypes.MSSQL_PROCEDURE_CONTAINER
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.platform_instance:
@@ -401,7 +401,7 @@ class SQLServerSource(SQLAlchemySource):
                 data_job.add_property(name=data_name, value=str(data_value))
             yield from self.construct_job_workunits(data_job)
 
-    def loop_stored_procedures(  # noqa: C901
+    def loop_stored_procedures(
         self,
         inspector: Inspector,
         schema: str,
@@ -638,6 +638,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()
 
+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_job.urn,
+            aspect=data_job.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -676,8 +681,6 @@ class SQLServerSource(SQLAlchemySource):
             ),
         ).as_workunit()
 
-        # TODO: Add SubType when it appear
-
     def construct_flow_workunits(
         self,
         data_flow: MSSQLDataFlow,
@@ -687,6 +690,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_dataflow_info_aspect,
         ).as_workunit()
 
+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_flow.urn,
+            aspect=data_flow.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -700,8 +708,6 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_container_aspect,
         ).as_workunit()
 
-        # TODO: Add SubType when it appear
-
     def get_inspectors(self) -> Iterable[Inspector]:
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.
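With the new `as_subtypes_aspect` properties, MSSQL jobs, stored-procedure containers, job steps, and stored procedures now carry explicit subtypes. A hedged, standalone sketch of emitting the same kind of aspect for an arbitrary data job; the URN and server address are placeholders:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DatahubRestEmitter
    from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
    from datahub.metadata.schema_classes import SubTypesClass

    # Placeholder URN for an MSSQL job step; the flow and job ids are illustrative.
    job_urn = "urn:li:dataJob:(urn:li:dataFlow:(mssql,my_instance.my_job,PROD),step_1)"

    mcp = MetadataChangeProposalWrapper(
        entityUrn=job_urn,
        aspect=SubTypesClass(typeNames=[JobContainerSubTypes.MSSQL_JOBSTEP]),
    )
    DatahubRestEmitter("http://localhost:8080").emit(mcp)  # placeholder GMS endpoint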