acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/object_store.py

@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 
 # Add imports for source customization
@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
     """Implementation of ObjectStoreInterface for Azure Blob Storage."""
 
     PREFIX = "abfss://"
+    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
 
     @classmethod
     def is_uri(cls, uri: str) -> bool:
-        return uri.startswith(cls.PREFIX)
+        return uri.startswith(cls.PREFIX) or bool(cls.HTTPS_REGEX.match(uri))
 
     @classmethod
     def get_prefix(cls, uri: str) -> Optional[str]:
         if uri.startswith(cls.PREFIX):
             return cls.PREFIX
+
+        # Check for HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return match.group(1)
+
         return None
 
     @classmethod
     def strip_prefix(cls, uri: str) -> str:
-        prefix = cls.get_prefix(uri)
-        if not prefix:
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return uri[len(prefix) :]
+        if uri.startswith(cls.PREFIX):
+            return uri[len(cls.PREFIX) :]
+
+        # Handle HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return uri[len(match.group(1)) :]
+
+        raise ValueError(
+            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+        )
 
     @classmethod
     def get_bucket_name(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return cls.strip_prefix(uri).split("@")[0]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            return cls.strip_prefix(uri).split("@")[0]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            return cls.strip_prefix(uri).split("/")[0]
 
     @classmethod
     def get_object_key(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        parts = cls.strip_prefix(uri).split("@", 1)
-        if len(parts) < 2:
-            return ""
-        account_path = parts[1]
-        path_parts = account_path.split("/", 1)
-        if len(path_parts) < 2:
-            return ""
-        return path_parts[1]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            parts = cls.strip_prefix(uri).split("@", 1)
+            if len(parts) < 2:
+                return ""
+            account_path = parts[1]
+            path_parts = account_path.split("/", 1)
+            if len(path_parts) < 2:
+                return ""
+            return path_parts[1]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            stripped = cls.strip_prefix(uri)
+            parts = stripped.split("/", 1)
+            if len(parts) < 2:
+                return ""
+            return parts[1]
 
 
 # Registry of all object store implementations
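
For orientation, a short sketch of how the extended ABSObjectStore is expected to behave on both URI styles, per the hunk above. The account and container names are illustrative; the class lives in datahub/ingestion/source/data_lake_common/object_store.py.

from datahub.ingestion.source.data_lake_common.object_store import ABSObjectStore

# Existing abfss:// form: the container precedes the "@", the object key follows the host
ABSObjectStore.get_bucket_name("abfss://data@myacct.dfs.core.windows.net/raw/events.csv")   # -> "data"
ABSObjectStore.get_object_key("abfss://data@myacct.dfs.core.windows.net/raw/events.csv")    # -> "raw/events.csv"

# New https:// form: the container is the first path segment after the host
ABSObjectStore.is_uri("https://myacct.blob.core.windows.net/data/raw/events.csv")           # -> True
ABSObjectStore.get_bucket_name("https://myacct.blob.core.windows.net/data/raw/events.csv")  # -> "data"
ABSObjectStore.get_object_key("https://myacct.blob.core.windows.net/data/raw/events.csv")   # -> "raw/events.csv"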
@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
         return uri[prefix_length:].split("/")[0]
     elif uri.startswith(ABSObjectStore.PREFIX):
         return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
+    elif ABSObjectStore.HTTPS_REGEX.match(uri):
+        # Handle HTTPS Azure Blob Storage URLs
+        match = ABSObjectStore.HTTPS_REGEX.match(uri)
+        if match:
+            stripped = uri[len(match.group(1)) :]
+            return stripped.split("/")[0]
 
     raise ValueError(f"Unsupported URI format: {uri}")
 
@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
         if not ABSObjectStore.is_uri(table_data.table_path):
             return None
 
-        # Parse the ABS URI
         try:
-            # URI format: abfss://container@account.dfs.core.windows.net/path
-            path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
-            parts = path_without_prefix.split("@", 1)
-            if len(parts) < 2:
-                return None
-
-            container_name = parts[0]
-            account_parts = parts[1].split("/", 1)
-            account_domain = account_parts[0]
-            account_name = account_domain.split(".")[0]
+            if table_data.table_path.startswith("abfss://"):
+                # URI format: abfss://container@account.dfs.core.windows.net/path
+                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
+                parts = path_without_prefix.split("@", 1)
+                if len(parts) < 2:
+                    return None
+
+                container_name = parts[0]
+                account_parts = parts[1].split("/", 1)
+                account_domain = account_parts[0]
+                account_name = account_domain.split(".")[0]
+            else:
+                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
+                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
+                if "blob.core.windows.net" in table_data.table_path:
+                    account_name = table_data.table_path.split("//")[1].split(".")[0]
+                else:
+                    return None
 
             # Construct Azure portal URL
             return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
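
The two GCS helpers added above are plain string transforms. A standalone sketch of the same logic (functions copied out of the adapter for illustration; the bucket and object names are made up):

def normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
    # gs:// URIs are rewritten to s3:// so that S3-style path_spec patterns still match
    if uri.startswith("gs://"):
        return uri.replace("gs://", "s3://", 1)
    return uri


def strip_gcs_prefix(uri: str) -> str:
    # Drop the scheme so the remainder is "bucket/key"
    if uri.startswith("gs://"):
        return uri[len("gs://"):]
    return uri


assert normalize_gcs_uri_for_pattern_matching("gs://my-bucket/data/part-0.parquet") == "s3://my-bucket/data/part-0.parquet"
assert strip_gcs_prefix("gs://my-bucket/data/part-0.parquet") == "my-bucket/data/part-0.parquet"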
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-        logger.debug(f"{path} matched include ")
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-            logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
        if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
             )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
+        # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
-        else:
-            # Replace all * with {folder[i]} to make it parsable
-            for i in range(parsable_include.count("*")):
-                parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
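
Previously the wildcard substitution only ran when the include did not end in /{table}/**; it now runs in both branches. A standalone sketch of the reworked helper (logic copied from the hunk above; the include path is illustrative):

def get_parsable_include(include: str) -> str:
    parsable_include = include
    if parsable_include.endswith("/{table}/**"):
        # Trim the trailing "**" that marks partition autodetection
        parsable_include = parsable_include[:-2]
    # Wildcards are now templated in both branches
    for i in range(parsable_include.count("*")):
        parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
    return parsable_include


print(get_parsable_include("s3://bucket/*/logs/{table}/**"))
# s3://bucket/{folder[0]}/logs/{table}/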
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-        logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
+
         return compiled_folder_include
 
     @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
-        matches = re.findall(pattern, self.include.split("{table}/")[1])
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
         return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                     f"{{{template_key}}}", var[key]
                 )
             else:
-                partition_format.replace(f"{{{var_key}}}", var)
+                partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
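
The last change fixes a silent no-op: str.replace returns a new string, so without the assignment the {var_key} placeholder was never substituted into partition_format. A minimal illustration with made-up values:

partition_format = "{year}/{month}/{day}"
var_key, var = "year", "2024"

partition_format.replace(f"{{{var_key}}}", var)  # old code: result discarded, template unchanged
partition_format = partition_format.replace(f"{{{var_key}}}", var)  # fixed: placeholder filled in
print(partition_format)  # 2024/{month}/{day}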
datahub/ingestion/source/datahub/config.py

@@ -118,6 +118,17 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )
 
+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
+    query_timeout: Optional[int] = Field(
+        default=None,
+        description="Timeout for each query in seconds. ",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
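
Both options surface on the DataHub-to-DataHub source config. A hedged sketch of a recipe fragment as a Python dict: only the two new keys come from the hunk above, the surrounding structure and values are illustrative.

recipe_fragment = {
    "source": {
        "type": "datahub",
        "config": {
            # New: per-query timeout in seconds (None leaves the server default in place)
            "query_timeout": 300,
            # New (hidden from docs): seconds to wait between the structured-properties
            # pass and the main pass so the server-side template cache can invalidate
            "structured_properties_template_cache_invalidation_interval": 60,
        },
    }
}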
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -1,10 +1,10 @@
-import contextlib
 import json
 import logging
+import time
 from datetime import datetime
 from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
 
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 
 from datahub.emitter.aspect import ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -12,13 +12,14 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 
 logger = logging.getLogger(__name__)
 
 # Should work for at least mysql, mariadb, postgres
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FORMAT = "%Y-%m-%d"
 
 ROW = TypeVar("ROW", bound=Dict[str, Any])
 
@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
             **connection_config.options,
         )
 
+        # Cache for available dates to avoid redundant queries
+        self.available_dates_cache: Optional[List[datetime]] = None
+
     @property
     def soft_deleted_urns_query(self) -> str:
         return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
            ORDER BY mav.urn
        """
 
-    @property
-    def query(self) -> str:
-        # May repeat rows for the same date
-        # Offset is generally 0, unless we repeat the same createdon twice
+    def query(self, set_structured_properties_filter: bool) -> str:
+        """
+        Main query that gets data for specified date range with appropriate filters.
+        """
+        structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"
 
-        # Ensures stable order, chronological per (urn, aspect)
-        # Relies on createdon order to reflect version order
-        # Ordering of entries with the same createdon is handled by VersionOrderer
         return f"""
            SELECT *
            FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
                {"" if self.config.include_all_versions else "AND mav.version = 0"}
                {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
                AND mav.createdon >= %(since_createdon)s
+               AND mav.createdon < %(end_createdon)s
                ORDER BY
                    createdon,
                    urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
                    version
            ) as t
            WHERE 1=1
-           {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+           {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+           {structured_prop_filter}
            ORDER BY
                createdon,
                urn,
                aspect,
                version
+           LIMIT %(limit)s
+           OFFSET %(offset)s
        """
 
+    def execute_with_params(
+        self, query: str, params: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
+        """Execute query with proper parameter binding that works with your database"""
+        with self.engine.connect() as conn:
+            result = conn.execute(query, params or {})
+            return [dict(row) for row in result.fetchall()]
+
     def execute_server_cursor(
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
+        """Execute a query with server-side cursor"""
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with (
                     conn.begin()
                 ):  # Transaction required for PostgreSQL server-side cursor
-                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
-                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
+                    # Set query timeout at the connection level
+                    if self.config.query_timeout:
+                        if self.engine.dialect.name == "postgresql":
+                            conn.execute(
+                                text(
+                                    f"SET statement_timeout = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+                        elif self.engine.dialect.name in ["mysql", "mariadb"]:
+                            conn.execute(
+                                text(
+                                    f"SET max_execution_time = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+
+                    # Stream results with batch size
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
                     )
+
+                    # Execute query - using native parameterization without text()
+                    # to maintain compatibility with your original code
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
+
+                    return  # Success, exit the retry loop
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
     def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        set_structured_properties_filter: bool,
+        limit: int,
     ) -> Iterable[Dict[str, Any]]:
-        params = {
-            "exclude_aspects": list(self.config.exclude_aspects),
-            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-        }
-        yield from self.execute_server_cursor(self.query, params)
+        """
+        Retrieves data rows within a specified date range using pagination.
 
-    def get_aspects(
+        Implements a hybrid pagination strategy that switches between time-based and
+        offset-based approaches depending on the returned data. Uses server-side
+        cursors for efficient memory usage.
+
+        Note: May return duplicate rows across batch boundaries when multiple rows
+        share the same 'createdon' timestamp. This is expected behavior when
+        transitioning between pagination methods.
+
+        Args:
+            start_date: Beginning of date range (inclusive)
+            end_date: End of date range (exclusive)
+            set_structured_properties_filter: Whether to apply structured filtering
+            limit: Maximum rows to fetch per query
+
+        Returns:
+            An iterable of database rows as dictionaries
+        """
+        offset = 0
+        last_createdon = None
+        first_iteration = True
+
+        while True:
+            try:
+                # Set up query and parameters - using named parameters
+                query = self.query(set_structured_properties_filter)
+                params: Dict[str, Any] = {
+                    "since_createdon": start_date.strftime(DATETIME_FORMAT),
+                    "end_createdon": end_date.strftime(DATETIME_FORMAT),
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+                # Add exclude_aspects if needed
+                if (
+                    hasattr(self.config, "exclude_aspects")
+                    and self.config.exclude_aspects
+                ):
+                    params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+                logger.info(
+                    f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+                    f"with limit {limit} and offset {offset} (inclusive range)"
+                )
+
+                # Execute query with server-side cursor
+                rows = self.execute_server_cursor(query, params)
+                # Process and yield rows
+                rows_processed = 0
+                for row in rows:
+                    if first_iteration:
+                        start_date = row.get("createdon", start_date)
+                        first_iteration = False
+
+                    last_createdon = row.get("createdon")
+                    rows_processed += 1
+                    yield row
+
+                # If we processed fewer than the limit or no last_createdon, we're done
+                if rows_processed < limit or not last_createdon:
+                    break
+
+                # Update parameters for next iteration
+                if start_date != last_createdon:
+                    start_date = last_createdon
+                    offset = 0
+                else:
+                    offset += limit
+
+                logger.info(
+                    f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error processing date range {start_date} to {end_date}: {str(e)}"
+                )
+                # Re-raise the exception after logging
+                raise
+
+    def get_all_aspects(
         self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+        logger.info("Fetching Structured properties aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=True,
+        )
+
+        logger.info(
+            f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+        )
+
+        time.sleep(
+            self.config.structured_properties_template_cache_invalidation_interval
+        )
+
+        logger.info("Fetching aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=False,
+        )
+
+    def get_aspects(
+        self,
+        from_createdon: datetime,
+        stop_time: datetime,
+        set_structured_properties_filter: bool = False,
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
         orderer = VersionOrderer[Dict[str, Any]](
             enabled=self.config.include_all_versions
         )
-        rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
+        rows = self._get_rows(
+            start_date=from_createdon,
+            end_date=stop_time,
+            set_structured_properties_filter=set_structured_properties_filter,
+            limit=self.config.database_query_batch_size,
+        )
         for row in orderer(rows):
             mcp = self._parse_row(row)
             if mcp:
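
Stripped of the SQL and logging, the paging loop in _get_rows alternates between advancing the time window and advancing an offset. A simplified standalone sketch of that control flow (fetch_page stands in for the server-side-cursor query and is not part of the package):

def paginate(fetch_page, start, limit):
    """fetch_page(start, offset, limit) -> rows ordered by row['createdon'], at most `limit` of them."""
    offset = 0
    while True:
        rows = list(fetch_page(start, offset, limit))
        yield from rows
        if len(rows) < limit:
            break  # short page: nothing left in the window
        last_createdon = rows[-1]["createdon"]
        if last_createdon != start:
            # Time-based step: restart the window at the newest timestamp seen
            start, offset = last_createdon, 0
        else:
            # Whole page shared one timestamp: fall back to offset-based paging
            offset += limit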
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:
 
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
-        Fetches all soft-deleted entities from the database.
+        Fetches all soft-deleted entities from the database using pagination.
 
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn, contextlib.closing(
-            conn.connection.cursor()
-        ) as cursor:
-            logger.debug("Polling soft-deleted urns from database")
-            cursor.execute(self.soft_deleted_urns_query)
-            columns = [desc[0] for desc in cursor.description]
-            while True:
-                rows = cursor.fetchmany(self.config.database_query_batch_size)
-                if not rows:
-                    return
-                for row in rows:
-                    yield dict(zip(columns, row))
+        try:
+            params: Dict = {}
+
+            logger.debug("Fetching soft-deleted URNs")
+
+            # Use server-side cursor implementation
+            rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+            processed_rows = 0
+            # Process and yield rows
+            for row in rows:
+                processed_rows += 1
+                yield row
+
+            logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+        except Exception:
+            logger.exception("Error fetching soft-deleted row", exc_info=True)
+            raise
 
     def _parse_row(
         self, row: Dict[str, Any]
@@ -221,7 +374,6 @@ class DataHubDatabaseReader:
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
datahub/ingestion/source/datahub/datahub_source.py

@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
         progress = ProgressTimer(report_every=timedelta(seconds=60))
-        mcps = reader.get_aspects(from_createdon, self.report.stop_time)
+        mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue