acryl-datahub 1.1.0.3rc1__py3-none-any.whl → 1.1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (68)
  1. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/METADATA +2474 -2474
  2. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/RECORD +68 -68
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +27 -0
  5. datahub/cli/delete_cli.py +117 -19
  6. datahub/emitter/rest_emitter.py +18 -1
  7. datahub/ingestion/api/source.py +2 -0
  8. datahub/ingestion/glossary/classification_mixin.py +5 -0
  9. datahub/ingestion/graph/client.py +42 -2
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
  11. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  12. datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
  13. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  14. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  15. datahub/ingestion/source/dremio/dremio_api.py +98 -68
  16. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  17. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  18. datahub/ingestion/source/dremio/dremio_source.py +90 -77
  19. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  20. datahub/ingestion/source/file.py +3 -0
  21. datahub/ingestion/source/ge_data_profiler.py +48 -8
  22. datahub/ingestion/source/ge_profiling_config.py +11 -0
  23. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  24. datahub/ingestion/source/kafka/kafka.py +16 -0
  25. datahub/ingestion/source/looker/looker_source.py +1 -0
  26. datahub/ingestion/source/powerbi/powerbi.py +1 -0
  27. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  28. datahub/ingestion/source/redshift/redshift.py +21 -1
  29. datahub/ingestion/source/sac/sac.py +3 -1
  30. datahub/ingestion/source/sigma/sigma.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
  32. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  33. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  35. datahub/ingestion/source/sql/clickhouse.py +3 -1
  36. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  37. datahub/ingestion/source/sql/hana.py +3 -1
  38. datahub/ingestion/source/sql/hive_metastore.py +3 -1
  39. datahub/ingestion/source/sql/mariadb.py +0 -1
  40. datahub/ingestion/source/sql/mssql/source.py +8 -1
  41. datahub/ingestion/source/sql/mysql.py +0 -1
  42. datahub/ingestion/source/sql/postgres.py +0 -1
  43. datahub/ingestion/source/sql/sql_common.py +12 -0
  44. datahub/ingestion/source/superset.py +1 -1
  45. datahub/ingestion/source/tableau/tableau.py +1 -0
  46. datahub/ingestion/source/unity/source.py +1 -0
  47. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  48. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  49. datahub/metadata/_internal_schema_classes.py +25 -0
  50. datahub/metadata/schema.avsc +18 -1
  51. datahub/metadata/schemas/ContainerProperties.avsc +6 -0
  52. datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
  53. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  54. datahub/metadata/schemas/DataJobInfo.avsc +6 -0
  55. datahub/metadata/schemas/DataProcessKey.avsc +6 -0
  56. datahub/metadata/schemas/DatasetKey.avsc +6 -0
  57. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
  58. datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
  59. datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
  60. datahub/metadata/schemas/MLModelKey.avsc +6 -0
  61. datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
  62. datahub/sdk/main_client.py +9 -10
  63. datahub/sql_parsing/sqlglot_lineage.py +22 -0
  64. datahub/utilities/stats_collections.py +4 -0
  65. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/WHEEL +0 -0
  66. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/entry_points.txt +0 -0
  67. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/licenses/LICENSE +0 -0
  68. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -368,6 +382,8 @@ def by_filter(
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +402,7 @@ def by_filter(
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +434,27 @@ def by_filter(
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            # Add children urns to the list.
-            if guess_entity_type(urn) == "dataPlatformInstance":
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        platform_instance=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
-            else:
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        container=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
 elif urn_file:
     with open(urn_file, "r") as r:
         urns = []
@@ -557,6 +575,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +611,12 @@ def _validate_user_urn_and_filters(
             f"This will only delete {urn}. Use --recursive to delete all contained entities."
         )

+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+

 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -738,3 +763,76 @@ def _delete_one_urn(
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
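
The core of the new --streaming-batch mode is that child URNs are drained from the search iterator in fixed-size chunks instead of being collected into one large list up front, so a long-running recursive delete makes progress batch by batch and can be resumed if interrupted. A minimal, self-contained sketch of that accumulation pattern, with an illustrative flush callback standing in for _delete_urns_parallel:

from typing import Callable, Iterable, List


def delete_in_batches(
    child_urns: Iterable[str],
    flush: Callable[[List[str]], None],
    batch_size: int = 12000,
) -> None:
    # Accumulate URNs from the (possibly very large) iterator and flush every
    # `batch_size` items, so the hierarchy never has to be fully enumerated
    # before the first deletions happen, and an interrupted run loses at most
    # one unflushed batch.
    batch: List[str] = []
    for urn in child_urns:
        batch.append(urn)
        if len(batch) >= batch_size:
            flush(batch)
            batch = []
    if batch:
        flush(batch)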

datahub/emitter/rest_emitter.py CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -104,6 +105,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
 class EmitMode(ConfigEnum):
     # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
     # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -611,7 +628,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"

-        mcp_obj = pre_json_transform(mcp.to_obj())
+        mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
         payload_dict = {
             "proposal": mcp_obj,
             "async": "true"

datahub/ingestion/api/source.py CHANGED
@@ -76,6 +76,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"


 class StructuredLogLevel(Enum):
@@ -247,6 +248,7 @@ class SourceReport(Report):
                 self.aspect_urn_samples[entityType][
                     "fineGrainedLineages"
                 ].append(urn)
+                self.aspects[entityType]["fineGrainedLineages"] += 1

     def report_warning(
         self,

datahub/ingestion/glossary/classification_mixin.py CHANGED
@@ -90,6 +90,11 @@ class ClassificationHandler:

     def get_classifiers(self) -> List[Classifier]:
         classifiers = []
+        if (
+            not isinstance(self.config, ClassificationSourceConfigMixin)
+            or self.config.classification is None
+        ):
+            return classifiers

         for classifier in self.config.classification.classifiers:
             classifier_class = classifier_registry.get(classifier.type)

datahub/ingestion/graph/client.py CHANGED
@@ -906,6 +906,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         batch_size: int = 5000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
+        skip_cache: bool = False,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -924,6 +925,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
            Note that this requires browsePathV2 aspects (added in 0.10.4+).
         :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities.
         :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters.
+        :param skip_cache: Whether to bypass caching. Defaults to False.

         :return: An iterable of urns that match the filters.
         """
@@ -951,7 +953,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             $query: String!,
             $orFilters: [AndFilterInput!],
             $batchSize: Int!,
-            $scrollId: String) {
+            $scrollId: String,
+            $skipCache: Boolean!) {

             scrollAcrossEntities(input: {
                 query: $query,
@@ -962,6 +965,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 searchFlags: {
                     skipHighlighting: true
                     skipAggregates: true
+                    skipCache: $skipCache
                 }
             }) {
                 nextScrollId
@@ -980,6 +984,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": orFilters,
             "batchSize": batch_size,
+            "skipCache": skip_cache,
         }

         for entity in self._scroll_across_entities(graphql_query, variables):
@@ -1085,7 +1090,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": or_filters_final,
             "batchSize": batch_size,
-            "skipCache": "true" if skip_cache else "false",
+            "skipCache": skip_cache,
             "fetchExtraFields": extra_source_fields,
         }

@@ -1429,6 +1434,41 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects

+    def restore_indices(
+        self,
+        urn_pattern: str,
+        aspect: Optional[str] = None,
+        start: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> str:
+        """Restore the indices for a given urn or urn-like pattern.
+
+        Args:
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            aspect: Optional aspect string to restore indices for a specific aspect.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+
+        Returns:
+            A string containing the result of the restore indices operation. This format is subject to change.
+        """
+        if "%" in urn_pattern:
+            payload_obj: dict = {"urnLike": urn_pattern}
+        else:
+            payload_obj = {"urn": urn_pattern}
+        if aspect is not None:
+            payload_obj["aspect"] = aspect
+        if start is not None:
+            payload_obj["start"] = start
+        if batch_size is not None:
+            payload_obj["batchSize"] = batch_size
+        raw_result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        result = raw_result["value"]
+        logger.debug(f"Restore indices result: {result}")
+        return result
+
     @functools.lru_cache
     def _make_schema_resolver(
         self,
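
A hedged usage sketch for the new DataHubGraph.restore_indices helper; the server address and URN pattern below are placeholders:

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Restore indices for all URNs matching a SQL-style pattern (% is the wildcard,
# as described in the docstring above); batch_size controls rows per pass.
result = graph.restore_indices(
    urn_pattern="urn:li:dataset:%",
    batch_size=100,
)
print(result)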

datahub/ingestion/source/bigquery_v2/bigquery.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional

+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -99,6 +100,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -241,7 +243,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]

+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
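
The deprecation warning above fires when schema_pattern has been customized while match_fully_qualified_names is still explicitly False. A hedged example of the recommended config shape going forward (project and dataset names are illustrative):

# Fragment of a BigQuery ingestion recipe's source config, expressed as a Python
# dict; schema_pattern entries are matched against "<database_name>.<schema_name>".
bigquery_source_config = {
    "match_fully_qualified_names": True,
    "schema_pattern": {
        "allow": ["my-project\\.analytics_.*"],
    },
}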

datahub/ingestion/source/bigquery_v2/common.py CHANGED
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )

     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email.split("@")[0])
+        return make_user_urn(user_email)

     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
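
The gen_user_urn change means BigQuery user emails are no longer truncated at the @ when building corpuser URNs; for example (illustrative address):

from datahub.emitter.mce_builder import make_user_urn

# Previously the username portion was extracted first:
#   make_user_urn("jane.doe@example.com".split("@")[0])  -> "urn:li:corpuser:jane.doe"
# Now the full email is preserved in the URN:
print(make_user_urn("jane.doe@example.com"))  # urn:li:corpuser:jane.doe@example.com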

datahub/ingestion/source/dbt/dbt_cloud.py CHANGED
@@ -9,7 +9,9 @@ import requests
 from pydantic import Field, root_validator

 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -261,6 +263,7 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig


datahub/ingestion/source/dbt/dbt_common.py CHANGED
@@ -823,7 +823,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,

datahub/ingestion/source/dbt/dbt_core.py CHANGED
@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport

datahub/ingestion/source/dremio/dremio_api.py CHANGED
@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

@@ -54,6 +55,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -233,47 +236,71 @@ class DremioAPIOperations:

     def get(self, url: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.get(
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"GET request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["GET " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.get(
+                url=(self.base_url + url),
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+            self.report.api_call_secs_by_method_and_path["GET " + url] += (
+                timer.elapsed_seconds()
+            )
+            # response.raise_for_status() # Enabling this line, makes integration tests to fail
+            return response.json()

     def post(self, url: str, data: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.post(
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"POST request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["POST " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.post(
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+            self.report.api_call_secs_by_method_and_path["POST " + url] += (
+                timer.elapsed_seconds()
+            )
+            # response.raise_for_status() # Enabling this line, makes integration tests to fail
+            return response.json()

     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            response = self.post(url="/sql", data=json.dumps({"sql": query}))
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))

-            if "errorMessage" in response:
-                self.report.failure(
-                    message="SQL Error", context=f"{response['errorMessage']}"
-                )
-                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")

-            job_id = response["id"]
+                job_id = response["id"]

-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.fetch_results, job_id)
-                try:
-                    return future.result(timeout=timeout)
-                except concurrent.futures.TimeoutError:
-                    self.cancel_query(job_id)
-                    raise DremioAPIException(
-                        f"Query execution timed out after {timeout} seconds"
-                    ) from None
-                except RuntimeError as e:
-                    raise DremioAPIException() from e
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e

         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
@@ -603,10 +630,25 @@ class DremioAPIOperations:
         return parents_list

     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )

         return self.execute_query(query=jobs_query)

@@ -685,6 +727,27 @@ class DremioAPIOperations:

         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +774,8 @@

         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            # For patterns with wildcards, check if this path is a parent of the pattern
-            if "*" in pattern:
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
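
To make the new Dremio container filtering concrete, here is the prefix check from _could_match_pattern replayed standalone with an illustrative allow pattern and a partially-walked container path:

pattern = "marketing.kpis.*"      # illustrative allow pattern from schema_pattern.allow
path_components = ["marketing"]   # container path seen while walking the source tree

current_path = ".".join(path_components)
pattern_prefix = pattern[:-2]     # strip the trailing ".*"

# Keep descending if either string is a prefix of the other: "marketing" is a
# prefix of "marketing.kpis", so this intermediate folder is still scanned even
# though it does not yet match the full pattern.
could_match = current_path.lower().startswith(
    pattern_prefix.lower()
) or pattern_prefix.lower().startswith(current_path.lower())
print(could_match)  # True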