acryl-datahub 1.1.0.3rc1__py3-none-any.whl → 1.1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (68)
  1. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/METADATA +2474 -2474
  2. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/RECORD +68 -68
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +27 -0
  5. datahub/cli/delete_cli.py +117 -19
  6. datahub/emitter/rest_emitter.py +18 -1
  7. datahub/ingestion/api/source.py +2 -0
  8. datahub/ingestion/glossary/classification_mixin.py +5 -0
  9. datahub/ingestion/graph/client.py +42 -2
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
  11. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  12. datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
  13. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  14. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  15. datahub/ingestion/source/dremio/dremio_api.py +98 -68
  16. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  17. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  18. datahub/ingestion/source/dremio/dremio_source.py +90 -77
  19. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  20. datahub/ingestion/source/file.py +3 -0
  21. datahub/ingestion/source/ge_data_profiler.py +48 -8
  22. datahub/ingestion/source/ge_profiling_config.py +11 -0
  23. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  24. datahub/ingestion/source/kafka/kafka.py +16 -0
  25. datahub/ingestion/source/looker/looker_source.py +1 -0
  26. datahub/ingestion/source/powerbi/powerbi.py +1 -0
  27. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  28. datahub/ingestion/source/redshift/redshift.py +21 -1
  29. datahub/ingestion/source/sac/sac.py +3 -1
  30. datahub/ingestion/source/sigma/sigma.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
  32. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  33. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  35. datahub/ingestion/source/sql/clickhouse.py +3 -1
  36. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  37. datahub/ingestion/source/sql/hana.py +3 -1
  38. datahub/ingestion/source/sql/hive_metastore.py +3 -1
  39. datahub/ingestion/source/sql/mariadb.py +0 -1
  40. datahub/ingestion/source/sql/mssql/source.py +8 -1
  41. datahub/ingestion/source/sql/mysql.py +0 -1
  42. datahub/ingestion/source/sql/postgres.py +0 -1
  43. datahub/ingestion/source/sql/sql_common.py +12 -0
  44. datahub/ingestion/source/superset.py +1 -1
  45. datahub/ingestion/source/tableau/tableau.py +1 -0
  46. datahub/ingestion/source/unity/source.py +1 -0
  47. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  48. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  49. datahub/metadata/_internal_schema_classes.py +25 -0
  50. datahub/metadata/schema.avsc +18 -1
  51. datahub/metadata/schemas/ContainerProperties.avsc +6 -0
  52. datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
  53. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  54. datahub/metadata/schemas/DataJobInfo.avsc +6 -0
  55. datahub/metadata/schemas/DataProcessKey.avsc +6 -0
  56. datahub/metadata/schemas/DatasetKey.avsc +6 -0
  57. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
  58. datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
  59. datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
  60. datahub/metadata/schemas/MLModelKey.avsc +6 -0
  61. datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
  62. datahub/sdk/main_client.py +9 -10
  63. datahub/sql_parsing/sqlglot_lineage.py +22 -0
  64. datahub/utilities/stats_collections.py +4 -0
  65. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/WHEEL +0 -0
  66. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/entry_points.txt +0 -0
  67. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/licenses/LICENSE +0 -0
  68. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -368,6 +382,8 @@ def by_filter(
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +402,7 @@ def by_filter(
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +434,27 @@ def by_filter(
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            # Add children urns to the list.
-            if guess_entity_type(urn) == "dataPlatformInstance":
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        platform_instance=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
-            else:
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        container=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
 elif urn_file:
     with open(urn_file, "r") as r:
         urns = []
@@ -557,6 +575,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +611,12 @@ def _validate_user_urn_and_filters(
             f"This will only delete {urn}. Use --recursive to delete all contained entities."
         )

+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+

 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -738,3 +763,76 @@ def _delete_one_urn(
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
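
The core of the new --streaming-batch mode is that child URNs are drained from the search iterator in fixed-size chunks instead of being collected into one large list up front, so a long-running recursive delete makes progress batch by batch and can be resumed if interrupted. A minimal, self-contained sketch of that accumulation pattern, with an illustrative flush callback standing in for _delete_urns_parallel:

from typing import Callable, Iterable, List


def delete_in_batches(
    child_urns: Iterable[str],
    flush: Callable[[List[str]], None],
    batch_size: int = 12000,
) -> None:
    # Accumulate URNs from the (possibly very large) iterator and flush every
    # `batch_size` items, so the hierarchy never has to be fully enumerated
    # before the first deletions happen, and an interrupted run loses at most
    # one unflushed batch.
    batch: List[str] = []
    for urn in child_urns:
        batch.append(urn)
        if len(batch) >= batch_size:
            flush(batch)
            batch = []
    if batch:
        flush(batch)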

datahub/emitter/rest_emitter.py CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -104,6 +105,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
 class EmitMode(ConfigEnum):
     # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
     # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -611,7 +628,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"

-        mcp_obj = pre_json_transform(mcp.to_obj())
+        mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
         payload_dict = {
             "proposal": mcp_obj,
             "async": "true"

datahub/ingestion/api/source.py CHANGED
@@ -76,6 +76,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"


 class StructuredLogLevel(Enum):
@@ -247,6 +248,7 @@ class SourceReport(Report):
                 self.aspect_urn_samples[entityType][
                     "fineGrainedLineages"
                 ].append(urn)
+                self.aspects[entityType]["fineGrainedLineages"] += 1

     def report_warning(
         self,

datahub/ingestion/glossary/classification_mixin.py CHANGED
@@ -90,6 +90,11 @@ class ClassificationHandler:

     def get_classifiers(self) -> List[Classifier]:
         classifiers = []
+        if (
+            not isinstance(self.config, ClassificationSourceConfigMixin)
+            or self.config.classification is None
+        ):
+            return classifiers

         for classifier in self.config.classification.classifiers:
             classifier_class = classifier_registry.get(classifier.type)

datahub/ingestion/graph/client.py CHANGED
@@ -906,6 +906,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         batch_size: int = 5000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
+        skip_cache: bool = False,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -924,6 +925,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
            Note that this requires browsePathV2 aspects (added in 0.10.4+).
         :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities.
         :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters.
+        :param skip_cache: Whether to bypass caching. Defaults to False.

         :return: An iterable of urns that match the filters.
         """
@@ -951,7 +953,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             $query: String!,
             $orFilters: [AndFilterInput!],
             $batchSize: Int!,
-            $scrollId: String) {
+            $scrollId: String,
+            $skipCache: Boolean!) {

             scrollAcrossEntities(input: {
                 query: $query,
@@ -962,6 +965,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 searchFlags: {
                     skipHighlighting: true
                     skipAggregates: true
+                    skipCache: $skipCache
                 }
             }) {
                 nextScrollId
@@ -980,6 +984,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": orFilters,
             "batchSize": batch_size,
+            "skipCache": skip_cache,
         }

         for entity in self._scroll_across_entities(graphql_query, variables):
@@ -1085,7 +1090,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": or_filters_final,
             "batchSize": batch_size,
-            "skipCache": "true" if skip_cache else "false",
+            "skipCache": skip_cache,
             "fetchExtraFields": extra_source_fields,
         }

@@ -1429,6 +1434,41 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects

+    def restore_indices(
+        self,
+        urn_pattern: str,
+        aspect: Optional[str] = None,
+        start: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> str:
+        """Restore the indices for a given urn or urn-like pattern.
+
+        Args:
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            aspect: Optional aspect string to restore indices for a specific aspect.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+
+        Returns:
+            A string containing the result of the restore indices operation. This format is subject to change.
+        """
+        if "%" in urn_pattern:
+            payload_obj: dict = {"urnLike": urn_pattern}
+        else:
+            payload_obj = {"urn": urn_pattern}
+        if aspect is not None:
+            payload_obj["aspect"] = aspect
+        if start is not None:
+            payload_obj["start"] = start
+        if batch_size is not None:
+            payload_obj["batchSize"] = batch_size
+        raw_result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        result = raw_result["value"]
+        logger.debug(f"Restore indices result: {result}")
+        return result
+
     @functools.lru_cache
     def _make_schema_resolver(
         self,
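
A hedged usage sketch for the new DataHubGraph.restore_indices helper; the server address and URN pattern below are placeholders:

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Restore indices for all URNs matching a SQL-style pattern (% is the wildcard,
# as described in the docstring above); batch_size controls rows per pass.
result = graph.restore_indices(
    urn_pattern="urn:li:dataset:%",
    batch_size=100,
)
print(result)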

datahub/ingestion/source/bigquery_v2/bigquery.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional

+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -99,6 +100,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -241,7 +243,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]

+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
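
The deprecation warning above fires when schema_pattern has been customized while match_fully_qualified_names is still explicitly False. A hedged example of the recommended config shape going forward (project and dataset names are illustrative):

# Fragment of a BigQuery ingestion recipe's source config, expressed as a Python
# dict; schema_pattern entries are matched against "<database_name>.<schema_name>".
bigquery_source_config = {
    "match_fully_qualified_names": True,
    "schema_pattern": {
        "allow": ["my-project\\.analytics_.*"],
    },
}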

datahub/ingestion/source/bigquery_v2/common.py CHANGED
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )

     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email.split("@")[0])
+        return make_user_urn(user_email)

     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
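
The gen_user_urn change means BigQuery user emails are no longer truncated at the @ when building corpuser URNs; for example (illustrative address):

from datahub.emitter.mce_builder import make_user_urn

# Previously the username portion was extracted first:
#   make_user_urn("jane.doe@example.com".split("@")[0])  -> "urn:li:corpuser:jane.doe"
# Now the full email is preserved in the URN:
print(make_user_urn("jane.doe@example.com"))  # urn:li:corpuser:jane.doe@example.com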

datahub/ingestion/source/dbt/dbt_cloud.py CHANGED
@@ -9,7 +9,9 @@ import requests
 from pydantic import Field, root_validator

 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -261,6 +263,7 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig


datahub/ingestion/source/dbt/dbt_common.py CHANGED
@@ -823,7 +823,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,

datahub/ingestion/source/dbt/dbt_core.py CHANGED
@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport

datahub/ingestion/source/dremio/dremio_api.py CHANGED
@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

@@ -54,6 +55,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -233,47 +236,71 @@ class DremioAPIOperations:

     def get(self, url: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.get(
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"GET request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["GET " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.get(
+                url=(self.base_url + url),
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+            self.report.api_call_secs_by_method_and_path["GET " + url] += (
+                timer.elapsed_seconds()
+            )
+            # response.raise_for_status() # Enabling this line, makes integration tests to fail
+            return response.json()

     def post(self, url: str, data: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.post(
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"POST request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["POST " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.post(
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+            self.report.api_call_secs_by_method_and_path["POST " + url] += (
+                timer.elapsed_seconds()
+            )
+            # response.raise_for_status() # Enabling this line, makes integration tests to fail
+            return response.json()

     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            response = self.post(url="/sql", data=json.dumps({"sql": query}))
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))

-            if "errorMessage" in response:
-                self.report.failure(
-                    message="SQL Error", context=f"{response['errorMessage']}"
-                )
-                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")

-            job_id = response["id"]
+                job_id = response["id"]

-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.fetch_results, job_id)
-                try:
-                    return future.result(timeout=timeout)
-                except concurrent.futures.TimeoutError:
-                    self.cancel_query(job_id)
-                    raise DremioAPIException(
-                        f"Query execution timed out after {timeout} seconds"
-                    ) from None
-                except RuntimeError as e:
-                    raise DremioAPIException() from e
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e

         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
@@ -603,10 +630,25 @@ class DremioAPIOperations:
         return parents_list

     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )

         return self.execute_query(query=jobs_query)

@@ -685,6 +727,27 @@ class DremioAPIOperations:

         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +774,8 @@

         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            # For patterns with wildcards, check if this path is a parent of the pattern
-            if "*" in pattern:
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
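
To make the new Dremio container filtering concrete, here is the prefix check from _could_match_pattern replayed standalone with an illustrative allow pattern and a partially-walked container path:

pattern = "marketing.kpis.*"      # illustrative allow pattern from schema_pattern.allow
path_components = ["marketing"]   # container path seen while walking the source tree

current_path = ".".join(path_components)
pattern_prefix = pattern[:-2]     # strip the trailing ".*"

# Keep descending if either string is a prefix of the other: "marketing" is a
# prefix of "marketing.kpis", so this intermediate folder is still scanned even
# though it does not yet match the full pattern.
could_match = current_path.lower().startswith(
    pattern_prefix.lower()
) or pattern_prefix.lower().startswith(current_path.lower())
print(could_match)  # True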