acryl-datahub 1.1.0.3rc2__py3-none-any.whl → 1.1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/METADATA +2542 -2542
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/RECORD +66 -66
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +27 -0
- datahub/cli/delete_cli.py +117 -19
- datahub/emitter/rest_emitter.py +18 -1
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +42 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/dremio/dremio_api.py +98 -68
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +90 -77
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/ge_data_profiler.py +48 -8
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/powerbi/powerbi.py +1 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/sigma/sigma.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -1
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -1
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +12 -0
- datahub/ingestion/source/tableau/tableau.py +1 -0
- datahub/ingestion/source/unity/source.py +1 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/metadata/_internal_schema_classes.py +25 -0
- datahub/metadata/schema.avsc +18 -1
- datahub/metadata/schemas/ContainerProperties.avsc +6 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +6 -0
- datahub/metadata/schemas/DataProcessKey.avsc +6 -0
- datahub/metadata/schemas/DatasetKey.avsc +6 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
- datahub/metadata/schemas/MLModelKey.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
- datahub/sql_parsing/sqlglot_lineage.py +21 -6
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py
CHANGED
@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -368,6 +382,8 @@ def by_filter(
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +402,7 @@ def by_filter(
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +434,27 @@ def by_filter(
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            [original inline recursive-deletion block; its lines are not preserved in this diff view]
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
     elif urn_file:
         with open(urn_file, "r") as r:
             urns = []
@@ -557,6 +575,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +611,12 @@ def _validate_user_urn_and_filters(
                 f"This will only delete {urn}. Use --recursive to delete all contained entities."
             )

+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+

 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -738,3 +763,76 @@ def _delete_one_urn(
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
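The new `--streaming-batch` mode changes recursive deletion from "collect every child URN up front, then delete" to "delete in fixed-size batches while still iterating", which is what makes very large container or platform-instance hierarchies resumable. A hedged invocation might look like `datahub delete --urn 'urn:li:container:...' --recursive --streaming-batch --streaming-batch-size 5000 --dry-run` (the URN is a placeholder). The batching pattern itself is small enough to sketch standalone; the helper names below (`fake_urn_iterator`, `delete_batch`) are illustrative stand-ins, not DataHub APIs:

# Minimal sketch of the batching pattern used by _delete_urns_streaming_recursive:
# accumulate URNs from a (possibly very large) iterator and flush every
# `streaming_batch_size` items instead of materializing the whole list first.
from typing import Iterable, List


def fake_urn_iterator() -> Iterable[str]:
    # Stand-in for graph.get_urns_by_filter(..., skip_cache=True)
    for i in range(25):
        yield f"urn:li:dataset:example-{i}"


def delete_batch(urns: List[str]) -> None:
    # Stand-in for _delete_urns_parallel(...)
    print(f"deleting batch of {len(urns)} urns")


streaming_batch_size = 10
batch: List[str] = []
for urn in fake_urn_iterator():
    batch.append(urn)
    if len(batch) >= streaming_batch_size:
        delete_batch(batch)
        batch = []
if batch:  # flush the remainder (the real code also appends the parent urn last)
    delete_batch(batch)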
datahub/emitter/rest_emitter.py
CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -104,6 +105,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
 class EmitMode(ConfigEnum):
     # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
     # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -611,7 +628,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"

-        mcp_obj = pre_json_transform(mcp.to_obj())
+        mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
         payload_dict = {
             "proposal": mcp_obj,
             "async": "true"
datahub/ingestion/api/source.py
CHANGED
@@ -76,6 +76,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"


 class StructuredLogLevel(Enum):
@@ -247,6 +248,7 @@ class SourceReport(Report):
             self.aspect_urn_samples[entityType][
                 "fineGrainedLineages"
             ].append(urn)
+            self.aspects[entityType]["fineGrainedLineages"] += 1

     def report_warning(
         self,
datahub/ingestion/glossary/classification_mixin.py
CHANGED
@@ -90,6 +90,11 @@ class ClassificationHandler:

     def get_classifiers(self) -> List[Classifier]:
         classifiers = []
+        if (
+            not isinstance(self.config, ClassificationSourceConfigMixin)
+            or self.config.classification is None
+        ):
+            return classifiers

         for classifier in self.config.classification.classifiers:
             classifier_class = classifier_registry.get(classifier.type)
datahub/ingestion/graph/client.py
CHANGED
@@ -906,6 +906,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         batch_size: int = 5000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
+        skip_cache: bool = False,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -924,6 +925,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
        Note that this requires browsePathV2 aspects (added in 0.10.4+).
        :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities.
        :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters.
+       :param skip_cache: Whether to bypass caching. Defaults to False.

        :return: An iterable of urns that match the filters.
        """
@@ -951,7 +953,8 @@
            $query: String!,
            $orFilters: [AndFilterInput!],
            $batchSize: Int!,
-           $scrollId: String
+           $scrollId: String,
+           $skipCache: Boolean!) {

            scrollAcrossEntities(input: {
                query: $query,
@@ -962,6 +965,7 @@
                searchFlags: {
                    skipHighlighting: true
                    skipAggregates: true
+                   skipCache: $skipCache
                }
            }) {
                nextScrollId
@@ -980,6 +984,7 @@
            "query": query,
            "orFilters": orFilters,
            "batchSize": batch_size,
+           "skipCache": skip_cache,
        }

        for entity in self._scroll_across_entities(graphql_query, variables):
@@ -1085,7 +1090,7 @@
            "query": query,
            "orFilters": or_filters_final,
            "batchSize": batch_size,
-           "skipCache":
+           "skipCache": skip_cache,
            "fetchExtraFields": extra_source_fields,
        }

@@ -1429,6 +1434,41 @@
        related_aspects = response.get("relatedAspects", [])
        return reference_count, related_aspects

+    def restore_indices(
+        self,
+        urn_pattern: str,
+        aspect: Optional[str] = None,
+        start: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> str:
+        """Restore the indices for a given urn or urn-like pattern.
+
+        Args:
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            aspect: Optional aspect string to restore indices for a specific aspect.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+
+        Returns:
+            A string containing the result of the restore indices operation. This format is subject to change.
+        """
+        if "%" in urn_pattern:
+            payload_obj: dict = {"urnLike": urn_pattern}
+        else:
+            payload_obj = {"urn": urn_pattern}
+        if aspect is not None:
+            payload_obj["aspect"] = aspect
+        if start is not None:
+            payload_obj["start"] = start
+        if batch_size is not None:
+            payload_obj["batchSize"] = batch_size
+        raw_result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        result = raw_result["value"]
+        logger.debug(f"Restore indices result: {result}")
+        return result
+
     @functools.lru_cache
     def _make_schema_resolver(
         self,
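Taken together, the two graph-client additions (the `skip_cache` flag on `get_urns_by_filter` and the new `restore_indices` helper) are straightforward to exercise. A hedged usage sketch, with the server URL and URNs as placeholders:

# Usage sketch for the client.py additions above; connection details are placeholders.
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Iterate over contained URNs while bypassing the search cache, the same way the
# streaming recursive delete does so that it can resume where it left off.
for urn in graph.get_urns_by_filter(
    container="urn:li:container:example",  # placeholder container URN
    skip_cache=True,
    batch_size=1000,
):
    print(urn)

# Restore search indices for an exact URN or a %-wildcard pattern.
result = graph.restore_indices("urn:li:dataset:%", batch_size=10)
print(result)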
datahub/ingestion/source/bigquery_v2/bigquery.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional

+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -99,6 +100,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -241,7 +243,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]

+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
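The deprecation warning fires only when `match_fully_qualified_names` is explicitly disabled while a non-default `schema_pattern` is set. A minimal sketch of that condition, using assumed standalone values rather than a real `BigQueryV2Config`:

# Sketch of the deprecation-warning condition above; the config values here are assumptions.
from datahub.configuration.common import AllowDenyPattern

schema_pattern = AllowDenyPattern(allow=["my_project.my_dataset"])  # non-default pattern
match_fully_qualified_names = False  # explicitly disabled (legacy behavior)

if (
    match_fully_qualified_names is not None
    and not match_fully_qualified_names
    and schema_pattern is not None
    and schema_pattern != AllowDenyPattern.allow_all()
):
    print("Deprecation warning would be emitted")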
datahub/ingestion/source/bigquery_v2/common.py
CHANGED
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
     )

     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email
+        return make_user_urn(user_email)

     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
datahub/ingestion/source/dbt/dbt_cloud.py
CHANGED
@@ -9,7 +9,9 @@ import requests
 from pydantic import Field, root_validator

 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -261,6 +263,7 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig

datahub/ingestion/source/dbt/dbt_common.py
CHANGED
@@ -823,7 +823,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
datahub/ingestion/source/dbt/dbt_core.py
CHANGED
@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport
datahub/ingestion/source/dremio/dremio_api.py
CHANGED
@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

@@ -54,6 +55,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -233,47 +236,71 @@ class DremioAPIOperations:

     def get(self, url: str) -> Dict:
         """execute a get request on dremio"""
-        [previous implementation; lines not preserved in this diff view]
+        logger.debug(f"GET request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["GET " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.get(
+                url=(self.base_url + url),
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["GET " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status()  # Enabling this line, makes integration tests to fail
+        return response.json()

     def post(self, url: str, data: str) -> Dict:
         """execute a get request on dremio"""
-        [previous implementation; lines not preserved in this diff view]
+        logger.debug(f"POST request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["POST " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.post(
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["POST " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status()  # Enabling this line, makes integration tests to fail
+        return response.json()

     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            [previous implementation; lines not preserved in this diff view]
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
+
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+
+                job_id = response["id"]
+
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e

         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
@@ -603,10 +630,25 @@
         return parents_list

     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )

         return self.execute_query(query=jobs_query)

@@ -685,6 +727,27 @@

         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +774,8 @@

         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            #
-            if
-            pattern_parts = pattern.split(".")
-            path_parts = path_components
-
-            # If pattern has exact same number of parts, check each component
-            if len(pattern_parts) == len(path_parts):
-                matches = True
-                for p_part, c_part in zip(pattern_parts, path_parts):
-                    if p_part != "*" and p_part.lower() != c_part.lower():
-                        matches = False
-                        break
-                if matches:
-                    self.report.report_container_scanned(full_path)
-                    return True
-            # Otherwise check if current path is prefix match
-            else:
-                # Remove the trailing wildcard if present
-                if pattern_parts[-1] == "*":
-                    pattern_parts = pattern_parts[:-1]
-
-                for i in range(len(path_parts)):
-                    current_path = ".".join(path_parts[: i + 1])
-                    pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                    if pattern_prefix.startswith(current_path):
-                        self.report.report_container_scanned(full_path)
-                        return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True

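The container-filtering rewrite hinges on the prefix test in `_could_match_pattern`: a container is kept if its dotted path is either already under the pattern prefix or is an ancestor that could still lead to a match deeper down. A simplified standalone sketch of that check (complex regex patterns are delegated to the existing regex matching in the source, so this sketch only covers the simple "a.b.*" case):

# Simplified sketch of the "could match" prefix check used for container filtering.
from typing import List


def could_match(pattern: str, path_components: List[str]) -> bool:
    if pattern == ".*":
        return True
    current_path = ".".join(path_components)
    if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
        prefix = pattern[:-2]  # strip the trailing ".*"
        # Either the path is already under the pattern prefix, or it is an
        # ancestor container that could still lead to a match deeper down.
        return current_path.lower().startswith(
            prefix.lower()
        ) or prefix.lower().startswith(current_path.lower())
    return False  # complex regex patterns handled elsewhere in the real code


print(could_match("finance.sales.*", ["finance"]))                 # True (ancestor)
print(could_match("finance.sales.*", ["finance", "sales", "q1"]))  # True (descendant)
print(could_match("finance.sales.*", ["marketing"]))               # False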