acryl-datahub 0.15.0rc18__py3-none-any.whl → 0.15.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/METADATA +2486 -2486
- {acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/RECORD +14 -14
- datahub/__init__.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +193 -86
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/metadata/schema.avsc +4 -0
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- {acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/top_level.txt +0 -0
|
{acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/RECORD
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=fYgu28dsndrekGv9Pq_ENw7G6Erm7qtsY5H6W3cKFDU,575
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
3
|
datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -282,13 +282,13 @@ datahub/ingestion/source/delta_lake/delta_lake_utils.py,sha256=VqIDPEXepOnlk4oWM
|
|
|
282
282
|
datahub/ingestion/source/delta_lake/report.py,sha256=uqWWivPltlZ7dwpOOluTvHOKKsSusqihn67clCAwxoM,467
|
|
283
283
|
datahub/ingestion/source/delta_lake/source.py,sha256=jLCN6SeAv3bCD4w4ZDw15eIbFF3yVWcxVtBklovFEBg,13548
|
|
284
284
|
datahub/ingestion/source/dremio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
285
|
-
datahub/ingestion/source/dremio/dremio_api.py,sha256=
|
|
285
|
+
datahub/ingestion/source/dremio/dremio_api.py,sha256=am8o_mQq7zteI4zasnkRb9B9-_BFrchTIA_oJkqRagA,33470
|
|
286
286
|
datahub/ingestion/source/dremio/dremio_aspects.py,sha256=3VeHzCw9q1ytngmsq_K4Ll9tWD2V8EDFySBImHdhPAw,18287
|
|
287
287
|
datahub/ingestion/source/dremio/dremio_config.py,sha256=5SP66ewGYN0OnyWgpU33EZOmtICsclTtBX5DSYLwl3c,5782
|
|
288
|
-
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256
|
|
288
|
+
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256=-Fefw59tXR6QA8ifOz_mieDccMMG_vyQgp7j-BaXFHQ,3070
|
|
289
289
|
datahub/ingestion/source/dremio/dremio_entities.py,sha256=3H3vIvj5ab4d8gmB9-rbZfwRgW87gT1DdjWiMjNgqJ4,15069
|
|
290
290
|
datahub/ingestion/source/dremio/dremio_profiling.py,sha256=TAcnpo8ZRKhLDHnQSJzJg3YdwTSyEa73LUAzENs7wG4,12287
|
|
291
|
-
datahub/ingestion/source/dremio/dremio_reporting.py,sha256=
|
|
291
|
+
datahub/ingestion/source/dremio/dremio_reporting.py,sha256=pYyEOAxiotyVySumY85Ql8vtGsne7B9sDrdqeVFnWLQ,1742
|
|
292
292
|
datahub/ingestion/source/dremio/dremio_source.py,sha256=NJxDXWd19A3MPplPiLPAjxTmjeJBA04PcPytRSslmYQ,26323
|
|
293
293
|
datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=W0rcXawlwJOHNYr5o73rilMijtFOO3cVkn6pY-JLc6o,8186
|
|
294
294
|
datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -438,7 +438,7 @@ datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-
|
|
|
438
438
|
datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
|
|
439
439
|
datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
|
|
440
440
|
datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
|
|
441
|
-
datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=
|
|
441
|
+
datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
|
|
442
442
|
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=lo_3asTuIZbF-LuEUcYL-9NIZ720n7oB9mYA6WVTWA4,31960
|
|
443
443
|
datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
444
444
|
datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
|
|
@@ -560,7 +560,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
|
|
|
560
560
|
datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
|
|
561
561
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
562
562
|
datahub/metadata/_schema_classes.py,sha256=iPeBXGvbNEm0vw5pYwunnvx7bTtBdmIQVtzMOlS6bSI,955042
|
|
563
|
-
datahub/metadata/schema.avsc,sha256=
|
|
563
|
+
datahub/metadata/schema.avsc,sha256=Xx93OdPzQfBb2CtntIYE-HAeKNg-JZcCtRU95v7ZZCs,677728
|
|
564
564
|
datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
|
|
565
565
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
566
566
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
@@ -757,7 +757,7 @@ datahub/metadata/schemas/ExecutionRequestKey.avsc,sha256=SvjnlTAGYsSnvVE0rZ9-7UP
|
|
|
757
757
|
datahub/metadata/schemas/ExecutionRequestResult.avsc,sha256=kg3xMNr9kYLPnFsV-iqcGm1sh1muQVGJvxUt15L1yKo,2333
|
|
758
758
|
datahub/metadata/schemas/ExecutionRequestSignal.avsc,sha256=dsIUa6tfVSXqYOgh4cW6_Hzi8RjHuJJoO-mBAuZukpA,2515
|
|
759
759
|
datahub/metadata/schemas/Filter.avsc,sha256=PU-aGkc2-sI3ZXY7ci-Y0A7zp1jux3VW_6c8MJRAokg,5933
|
|
760
|
-
datahub/metadata/schemas/FormInfo.avsc,sha256=
|
|
760
|
+
datahub/metadata/schemas/FormInfo.avsc,sha256=FbN34htiCgm3LqKDL3sVsJhMUHIyc5jYpGJtYm7Ysd4,6270
|
|
761
761
|
datahub/metadata/schemas/FormKey.avsc,sha256=1-wE28B8T3WJ3JtexreNtFvP3To3n7U-jvYudCuSM9o,437
|
|
762
762
|
datahub/metadata/schemas/Forms.avsc,sha256=shmkhRoHN2gTaTsqGrGDRoNwe_z-nrFbbLjH9MtVDCs,10955
|
|
763
763
|
datahub/metadata/schemas/GlobalSettingsInfo.avsc,sha256=OVMM6FwhHhufHkezYcVePK0zI2llzFYLVFJhmAiHoiI,10102
|
|
@@ -869,11 +869,11 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
|
|
|
869
869
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
870
870
|
datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
|
|
871
871
|
datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
|
|
872
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
872
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=F-aj7yqOwbo7FpxduFO5a7cLWkojL_Npv3_dlfHPNGY,69877
|
|
873
873
|
datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
|
|
874
874
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
875
875
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=CLDOc0HNqL_539eahOP3QOoldIYC6CF29id4Xe3TlEM,47018
|
|
876
|
-
datahub/sql_parsing/sqlglot_utils.py,sha256=
|
|
876
|
+
datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
|
|
877
877
|
datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
|
|
878
878
|
datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
879
879
|
datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
|
|
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
974
974
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
975
975
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
976
976
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
977
|
-
acryl_datahub-0.15.
|
|
978
|
-
acryl_datahub-0.15.
|
|
979
|
-
acryl_datahub-0.15.
|
|
980
|
-
acryl_datahub-0.15.
|
|
981
|
-
acryl_datahub-0.15.
|
|
977
|
+
acryl_datahub-0.15.0rc20.dist-info/METADATA,sha256=KuTZA5lnEW-UAvSPqqkBsDFKkwlJF8WzYbcphVMW_aE,173559
|
|
978
|
+
acryl_datahub-0.15.0rc20.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
979
|
+
acryl_datahub-0.15.0rc20.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
|
|
980
|
+
acryl_datahub-0.15.0rc20.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
981
|
+
acryl_datahub-0.15.0rc20.dist-info/RECORD,,
|
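datahub/__init__.py
CHANGED
|
(diff hunk for this file is missing from this extract)
|
datahub/ingestion/source/dremio/dremio_api.py
CHANGED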
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
+
import re
|
|
4
5
|
import warnings
|
|
5
6
|
from collections import defaultdict
|
|
6
7
|
from enum import Enum
|
|
@@ -609,32 +610,6 @@ class DremioAPIOperations:
|
|
|
609
610
|
|
|
610
611
|
return self.execute_query(query=jobs_query)
|
|
611
612
|
|
|
612
|
-
def get_source_by_id(self, source_id: str) -> Optional[Dict]:
|
|
613
|
-
"""
|
|
614
|
-
Fetch source details by ID.
|
|
615
|
-
"""
|
|
616
|
-
response = self.get(
|
|
617
|
-
url=f"/source/{source_id}",
|
|
618
|
-
)
|
|
619
|
-
return response if response else None
|
|
620
|
-
|
|
621
|
-
def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
|
|
622
|
-
"""
|
|
623
|
-
Get source information for a dataset given its schema and name.
|
|
624
|
-
"""
|
|
625
|
-
dataset_id = self.get_dataset_id(schema, dataset)
|
|
626
|
-
if not dataset_id:
|
|
627
|
-
return None
|
|
628
|
-
|
|
629
|
-
catalog_entry = self.get(
|
|
630
|
-
url=f"/catalog/{dataset_id}",
|
|
631
|
-
)
|
|
632
|
-
if not catalog_entry or "path" not in catalog_entry:
|
|
633
|
-
return None
|
|
634
|
-
|
|
635
|
-
source_id = catalog_entry["path"][0]
|
|
636
|
-
return self.get_source_by_id(source_id)
|
|
637
|
-
|
|
638
613
|
def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
|
|
639
614
|
"""
|
|
640
615
|
Get Dremio tags for a given resource_id.
|
|
@@ -673,55 +648,119 @@ class DremioAPIOperations:
|
|
|
673
648
|
)
|
|
674
649
|
return None
|
|
675
650
|
|
|
676
|
-
def
|
|
677
|
-
self,
|
|
678
|
-
|
|
679
|
-
|
|
651
|
+
def _check_pattern_match(
|
|
652
|
+
self,
|
|
653
|
+
pattern: str,
|
|
654
|
+
paths: List[str],
|
|
655
|
+
allow_prefix: bool = True,
|
|
656
|
+
) -> bool:
|
|
657
|
+
"""
|
|
658
|
+
Helper method to check if a pattern matches any of the paths.
|
|
659
|
+
Handles hierarchical matching where each level is matched independently.
|
|
660
|
+
Also handles prefix matching for partial paths.
|
|
661
|
+
"""
|
|
662
|
+
if pattern == ".*":
|
|
663
|
+
return True
|
|
680
664
|
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
665
|
+
# Convert the pattern to regex with proper anchoring
|
|
666
|
+
regex_pattern = pattern
|
|
667
|
+
if pattern.startswith("^"):
|
|
668
|
+
# Already has start anchor
|
|
669
|
+
regex_pattern = pattern.replace(".", r"\.") # Escape dots
|
|
670
|
+
regex_pattern = regex_pattern.replace(
|
|
671
|
+
r"\.*", ".*"
|
|
672
|
+
) # Convert .* to wildcard
|
|
673
|
+
else:
|
|
674
|
+
# Add start anchor and handle dots
|
|
675
|
+
regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
|
|
676
|
+
|
|
677
|
+
# Handle end matching
|
|
678
|
+
if not pattern.endswith(".*"):
|
|
679
|
+
if pattern.endswith("$"):
|
|
680
|
+
# Keep explicit end anchor
|
|
681
|
+
pass
|
|
682
|
+
elif not allow_prefix:
|
|
683
|
+
# Add end anchor for exact matching
|
|
684
|
+
regex_pattern = regex_pattern + "$"
|
|
685
|
+
|
|
686
|
+
for path in paths:
|
|
687
|
+
if re.match(regex_pattern, path, re.IGNORECASE):
|
|
688
|
+
return True
|
|
697
689
|
|
|
698
|
-
|
|
699
|
-
if (
|
|
700
|
-
container.get("type")
|
|
701
|
-
== DremioEntityContainerType.CONTAINER.value
|
|
702
|
-
):
|
|
703
|
-
traverse_path(container.get("id"), container.get("path"))
|
|
690
|
+
return False
|
|
704
691
|
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
exc=exc,
|
|
713
|
-
)
|
|
692
|
+
def should_include_container(self, path: List[str], name: str) -> bool:
|
|
693
|
+
"""
|
|
694
|
+
Helper method to check if a container should be included based on schema patterns.
|
|
695
|
+
Used by both get_all_containers and get_containers_for_location.
|
|
696
|
+
"""
|
|
697
|
+
path_components = path + [name] if path else [name]
|
|
698
|
+
full_path = ".".join(path_components)
|
|
714
699
|
|
|
715
|
-
|
|
700
|
+
# Default allow everything case
|
|
701
|
+
if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
|
|
702
|
+
self.report.report_container_scanned(full_path)
|
|
703
|
+
return True
|
|
716
704
|
|
|
717
|
-
|
|
705
|
+
# Check deny patterns first
|
|
706
|
+
if self.deny_schema_pattern:
|
|
707
|
+
for pattern in self.deny_schema_pattern:
|
|
708
|
+
if self._check_pattern_match(
|
|
709
|
+
pattern=pattern,
|
|
710
|
+
paths=[full_path],
|
|
711
|
+
allow_prefix=False,
|
|
712
|
+
):
|
|
713
|
+
self.report.report_container_filtered(full_path)
|
|
714
|
+
return False
|
|
715
|
+
|
|
716
|
+
# Check allow patterns
|
|
717
|
+
for pattern in self.allow_schema_pattern:
|
|
718
|
+
# For patterns with wildcards, check if this path is a parent of the pattern
|
|
719
|
+
if "*" in pattern:
|
|
720
|
+
pattern_parts = pattern.split(".")
|
|
721
|
+
path_parts = path_components
|
|
722
|
+
|
|
723
|
+
# If pattern has exact same number of parts, check each component
|
|
724
|
+
if len(pattern_parts) == len(path_parts):
|
|
725
|
+
matches = True
|
|
726
|
+
for p_part, c_part in zip(pattern_parts, path_parts):
|
|
727
|
+
if p_part != "*" and p_part.lower() != c_part.lower():
|
|
728
|
+
matches = False
|
|
729
|
+
break
|
|
730
|
+
if matches:
|
|
731
|
+
self.report.report_container_scanned(full_path)
|
|
732
|
+
return True
|
|
733
|
+
# Otherwise check if current path is prefix match
|
|
734
|
+
else:
|
|
735
|
+
# Remove the trailing wildcard if present
|
|
736
|
+
if pattern_parts[-1] == "*":
|
|
737
|
+
pattern_parts = pattern_parts[:-1]
|
|
738
|
+
|
|
739
|
+
for i in range(len(path_parts)):
|
|
740
|
+
current_path = ".".join(path_parts[: i + 1])
|
|
741
|
+
pattern_prefix = ".".join(pattern_parts[: i + 1])
|
|
742
|
+
|
|
743
|
+
if pattern_prefix.startswith(current_path):
|
|
744
|
+
self.report.report_container_scanned(full_path)
|
|
745
|
+
return True
|
|
746
|
+
|
|
747
|
+
# Direct pattern matching
|
|
748
|
+
if self._check_pattern_match(
|
|
749
|
+
pattern=pattern,
|
|
750
|
+
paths=[full_path],
|
|
751
|
+
allow_prefix=True,
|
|
752
|
+
):
|
|
753
|
+
self.report.report_container_scanned(full_path)
|
|
754
|
+
return True
|
|
755
|
+
|
|
756
|
+
self.report.report_container_filtered(full_path)
|
|
757
|
+
return False
|
|
718
758
|
|
|
719
759
|
def get_all_containers(self):
|
|
720
760
|
"""
|
|
721
|
-
Query the Dremio sources API and return source information.
|
|
761
|
+
Query the Dremio sources API and return filtered source information.
|
|
722
762
|
"""
|
|
723
763
|
containers = []
|
|
724
|
-
|
|
725
764
|
response = self.get(url="/catalog")
|
|
726
765
|
|
|
727
766
|
def process_source(source):
|
|
@@ -731,34 +770,41 @@ class DremioAPIOperations:
|
|
|
731
770
|
)
|
|
732
771
|
|
|
733
772
|
source_config = source_resp.get("config", {})
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
773
|
+
db = source_config.get(
|
|
774
|
+
"database", source_config.get("databaseName", "")
|
|
775
|
+
)
|
|
776
|
+
|
|
777
|
+
if self.should_include_container([], source.get("path")[0]):
|
|
778
|
+
return {
|
|
779
|
+
"id": source.get("id"),
|
|
780
|
+
"name": source.get("path")[0],
|
|
781
|
+
"path": [],
|
|
782
|
+
"container_type": DremioEntityContainerType.SOURCE,
|
|
783
|
+
"source_type": source_resp.get("type"),
|
|
784
|
+
"root_path": source_config.get("rootPath"),
|
|
785
|
+
"database_name": db,
|
|
786
|
+
}
|
|
748
787
|
else:
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
788
|
+
if self.should_include_container([], source.get("path")[0]):
|
|
789
|
+
return {
|
|
790
|
+
"id": source.get("id"),
|
|
791
|
+
"name": source.get("path")[0],
|
|
792
|
+
"path": [],
|
|
793
|
+
"container_type": DremioEntityContainerType.SPACE,
|
|
794
|
+
}
|
|
795
|
+
return None
|
|
755
796
|
|
|
756
797
|
def process_source_and_containers(source):
|
|
757
798
|
container = process_source(source)
|
|
799
|
+
if not container:
|
|
800
|
+
return []
|
|
801
|
+
|
|
802
|
+
# Get sub-containers
|
|
758
803
|
sub_containers = self.get_containers_for_location(
|
|
759
804
|
resource_id=container.get("id"),
|
|
760
805
|
path=[container.get("name")],
|
|
761
806
|
)
|
|
807
|
+
|
|
762
808
|
return [container] + sub_containers
|
|
763
809
|
|
|
764
810
|
# Use ThreadPoolExecutor to parallelize the processing of sources
|
|
@@ -771,7 +817,16 @@ class DremioAPIOperations:
|
|
|
771
817
|
}
|
|
772
818
|
|
|
773
819
|
for future in concurrent.futures.as_completed(future_to_source):
|
|
774
|
-
|
|
820
|
+
source = future_to_source[future]
|
|
821
|
+
try:
|
|
822
|
+
containers.extend(future.result())
|
|
823
|
+
except Exception as exc:
|
|
824
|
+
logger.error(f"Error processing source: {exc}")
|
|
825
|
+
self.report.warning(
|
|
826
|
+
message="Failed to process source",
|
|
827
|
+
context=f"{source}",
|
|
828
|
+
exc=exc,
|
|
829
|
+
)
|
|
775
830
|
|
|
776
831
|
return containers
|
|
777
832
|
|
|
@@ -785,3 +840,55 @@ class DremioAPIOperations:
|
|
|
785
840
|
)
|
|
786
841
|
else:
|
|
787
842
|
return ""
|
|
843
|
+
|
|
844
|
+
def get_containers_for_location(
|
|
845
|
+
self, resource_id: str, path: List[str]
|
|
846
|
+
) -> List[Dict[str, str]]:
|
|
847
|
+
containers = []
|
|
848
|
+
|
|
849
|
+
def traverse_path(location_id: str, entity_path: List[str]) -> List:
|
|
850
|
+
nonlocal containers
|
|
851
|
+
try:
|
|
852
|
+
response = self.get(url=f"/catalog/{location_id}")
|
|
853
|
+
|
|
854
|
+
# Check if current folder should be included
|
|
855
|
+
if (
|
|
856
|
+
response.get("entityType")
|
|
857
|
+
== DremioEntityContainerType.FOLDER.value.lower()
|
|
858
|
+
):
|
|
859
|
+
folder_name = entity_path[-1]
|
|
860
|
+
folder_path = entity_path[:-1]
|
|
861
|
+
|
|
862
|
+
if self.should_include_container(folder_path, folder_name):
|
|
863
|
+
containers.append(
|
|
864
|
+
{
|
|
865
|
+
"id": location_id,
|
|
866
|
+
"name": folder_name,
|
|
867
|
+
"path": folder_path,
|
|
868
|
+
"container_type": DremioEntityContainerType.FOLDER,
|
|
869
|
+
}
|
|
870
|
+
)
|
|
871
|
+
|
|
872
|
+
# Recursively process child containers
|
|
873
|
+
for container in response.get("children", []):
|
|
874
|
+
if (
|
|
875
|
+
container.get("type")
|
|
876
|
+
== DremioEntityContainerType.CONTAINER.value
|
|
877
|
+
):
|
|
878
|
+
traverse_path(container.get("id"), container.get("path"))
|
|
879
|
+
|
|
880
|
+
except Exception as exc:
|
|
881
|
+
logging.info(
|
|
882
|
+
"Location {} contains no tables or views. Skipping...".format(
|
|
883
|
+
location_id
|
|
884
|
+
)
|
|
885
|
+
)
|
|
886
|
+
self.report.warning(
|
|
887
|
+
message="Failed to get tables or views",
|
|
888
|
+
context=f"{location_id}",
|
|
889
|
+
exc=exc,
|
|
890
|
+
)
|
|
891
|
+
|
|
892
|
+
return containers
|
|
893
|
+
|
|
894
|
+
return traverse_path(location_id=resource_id, entity_path=path)
|
|
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py
CHANGED
|
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
|
|
|
31
31
|
"SNOWFLAKE": "snowflake",
|
|
32
32
|
"SYNAPSE": "mssql",
|
|
33
33
|
"TERADATA": "teradata",
|
|
34
|
+
"VERTICA": "vertica",
|
|
34
35
|
}
|
|
35
36
|
|
|
36
37
|
DATABASE_SOURCE_TYPES = {
|
|
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
|
|
|
52
53
|
"SNOWFLAKE",
|
|
53
54
|
"SYNAPSE",
|
|
54
55
|
"TERADATA",
|
|
56
|
+
"VERTICA",
|
|
55
57
|
}
|
|
56
58
|
|
|
57
59
|
FILE_OBJECT_STORAGE_TYPES = {
|
|
datahub/ingestion/source/dremio/dremio_reporting.py
CHANGED
|
@@ -14,12 +14,27 @@ class DremioSourceReport(
|
|
|
14
14
|
):
|
|
15
15
|
num_containers_failed: int = 0
|
|
16
16
|
num_datasets_failed: int = 0
|
|
17
|
+
containers_scanned: int = 0
|
|
18
|
+
containers_filtered: int = 0
|
|
17
19
|
|
|
18
20
|
def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
|
|
19
21
|
# recording total combined latency is not very useful, keeping this method as a placeholder
|
|
20
22
|
# for future implementation of min / max / percentiles etc.
|
|
21
23
|
pass
|
|
22
24
|
|
|
25
|
+
def report_container_scanned(self, name: str) -> None:
|
|
26
|
+
"""
|
|
27
|
+
Record that a container was successfully scanned
|
|
28
|
+
"""
|
|
29
|
+
self.containers_scanned += 1
|
|
30
|
+
|
|
31
|
+
def report_container_filtered(self, container_name: str) -> None:
|
|
32
|
+
"""
|
|
33
|
+
Record that a container was filtered out
|
|
34
|
+
"""
|
|
35
|
+
self.containers_filtered += 1
|
|
36
|
+
self.report_dropped(container_name)
|
|
37
|
+
|
|
23
38
|
def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
|
|
24
39
|
"""
|
|
25
40
|
Entity could be a view or a table
|
|
datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED
|
@@ -119,7 +119,6 @@ class SnowflakeFilter:
|
|
|
119
119
|
) -> bool:
|
|
120
120
|
if not dataset_type or not dataset_name:
|
|
121
121
|
return True
|
|
122
|
-
dataset_params = dataset_name.split(".")
|
|
123
122
|
if dataset_type.lower() not in (
|
|
124
123
|
SnowflakeObjectDomain.TABLE,
|
|
125
124
|
SnowflakeObjectDomain.EXTERNAL_TABLE,
|
|
@@ -131,6 +130,7 @@ class SnowflakeFilter:
|
|
|
131
130
|
if _is_sys_table(dataset_name):
|
|
132
131
|
return False
|
|
133
132
|
|
|
133
|
+
dataset_params = _split_qualified_name(dataset_name)
|
|
134
134
|
if len(dataset_params) != 3:
|
|
135
135
|
self.structured_reporter.info(
|
|
136
136
|
title="Unexpected dataset pattern",
|
datahub/metadata/schema.avsc
CHANGED
|
@@ -18518,6 +18518,10 @@
|
|
|
18518
18518
|
"namespace": "com.linkedin.pegasus2avro.form",
|
|
18519
18519
|
"fields": [
|
|
18520
18520
|
{
|
|
18521
|
+
"Searchable": {
|
|
18522
|
+
"fieldName": "structuredPropertyPromptUrns",
|
|
18523
|
+
"fieldType": "URN"
|
|
18524
|
+
},
|
|
18521
18525
|
"java": {
|
|
18522
18526
|
"class": "com.linkedin.pegasus2avro.common.urn.Urn"
|
|
18523
18527
|
},
|
|
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED
|
@@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1383
1383
|
return QueryUrn(query_id).urn()
|
|
1384
1384
|
|
|
1385
1385
|
@classmethod
|
|
1386
|
-
def _composite_query_id(cls, composed_of_queries:
|
|
1387
|
-
composed_of_queries = list(composed_of_queries)
|
|
1386
|
+
def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
|
|
1388
1387
|
combined = json.dumps(composed_of_queries)
|
|
1389
1388
|
return f"composite_{generate_hash(combined)}"
|
|
1390
1389
|
|
|
datahub/sql_parsing/sqlglot_utils.py
CHANGED
|
@@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
|
|
|
121
121
|
# Remove /* */ comments.
|
|
122
122
|
re.compile(r"/\*.*?\*/", re.DOTALL): "",
|
|
123
123
|
# Remove -- comments.
|
|
124
|
-
re.compile(r"--.*$"): "",
|
|
124
|
+
re.compile(r"--.*$", re.MULTILINE): "",
|
|
125
125
|
# Replace all runs of whitespace with a single space.
|
|
126
126
|
re.compile(r"\s+"): " ",
|
|
127
127
|
# Remove leading and trailing whitespace and trailing semicolons.
|
|
@@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
|
|
|
131
131
|
# Replace anything that looks like a string with a placeholder.
|
|
132
132
|
re.compile(r"'[^']*'"): "?",
|
|
133
133
|
# Replace sequences of IN/VALUES with a single placeholder.
|
|
134
|
-
|
|
134
|
+
# The r" ?" makes it more robust to uneven spacing.
|
|
135
|
+
re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
|
|
135
136
|
# Normalize parenthesis spacing.
|
|
136
137
|
re.compile(r"\( "): "(",
|
|
137
138
|
re.compile(r" \)"): ")",
|
|
139
|
+
# Fix up spaces before commas in column lists.
|
|
140
|
+
# e.g. "col1 , col2" -> "col1, col2"
|
|
141
|
+
# e.g. "col1,col2" -> "col1, col2"
|
|
142
|
+
re.compile(r"\b ,"): ",",
|
|
143
|
+
re.compile(r"\b,\b"): ", ",
|
|
138
144
|
}
|
|
139
145
|
_TABLE_NAME_NORMALIZATION_RULES = {
|
|
140
146
|
# Replace UUID-like strings with a placeholder (both - and _ variants).
|
|
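{acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/WHEEL
File without changes
|
|
{acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/entry_points.txt
File without changes
|
|
{acryl_datahub-0.15.0rc18.dist-info → acryl_datahub-0.15.0rc20.dist-info}/top_level.txt
File without changes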
|