acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
This release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:
 
         return self.execute_query(query=jobs_query)
 
-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None
 
-    def
-        self,
-
-
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True
 
-
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
+        return False
 
-
-
-
-
-
-
-
-                    exc=exc,
-                )
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)
 
-
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True
 
-
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False
 
     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")
 
         def process_source(source):
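The new `_check_pattern_match` helper above is what drives schema allow/deny filtering. As a minimal standalone sketch (not the packaged code, just the same conversion rules applied outside the class), the pattern-to-regex translation behaves like this:

```python
import re


def to_regex(pattern: str, allow_prefix: bool = True) -> str:
    # Mirror _check_pattern_match: anchor at the start, escape literal dots,
    # restore ".*" wildcards, and anchor the end only for exact matches.
    regex = pattern if pattern.startswith("^") else "^" + pattern
    regex = regex.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return regex


assert re.match(to_regex("analytics.*"), "analytics.sales", re.IGNORECASE)
assert re.match(to_regex("analytics"), "analytics.sales", re.IGNORECASE)  # prefix match allowed
assert not re.match(
    to_regex("analytics.sales", allow_prefix=False), "analytics.sales_v2", re.IGNORECASE
)
```

Deny patterns are checked with `allow_prefix=False` (exact match), while allow patterns tolerate prefixes so that parent containers of an allowed schema are still traversed.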
@@ -731,34 +770,41 @@ class DremioAPIOperations:
             )
 
             source_config = source_resp.get("config", {})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            db = source_config.get(
+                "database", source_config.get("databaseName", "")
+            )
+
+            if self.should_include_container([], source.get("path")[0]):
+                return {
+                    "id": source.get("id"),
+                    "name": source.get("path")[0],
+                    "path": [],
+                    "container_type": DremioEntityContainerType.SOURCE,
+                    "source_type": source_resp.get("type"),
+                    "root_path": source_config.get("rootPath"),
+                    "database_name": db,
+                }
             else:
-
-
-
-
-
-
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SPACE,
+                    }
+            return None
 
         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )
+
             return [container] + sub_containers
 
         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,6 +817,78 @@ class DremioAPIOperations:
         }
 
         for future in concurrent.futures.as_completed(future_to_source):
-
+            source = future_to_source[future]
+            try:
+                containers.extend(future.result())
+            except Exception as exc:
+                logger.error(f"Error processing source: {exc}")
+                self.report.warning(
+                    message="Failed to process source",
+                    context=f"{source}",
+                    exc=exc,
+                )
 
         return containers
+
+    def get_context_for_vds(self, resource_id: str) -> str:
+        context_array = self.get(
+            url=f"/catalog/{resource_id}",
+        ).get("sqlContext")
+        if context_array:
+            return ".".join(
+                f'"{part}"' if "." in part else f"{part}" for part in context_array
+            )
+        else:
+            return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
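The new `get_context_for_vds` feeds a view's default SQL context into lineage parsing; its one subtlety is the quoting rule, which wraps a path component in double quotes only when the component itself contains a dot. A quick standalone illustration of that join:

```python
# Parts containing a dot are quoted so the joined context stays a valid dotted path.
context_array = ["my_space", "my.folder"]
context = ".".join(f'"{part}"' if "." in part else f"{part}" for part in context_array)
print(context)  # my_space."my.folder"
```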
@@ -142,6 +142,7 @@ class DremioAspects:
         platform: str,
         ui_url: str,
         env: str,
+        ingest_owner: bool,
         domain: Optional[str] = None,
         platform_instance: Optional[str] = None,
     ):
@@ -150,6 +151,7 @@ class DremioAspects:
         self.env = env
         self.domain = domain
         self.ui_url = ui_url
+        self.ingest_owner = ingest_owner
 
     def get_container_key(
         self, name: Optional[str], path: Optional[List[str]]
@@ -426,21 +428,23 @@ class DremioAspects:
         return f'{self.ui_url}/{container_type}/{dataset_url_path}"{dataset.resource_name}"'
 
     def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]:
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.ingest_owner and dataset.owner:
+            owner_urn = (
+                make_user_urn(dataset.owner)
+                if dataset.owner_type == "USER"
+                else make_group_urn(dataset.owner)
+            )
+            ownership: OwnershipClass = OwnershipClass(
+                owners=[
+                    OwnerClass(
+                        owner=owner_urn,
+                        type=OwnershipTypeClass.TECHNICAL_OWNER,
+                    )
+                ]
+            )
+            return ownership
+
+        return None
 
     def _create_glossary_terms(self, entity: DremioDataset) -> GlossaryTermsClass:
         return GlossaryTermsClass(
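`make_user_urn` and `make_group_urn` are the standard builders from `datahub.emitter.mce_builder`; the new `_create_ownership` picks between them based on Dremio's `owner_type`. A small sketch of the URNs they produce:

```python
from datahub.emitter.mce_builder import make_group_urn, make_user_urn

# owner_type == "USER" yields a corpuser URN; any other owner_type yields a group URN.
print(make_user_urn("jdoe"))       # urn:li:corpuser:jdoe
print(make_group_urn("analysts"))  # urn:li:corpGroup:analysts
```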
@@ -174,3 +174,8 @@ class DremioSourceConfig(
         default=False,
         description="Whether to include query-based lineage information.",
     )
+
+    ingest_owner: bool = Field(
+        default=True,
+        description="Ingest Owner from source. This will override Owner info entered from UI",
+    )
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }
 
     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }
 
     FILE_OBJECT_STORAGE_TYPES = {
@@ -200,6 +200,7 @@ class DremioDataset:
     columns: List[DremioDatasetColumn]
     sql_definition: Optional[str]
     dataset_type: DremioDatasetType
+    default_schema: Optional[str]
     owner: Optional[str]
     owner_type: Optional[str]
     created: str
@@ -235,6 +236,9 @@ class DremioDataset:
 
         if self.sql_definition:
             self.dataset_type = DremioDatasetType.VIEW
+            self.default_schema = api_operations.get_context_for_vds(
+                resource_id=self.resource_id
+            )
         else:
             self.dataset_type = DremioDatasetType.TABLE
 
@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0
 
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass
 
+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
@@ -97,6 +97,7 @@ class DremioSource(StatefulIngestionSourceBase):
     - Ownership and Glossary Terms:
        - Metadata related to ownership of datasets, extracted from Dremio’s ownership model.
        - Glossary terms and business metadata associated with datasets, providing additional context to the data.
+       - Note: Ownership information will only be available for the Cloud and Enterprise editions, it will not be available for the Community edition.
 
     - Optional SQL Profiling (if enabled):
        - Table, row, and column statistics can be profiled and ingested via optional SQL queries.
@@ -123,6 +124,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.dremio_aspects = DremioAspects(
             platform=self.get_platform(),
             domain=self.config.domain,
+            ingest_owner=self.config.ingest_owner,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
             ui_url=dremio_api.ui_url,
@@ -394,10 +396,12 @@ class DremioSource(StatefulIngestionSourceBase):
         ):
             yield dremio_mcp
             # Check if the emitted aspect is SchemaMetadataClass
-            if isinstance(
+            if isinstance(
+                dremio_mcp.metadata, MetadataChangeProposalWrapper
+            ) and isinstance(dremio_mcp.metadata.aspect, SchemaMetadataClass):
                 self.sql_parsing_aggregator.register_schema(
                     urn=dataset_urn,
-                    schema=dremio_mcp.metadata,
+                    schema=dremio_mcp.metadata.aspect,
                 )
 
         if dataset_info.dataset_type == DremioDatasetType.VIEW:
@@ -415,6 +419,7 @@ class DremioSource(StatefulIngestionSourceBase):
                 view_urn=dataset_urn,
                 view_definition=dataset_info.sql_definition,
                 default_db=self.default_db,
+                default_schema=dataset_info.default_schema,
             )
 
         elif dataset_info.dataset_type == DremioDatasetType.TABLE:
@@ -227,7 +227,7 @@ def collapse_name(name: str, collapse_urns: CollapseUrns) -> str:
 def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     if len(collapse_urns.urns_suffix_regex) == 0:
         return urn
-    urn_obj = DatasetUrn.
+    urn_obj = DatasetUrn.from_string(urn)
     name = collapse_name(name=urn_obj.get_dataset_name(), collapse_urns=collapse_urns)
     data_platform_urn = urn_obj.get_data_platform_urn()
     return str(
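`DatasetUrn.from_string` replaces an older constructor call here (the removed line is truncated in this rendering, so the exact old call is not shown). A usage sketch, assuming the long-standing import path:

```python
from datahub.utilities.urns.dataset_urn import DatasetUrn

urn_obj = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:elasticsearch,logs-000001,PROD)"
)
print(urn_obj.get_dataset_name())            # logs-000001
print(str(urn_obj.get_data_platform_urn()))  # urn:li:dataPlatform:elasticsearch
```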
@@ -42,10 +42,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
+    GlobalTagsClass,
     MLFeaturePropertiesClass,
     MLFeatureTablePropertiesClass,
     MLPrimaryKeyPropertiesClass,
+    OwnerClass,
+    OwnershipClass,
     StatusClass,
+    TagAssociationClass,
 )
 
 # FIXME: ValueType module cannot be used as a type
@@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel):
     environment: str = Field(
         default=DEFAULT_ENV, description="Environment to use when constructing URNs"
     )
+    # owner_mappings example:
+    # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted
+    # owner_mappings:
+    #   - feast_owner_name: "<owner>"
+    #     datahub_owner_urn: "urn:li:corpGroup:<owner>"
+    #     datahub_ownership_type: "BUSINESS_OWNER"
+    owner_mappings: Optional[List[Dict[str, str]]] = Field(
+        default=None, description="Mapping of owner names to owner types"
+    )
+    enable_owner_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to map owners. "
+        "If this is enabled, then owner_mappings is REQUIRED to extract ownership.",
+    )
+    enable_tag_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to extract tags.",
+    )
 
 
 @platform_name("Feast")
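Both new toggles default to `False`, so tag and owner extraction are opt-in, and owners are only resolved through `owner_mappings`. A hedged sketch of how a recipe-style config dict might exercise them (the `path` value is purely illustrative, and any other required connection fields would still need to be supplied):

```python
from datahub.ingestion.source.feast import FeastRepositorySourceConfig

# Illustrative values only; adjust to your feature repository.
config = FeastRepositorySourceConfig.parse_obj(
    {
        "path": "./feature_repo",
        "enable_tag_extraction": True,
        "enable_owner_extraction": True,
        "owner_mappings": [
            {
                "feast_owner_name": "data-team",
                "datahub_owner_urn": "urn:li:corpGroup:data-team",
                "datahub_ownership_type": "BUSINESS_OWNER",
            }
        ],
    }
)
```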
@@ -215,10 +237,15 @@ class FeastRepositorySource(Source):
         """
 
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [StatusClass(removed=False)]
+            + self._get_tags(entity)
+            + self._get_owners(entity)
+        )
 
         entity_snapshot = MLPrimaryKeySnapshot(
             urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
-            aspects=
+            aspects=aspects,
         )
 
         entity_snapshot.aspects.append(
@@ -243,10 +270,11 @@ class FeastRepositorySource(Source):
         Generate an MLFeature work unit for a Feast feature.
         """
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = [StatusClass(removed=False)] + self._get_tags(field)
 
         feature_snapshot = MLFeatureSnapshot(
             urn=builder.make_ml_feature_urn(feature_view_name, field.name),
-            aspects=
+            aspects=aspects,
         )
 
         feature_sources = []
@@ -295,13 +323,18 @@ class FeastRepositorySource(Source):
         """
 
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [
+                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
+                StatusClass(removed=False),
+            ]
+            + self._get_tags(feature_view)
+            + self._get_owners(feature_view)
+        )
 
         feature_view_snapshot = MLFeatureTableSnapshot(
             urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
-            aspects=
-                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
-                StatusClass(removed=False),
-            ],
+            aspects=aspects,
         )
 
         feature_view_snapshot.aspects.append(
@@ -360,6 +393,64 @@ class FeastRepositorySource(Source):
 
         return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)
 
+    # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is
+    # True, otherwise NO tags will be ingested
+    def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts tags from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[GlobalTagsClass]] = []
+
+        # Extract tags
+        if self.source_config.enable_tag_extraction:
+            if obj.tags.get("name"):
+                tag_name: str = obj.tags["name"]
+                tag_association = TagAssociationClass(
+                    tag=builder.make_tag_urn(tag_name)
+                )
+                global_tags_aspect = GlobalTagsClass(tags=[tag_association])
+                aspects.append(global_tags_aspect)
+
+        return aspects
+
+    # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified
+    # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested
+    def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts owners from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[OwnershipClass]] = []
+
+        # Extract owner
+        if self.source_config.enable_owner_extraction:
+            owner = getattr(obj, "owner", None)
+            if owner:
+                # Create owner association, skipping if None
+                owner_association = self._create_owner_association(owner)
+                if owner_association:  # Only add valid owner associations
+                    owners_aspect = OwnershipClass(owners=[owner_association])
+                    aspects.append(owners_aspect)
+
+        return aspects
+
+    def _create_owner_association(self, owner: str) -> Optional[OwnerClass]:
+        """
+        Create an OwnerClass instance for the given owner using the owner mappings.
+        """
+        if self.source_config.owner_mappings is not None:
+            for mapping in self.source_config.owner_mappings:
+                if mapping["feast_owner_name"] == owner:
+                    ownership_type_class: str = mapping.get(
+                        "datahub_ownership_type", "TECHNICAL_OWNER"
+                    )
+                    datahub_owner_urn = mapping.get("datahub_owner_urn")
+                    if datahub_owner_urn:
+                        return OwnerClass(
+                            owner=datahub_owner_urn,
+                            type=ownership_type_class,
+                        )
+        return None
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
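Note that `_get_tags` only promotes a Feast tag literally keyed `name` into a DataHub tag, via the standard `make_tag_urn` builder. A quick sketch of the resulting URN:

```python
import datahub.emitter.mce_builder as builder

feast_tags = {"name": "pii"}  # tags dict as Feast exposes it on an Entity/FeatureView/Field
if feast_tags.get("name"):
    print(builder.make_tag_urn(feast_tags["name"]))  # urn:li:tag:pii
```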